Skip to content

Commit

Permalink
fix pg_upgrade script, add dns failover mechanism and set it as default
Browse files Browse the repository at this point in the history
  • Loading branch information
var77 committed Nov 8, 2024
1 parent b578e79 commit 4fc60f9
Show file tree
Hide file tree
Showing 13 changed files with 342 additions and 52 deletions.
1 change: 1 addition & 0 deletions config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def self.e2e_test?
override :lantern_backup_bucket, "walg-dev-backups"
override :e2e_test, "0"
override :backup_retention_days, 7, int
override :backup_retention_days_after_deletion, 0, int
override :lantern_log_dataset, "lantern_logs", string
override :compose_file, "/var/lib/lantern/docker-compose.yaml", string

Expand Down
18 changes: 15 additions & 3 deletions model/lantern/lantern_resource.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class LanternResource < Sequel::Model
include Authorization::HyperTagMethods
include Authorization::TaggableMethods

semaphore :destroy, :swap_leaders_with_parent
semaphore :destroy, :swap_leaders_with_parent, :switchover_with_parent

plugin :column_encryption do |enc|
enc.column :superuser_password
Expand Down Expand Up @@ -74,8 +74,13 @@ def dissociate_forks
def setup_service_account
api = Hosting::GcpApis.new
service_account = api.create_service_account("lt-#{ubid}", "Service Account for Lantern #{name}")
key = api.export_service_account_key(service_account["email"])
update(gcp_creds_b64: key, service_account_name: service_account["email"])
update(service_account_name: service_account["email"])
end

def export_service_account_key
api = Hosting::GcpApis.new
key = api.export_service_account_key(service_account_name)
update(gcp_creds_b64: key)
end

def allow_timeline_access_to_bucket
Expand Down Expand Up @@ -130,6 +135,13 @@ def create_ddl_log
representative_server.run_query_all(commands)
end

def drop_ddl_log_trigger
commands = <<SQL
DROP EVENT TRIGGER IF EXISTS log_ddl_trigger;
SQL
representative_server.run_query_all(commands)
end

def listen_ddl_log
commands = <<SQL
DROP EVENT TRIGGER IF EXISTS log_ddl_trigger;
Expand Down
26 changes: 23 additions & 3 deletions model/lantern/lantern_server.rb
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def connection_string(port: 6432)
end

def run_query(query, db: "postgres", user: "postgres")
vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} exec -T postgresql psql -q -U #{user} -t --csv #{db}", stdin: query).chomp
vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} exec -T postgresql psql -q -U #{user} -t --csv -v ON_ERROR_STOP=1 #{db}", stdin: query).chomp
end

def run_query_all(query)
Expand Down Expand Up @@ -99,8 +99,8 @@ def instance_type
standby? ? "reader" : "writer"
end

def container_image
"#{Config.gcr_image}:lantern-#{lantern_version}-extras-#{extras_version}-minor-#{minor_version}"
def container_image(p_lantern_version = lantern_version, p_extras_version = extras_version, p_minor_version = minor_version)
"#{Config.gcr_image}:lantern-#{p_lantern_version}-extras-#{p_extras_version}-minor-#{p_minor_version}"
end

def configure_hash
Expand Down Expand Up @@ -257,6 +257,26 @@ def autoresize_disk
incr_update_storage_size
end

def swap_dns(other_server)
strand.stack.first["domain"] = other_server.domain
strand.modified!(:stack)
strand.save_changes
other_server.update(domain: nil)
incr_add_domain
end

def is_dns_correct?
Resolv.getaddress(domain) == vm.sshable.host
end

def stop_container
vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} down -t 60 || true")
end

def start_container
vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} up -d")
end

# def failover_target
# nil
# end
Expand Down
59 changes: 57 additions & 2 deletions prog/lantern/lantern_resource_nexus.rb
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
# frozen_string_literal: true

require "forwardable"
require "resolv"

class Prog::Lantern::LanternResourceNexus < Prog::Base
subject_is :lantern_resource

extend Forwardable
def_delegators :lantern_resource, :servers, :representative_server

semaphore :destroy, :swap_leaders_with_parent
semaphore :destroy, :swap_leaders_with_parent, :switchover_with_parent

def self.assemble(project_id:, location:, name:, target_vm_size:, target_storage_size_gib:, ubid: LanternResource.generate_ubid, ha_type: LanternResource::HaType::NONE, parent_id: nil, restore_target: nil, recovery_target_lsn: nil,
org_id: nil, db_name: "postgres", db_user: "postgres", db_user_password: nil, superuser_password: nil, repl_password: nil, app_env: Config.rack_env,
Expand Down Expand Up @@ -126,10 +127,22 @@ def before_run
end
end

label def start
label def setup_service_account
lantern_resource.setup_service_account
hop_export_service_account_key
end

label def export_service_account_key
lantern_resource.export_service_account_key
hop_create_logging_table
end

label def create_logging_table
lantern_resource.create_logging_table
hop_setup_timeline_access
end

label def setup_timeline_access
if lantern_resource.parent_id.nil?
lantern_resource.allow_timeline_access_to_bucket
register_deadline(:wait, 10 * 60)
Expand All @@ -140,6 +153,10 @@ def before_run
hop_wait_servers
end

label def start
hop_setup_service_account
end

# TODO:: check why is this needed
# label def trigger_pg_current_xact_id_on_parent
# lantern_resource.parent.representative_server.run_query("SELECT pg_current_xact_id()")
Expand Down Expand Up @@ -199,6 +216,17 @@ def before_run
end
end

when_switchover_with_parent_set? do
if lantern_resource.parent.nil?
decr_switchover_with_parent
else
lantern_resource.update(display_state: "failover")
lantern_resource.parent.update(display_state: "failover")
register_deadline(:wait, 10 * 60)
hop_switchover_with_parent
end
end

nap 30
end

Expand Down Expand Up @@ -244,6 +272,33 @@ def before_run
hop_wait_swap_ip
end

label def switchover_with_parent
decr_switchover_with_parent
lantern_resource.parent.set_to_readonly
hop_disable_logical_subscription
end

label def disable_logical_subscription
lantern_resource.disable_logical_subscription
hop_sync_sequences_with_parent
end

label def sync_sequences_with_parent
lantern_resource.sync_sequences_with_parent
hop_switch_dns_with_parent
end

label def switch_dns_with_parent
lantern_resource.parent.representative_server.stop_container

if lantern_resource.parent.representative_server.domain.nil?
hop_wait_servers
end

lantern_resource.representative_server.swap_dns(lantern_resource.parent.representative_server)
hop_wait_servers
end

label def destroy
register_deadline(nil, 5 * 60)

Expand Down
50 changes: 34 additions & 16 deletions prog/lantern/lantern_server_nexus.rb
Original file line number Diff line number Diff line change
Expand Up @@ -236,42 +236,43 @@ def before_run

label def run_pg_upgrade
decr_run_pg_upgrade
current_frame = strand.stack.first
resource = lantern_server.resource
pg_upgrade_info = current_frame["pg_upgrade"]
# prepare files
lantern_server.update(
lantern_version: pg_upgrade_info["lantern_version"],
extras_version: pg_upgrade_info["extras_version"],
minor_version: pg_upgrade_info["minor_version"]
)
lantern_server.resource.drop_ddl_log_trigger
pg_upgrade_info = strand.stack.first["pg_upgrade"]
vm.sshable.cmd(
"common/bin/daemonizer 'sudo lantern/bin/run_pg_upgrade' pg_upgrade",
stdin: JSON.generate({
container_image: lantern_server.container_image,
old_pg_version: resource.pg_version
container_image: lantern_server.container_image(
pg_upgrade_info["lantern_version"],
pg_upgrade_info["extras_version"],
pg_upgrade_info["minor_version"]
),
pg_version: pg_upgrade_info["pg_version"],
old_pg_version: lantern_server.resource.pg_version
})
)
resource.update(pg_version: pg_upgrade_info["pg_version"])
# run scripts
hop_wait_pg_upgrade
end

label def wait_pg_upgrade
current_frame = strand.stack.first
case vm.sshable.cmd("common/bin/daemonizer --check pg_upgrade")
when "Succeeded"
pg_upgrade_info = current_frame["pg_upgrade"]
lantern_server.resource.update(pg_version: pg_upgrade_info["pg_version"])
lantern_server.update(
lantern_version: pg_upgrade_info["lantern_version"],
extras_version: pg_upgrade_info["extras_version"],
minor_version: pg_upgrade_info["minor_version"]
)
current_frame.delete("pg_upgrade")
strand.modified!(:stack)
strand.save_changes
vm.sshable.cmd("common/bin/daemonizer --clean pg_upgrade")
register_deadline(:wait, 40 * 60)
hop_init_sql
when "Failed"
logs = JSON.parse(vm.sshable.cmd("common/bin/daemonizer --logs pg_upgrade"))
Clog.emit("Postgres upgrade failed") { {logs: logs, name: lantern_server.resource.name, lantern_server: lantern_server.id} }
Prog::PageNexus.assemble_with_logs("Postgres update failed on #{lantern_server.resource.name} (#{lantern_server.resource.label})", [lantern_server.resource.ubid, lantern_server.ubid], logs, "critical", "LanternPGUpgradeFailed", lantern_server.ubid)
vm.sshable.cmd("common/bin/daemonizer --clean pg_upgrade")
hop_wait
end
nap 10
Expand Down Expand Up @@ -579,6 +580,18 @@ def remove_domain_from_stack
hop_promote_server
end

label def wait_swap_dns
# wait until ip change will propogate
begin
nap 5 if !lantern_server.is_dns_correct?
lantern_server.run_query("SELECT 1")
rescue
nap 5
end

hop_promote_server
end

label def take_over
decr_take_over
if !lantern_server.standby?
Expand All @@ -589,7 +602,12 @@ def remove_domain_from_stack
# put the old server in container_stopped mode, so no healthcheck will be done
lantern_server.resource.representative_server.incr_container_stopped

hop_swap_ip
hop_swap_dns
end

label def swap_dns
lantern_server.swap_dns(lantern_server.resource.representative_server)
hop_wait
end

label def swap_ip
Expand Down
2 changes: 1 addition & 1 deletion prog/lantern/lantern_timeline_nexus.rb
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def before_run
label def destroy
when_destroy_set? do
decr_destroy
nap 60 * 60 * 24 * 30 # 30 days
nap Config.backup_retention_days_after_deletion * 60 * 60 * 24
end

destroy_blob_storage
Expand Down
5 changes: 4 additions & 1 deletion rhizome/lantern/bin/run_pg_upgrade
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ data = YAML.load_file $compose_file

container_image = $configure_hash["container_image"]
old_pg_version = $configure_hash["old_pg_version"]
pg_version = $configure_hash["pg_version"]
current_container_image = data["services"]["postgresql"]["image"]

r "sudo docker compose -f #{$compose_file} down -t 10"
Expand All @@ -30,12 +31,14 @@ append_env([

data["services"]["postgresql"]["image"] = container_image
data["services"]["postgresql"]["user"] = "root"
data["services"]["postgresql"]["deploy"].delete("restart_policy")
File.open($compose_file, "w") { |f| YAML.dump(data, f) }

r "sudo docker compose -f #{$compose_file} up"

data = YAML.load_file $compose_file
data["services"]["postgresql"].delete("user")
data["services"]["postgresql"]["deploy"]["restart_policy"] = {"condition" => "on-failure"}
r "sudo rm -rf #{$datadir}/old-lib-#{old_pg_version} #{$datadir}/old-bin-#{old_pg_version} #{$datadir}/old-share-#{old_pg_version}"
r "sudo chown -R 1001:1001 #{$datadir}"
File.open($compose_file, "w") { |f| YAML.dump(data, f) }
Expand All @@ -44,4 +47,4 @@ append_env([
["POSTGRESQL_RUN_PGUPGRADE", "no"]
])

r "sudo docker compose -f #{$compose_file} up -d"
run_database(container_image, pg_version)
2 changes: 1 addition & 1 deletion rhizome/lantern/lib/common.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def run_database(container_image, pg_version)
File.open($compose_file, "w") { |f| YAML.dump(data, f) }
r "sudo docker rm -f tc 2>/dev/null || true"
r "sudo docker create --name tc #{container_image}"
r "sudo docker cp tc:/opt/bitnami/postgresql #{$pg_mount_path}"
r "sudo docker cp tc:/usr/lib/postgresql/#{pg_version} #{$pg_mount_path}"
r "sudo docker rm tc"
r "sudo chown -R 1001:1001 #{$pg_mount_path}"
# Mount extension dir, so we can make automatic updates from host
Expand Down
23 changes: 21 additions & 2 deletions spec/model/lantern/lantern_resource_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,22 @@
it "sets up service account and updates resource" do
api = instance_double(Hosting::GcpApis)
allow(Hosting::GcpApis).to receive(:new).and_return(api)
allow(api).to receive_messages(create_service_account: {"email" => "test-sa"}, export_service_account_key: "test-key")
expect(lantern_resource).to receive(:update).with(gcp_creds_b64: "test-key", service_account_name: "test-sa")
allow(api).to receive_messages(create_service_account: {"email" => "test-sa"})
expect(lantern_resource).to receive(:update).with(service_account_name: "test-sa")
expect { lantern_resource.setup_service_account }.not_to raise_error
end
end

describe "#export_service_account_key" do
it "exports service account key and updates resource" do
api = instance_double(Hosting::GcpApis)
allow(Hosting::GcpApis).to receive(:new).and_return(api)
allow(api).to receive_messages(export_service_account_key: "test-key")
expect(lantern_resource).to receive(:update).with(gcp_creds_b64: "test-key")
expect { lantern_resource.export_service_account_key }.not_to raise_error
end
end

describe "#create_logging_table" do
it "create bigquery table and gives access" do
instance_double(LanternTimeline, ubid: "test")
Expand Down Expand Up @@ -144,6 +154,15 @@
end
end

describe "#drop_ddl_log_trigger" do
it "drops ddl log trigger" do
representative_server = instance_double(LanternServer)
expect(lantern_resource).to receive(:representative_server).and_return(representative_server).at_least(:once)
expect(lantern_resource.representative_server).to receive(:run_query_all).with(a_string_matching(/DROP .* log_ddl_trigger/))
expect { lantern_resource.drop_ddl_log_trigger }.not_to raise_error
end
end

describe "#listen_ddl_log" do
it "listends ddl log table" do
representative_server = instance_double(LanternServer)
Expand Down
Loading

0 comments on commit 4fc60f9

Please sign in to comment.