Skip to content

Commit

Permalink
pg17 upgrade (#87)
Browse files Browse the repository at this point in the history
* remove gcr creds and use public image

* fix lantern extras installation instructions

* keep pg_version information in lantern_resource

* add pg_upgrade script

* fix pg_upgrade script, add dns failover mechanism and set it as default

* improve dns switchover

* reduce wait time when stopping container on switchover

* correctly drop replication slot and publications after replica destroyed

* wait for logical replication lag to be 0 before doing switchover

* setup ssl on fork to not waste time on switchover

* fix ssl cert check issue

* fixes and improvements on adding domain and ssl setup

* add rollback switchover functionality in misc operations
  • Loading branch information
var77 authored Nov 13, 2024
1 parent bde6f3b commit 5971696
Show file tree
Hide file tree
Showing 21 changed files with 947 additions and 237 deletions.
4 changes: 2 additions & 2 deletions config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -136,13 +136,12 @@ def self.e2e_test?
# GCP
override :gcp_project_id, "lantern-development", string
override :gcp_compute_service_account, "339254316100-compute@developer.gserviceaccount.com", string
optional :gcp_creds_gcr_b64, string
optional :gcp_creds_logging_b64, string
optional :gcp_creds_coredumps_b64, string
optional :gcp_creds_walg_b64, string
optional :prom_password, string
override :gcp_default_image, "projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20240319", string
override :gcr_image, "gcr.io/ringed-griffin-394922/lantern-bitnami"
override :gcr_image, "lanterndata/lantern-self-hosted"

# Lantern
override :lantern_top_domain, "db.lantern.dev", string
Expand All @@ -153,6 +152,7 @@ def self.e2e_test?
override :lantern_backup_bucket, "walg-dev-backups"
override :e2e_test, "0"
override :backup_retention_days, 7, int
override :backup_retention_days_after_deletion, 0, int
override :lantern_log_dataset, "lantern_logs", string
override :compose_file, "/var/lib/lantern/docker-compose.yaml", string

Expand Down
10 changes: 10 additions & 0 deletions migrate/20241107_lantern_resource_pg_version.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# frozen_string_literal: true

# Adds the pg_version column to lantern_resource.
#
# Written with explicit up/down blocks instead of `change`: the migration
# issues a raw SQL statement via `run`, which Sequel cannot reverse
# automatically, so a `change`-based migration could not be rolled back.
Sequel.migration do
  up do
    alter_table(:lantern_resource) do
      # New rows default to 17.
      add_column :pg_version, Integer, default: 17
    end
    # Rows that already exist when this migration runs are marked as 15.
    run "UPDATE lantern_resource SET pg_version=15"
  end

  down do
    alter_table(:lantern_resource) do
      drop_column :pg_version
    end
  end
end
26 changes: 25 additions & 1 deletion misc/misc_operations.rb
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,6 @@ def self.create_image(lantern_version: "0.2.7", extras_version: "0.1.5", minor_v
rm -rf /tmp/get-docker.sh
sudo sed -i 's/ulimit -Hn/ulimit -n/' /etc/init.d/docker
sudo service docker restart
echo #{Config.gcp_creds_gcr_b64} | base64 -d | sudo docker login -u _json_key --password-stdin https://gcr.io
sudo docker pull #{container_image}
sudo docker logout
history -cw
Expand All @@ -241,4 +240,29 @@ def self.create_image(lantern_version: "0.2.7", extras_version: "0.1.5", minor_v
puts "Image created"
vm.incr_destroy
end

# Rolls a switchover back: stops the container of +current_resource+, starts
# the container of +old_resource+, points the current domain's DNS record at
# the old server, moves the domain from the current server to the old one and
# finally turns read-only mode off on the old resource.
#
# @param current_resource resource currently holding the domain
# @param old_resource resource that should take the domain back
# @return [nil]
def self.rollback_switchover(current_resource, old_resource)
  # Stopping the current container is best effort: any error is ignored so
  # the rollback can continue.
  begin
    current_resource.representative_server.stop_container(1)
  rescue
    # intentionally ignored
  end

  restored_server = old_resource.representative_server
  restored_server.start_container

  # Point DNS at the old server and hand the domain over to it.
  demoted_server = current_resource.representative_server
  switched_domain = demoted_server.domain
  Dns::Cloudflare.new.upsert_dns_record(switched_domain, restored_server.vm.sshable.host)
  restored_server.update(domain: switched_domain)
  demoted_server.update(domain: nil)

  # Keep polling every 10 seconds until a query succeeds, then disable
  # read-only mode and leave the loop.
  loop do
    begin
      restored_server.run_query("SELECT 1")
      old_resource.set_to_readonly(status: "off")
      break
    rescue
      sleep 10
    end
  end
end
end
45 changes: 36 additions & 9 deletions model/lantern/lantern_resource.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true

require "uri"
require_relative "../../model"

class LanternResource < Sequel::Model
Expand All @@ -21,7 +22,7 @@ class LanternResource < Sequel::Model
include Authorization::HyperTagMethods
include Authorization::TaggableMethods

semaphore :destroy, :swap_leaders_with_parent
semaphore :destroy, :swap_leaders_with_parent, :switchover_with_parent

plugin :column_encryption do |enc|
enc.column :superuser_password
Expand Down Expand Up @@ -74,8 +75,13 @@ def dissociate_forks
# Creates a GCP service account named "lt-<ubid>" for this resource and stores
# the account's email in service_account_name.
# NOTE(review): two `update` calls appear back to back below, the first of which
# also exports a key and stores it in gcp_creds_b64. This looks like the removed
# and added lines of a diff rendered together; confirm which variant is intended.
def setup_service_account
api = Hosting::GcpApis.new
service_account = api.create_service_account("lt-#{ubid}", "Service Account for Lantern #{name}")
key = api.export_service_account_key(service_account["email"])
update(gcp_creds_b64: key, service_account_name: service_account["email"])
update(service_account_name: service_account["email"])
end

# Exports a key for the account stored in service_account_name through
# Hosting::GcpApis and saves it in gcp_creds_b64.
#
# @return whatever `update` returns
def export_service_account_key
  exported_key = Hosting::GcpApis.new.export_service_account_key(service_account_name)
  update(gcp_creds_b64: exported_key)
end

def allow_timeline_access_to_bucket
Expand Down Expand Up @@ -104,6 +110,10 @@ def delete_replication_slot(name)
representative_server.run_query("SELECT pg_drop_replication_slot(slot_name) FROM pg_replication_slots WHERE slot_name='#{name}';")
end

# Queries the representative server for the difference between the current
# WAL LSN and the confirmed flush LSN of the given replication slot.
#
# @param slot_name [String] slot name, interpolated into the SQL text as is
# @return [Integer] the query output converted with String#to_i
def get_logical_replication_lag(slot_name)
  lag_query = "SELECT (pg_current_wal_lsn() - confirmed_flush_lsn) FROM pg_catalog.pg_replication_slots WHERE slot_name = '#{slot_name}'"
  raw_lag = representative_server.run_query(lag_query)
  # to_i turns empty output (e.g. no row for this slot) into 0.
  raw_lag.chomp.to_i
end

def create_ddl_log
commands = <<SQL
BEGIN;
Expand All @@ -130,6 +140,13 @@ def create_ddl_log
representative_server.run_query_all(commands)
end

# Drops the log_ddl_trigger event trigger (if present) by passing the
# statement to run_query_all on the representative server.
def drop_ddl_log_trigger
  drop_statement = "DROP EVENT TRIGGER IF EXISTS log_ddl_trigger;\n"
  representative_server.run_query_all(drop_statement)
end

def listen_ddl_log
commands = <<SQL
DROP EVENT TRIGGER IF EXISTS log_ddl_trigger;
Expand All @@ -143,6 +160,7 @@ def listen_ddl_log
END;
$$ LANGUAGE plpgsql;
DROP TRIGGER IF EXISTS execute_ddl_after_insert ON ddl_log;
CREATE TRIGGER execute_ddl_after_insert
AFTER INSERT ON ddl_log
FOR EACH ROW
Expand All @@ -156,6 +174,10 @@ def create_publication(name)
representative_server.run_query_all("CREATE PUBLICATION #{name} FOR ALL TABLES")
end

# Drops the named publication, if it exists, via run_query_all on the
# representative server.
#
# @param name [String] publication name, interpolated into the SQL text as is
def delete_publication(name)
  drop_statement = "DROP PUBLICATION IF EXISTS #{name}"
  representative_server.run_query_all(drop_statement)
end

def sync_sequences_with_parent
representative_server.list_all_databases.each do |db|
res = parent.representative_server.run_query("
Expand All @@ -171,15 +193,18 @@ def sync_sequences_with_parent
"SELECT setval('#{values[0]}.#{values[1]}', #{values[2]});"
end

representative_server.run_query(statements, db: db)
representative_server.run_query(statements.join("\n"), db: db)
end
end

def create_and_enable_subscription
representative_server.list_all_databases.each do |db|
uri = URI.parse(parent.connection_string(port: 5432))
new_query_ar = URI.decode_www_form(String(uri.query)) << ["dbname", db]
uri.query = URI.encode_www_form(new_query_ar)
commands = <<SQL
CREATE SUBSCRIPTION sub_#{ubid}
CONNECTION '#{parent.connection_string(port: 5432)}/#{db}'
CONNECTION '#{uri}'
PUBLICATION pub_#{ubid}
WITH (
copy_data = false,
Expand All @@ -195,11 +220,11 @@ def create_and_enable_subscription
end
end

def disable_logical_subscription
representative_server.run_query_all("ALTER SUBSCRIPTION sub_#{ubid} DISABLE")
# Drops the named subscription, if it exists, via run_query_all on the
# representative server.
#
# @param name [String] subscription name, interpolated into the SQL text as is
def delete_logical_subscription(name)
  drop_statement = "DROP SUBSCRIPTION IF EXISTS #{name}"
  representative_server.run_query_all(drop_statement)
end

def create_logical_replica(lantern_version: nil, extras_version: nil, minor_version: nil)
def create_logical_replica(lantern_version: nil, extras_version: nil, minor_version: nil, pg_upgrade: nil)
# TODO::
# 1. If new database will be created during logical replication it won't be added automatically
# 2. New timeline will be generated for lantern resource
Expand All @@ -224,7 +249,9 @@ def create_logical_replica(lantern_version: nil, extras_version: nil, minor_vers
logical_replication: true,
lantern_version: lantern_version || representative_server.lantern_version,
extras_version: extras_version || representative_server.extras_version,
minor_version: minor_version || representative_server.minor_version
minor_version: minor_version || representative_server.minor_version,
pg_version: pg_version,
pg_upgrade: pg_upgrade
)
end

Expand Down
61 changes: 53 additions & 8 deletions model/lantern/lantern_server.rb
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def connection_string(port: 6432)
end

# Sends +query+ on stdin to psql running in the "postgresql" compose service
# (over the VM's sshable) and returns the chomped command output.
#
# @param query [String] SQL text passed on stdin
# @param db [String] database name appended to the psql command line
# @param user [String] value for psql's -U option
# NOTE(review): two cmd invocations appear below, differing only in
# `-v ON_ERROR_STOP=1`. This looks like the removed and added lines of a diff
# rendered together; confirm that only one of them is meant to remain.
def run_query(query, db: "postgres", user: "postgres")
vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} exec -T postgresql psql -q -U #{user} -t --csv #{db}", stdin: query).chomp
vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} exec -T postgresql psql -q -U #{user} -t --csv -v ON_ERROR_STOP=1 #{db}", stdin: query).chomp
end

def run_query_all(query)
Expand Down Expand Up @@ -99,6 +99,10 @@ def instance_type
standby? ? "reader" : "writer"
end

# Builds the container image reference
# "<Config.gcr_image>:lantern-<lantern>-extras-<extras>-minor-<minor>".
# Each argument defaults to the corresponding attribute of this server.
#
# @param p_lantern_version lantern version part of the tag
# @param p_extras_version extras version part of the tag
# @param p_minor_version minor version part of the tag
# @return [String]
def container_image(p_lantern_version = lantern_version, p_extras_version = extras_version, p_minor_version = minor_version)
  format(
    "%s:lantern-%s-extras-%s-minor-%s",
    Config.gcr_image,
    p_lantern_version,
    p_extras_version,
    p_minor_version
  )
end

def configure_hash
walg_config = timeline.generate_walg_config
backup_label = ""
Expand Down Expand Up @@ -140,17 +144,17 @@ def configure_hash
master_host: resource.representative_server.hostname,
master_port: 5432,
prom_password: Config.prom_password,
gcp_creds_gcr_b64: Config.gcp_creds_gcr_b64,
gcp_creds_coredumps_b64: Config.gcp_creds_coredumps_b64,
gcp_creds_logging_b64: Config.gcp_creds_logging_b64,
container_image: "#{Config.gcr_image}:lantern-#{lantern_version}-extras-#{extras_version}-minor-#{minor_version}",
container_image: container_image,
postgresql_recover_from_backup: backup_label,
postgresql_recovery_target_time: postgresql_recovery_target_time,
postgresql_recovery_target_lsn: postgresql_recovery_target_lsn,
gcp_creds_walg_b64: walg_config[:gcp_creds_b64],
walg_gs_prefix: walg_config[:walg_gs_prefix],
gcp_creds_big_query_b64: resource.gcp_creds_b64,
big_query_dataset: Config.lantern_log_dataset
big_query_dataset: Config.lantern_log_dataset,
pg_version: resource.pg_version
})
end

Expand All @@ -174,10 +178,6 @@ def update_walg_creds
]))
end

# Builds the container image reference from Config.gcr_image and this server's
# lantern_version, extras_version and minor_version.
# NOTE(review): a method with the same name that accepts optional version
# arguments appears earlier in this view; this no-argument definition looks like
# the removed side of a diff. Confirm it is not meant to coexist with the other.
def container_image
"#{Config.gcr_image}:lantern-#{lantern_version}-extras-#{extras_version}-minor-#{minor_version}"
end

def init_health_monitor_session
if strand.label != "wait"
fail "server is not ready to initialize session"
Expand Down Expand Up @@ -257,6 +257,51 @@ def autoresize_disk
incr_update_storage_size
end

# Deletes the DNS record for this server's domain through Dns::Cloudflare.
def destroy_domain
  Dns::Cloudflare.new.delete_dns_record(domain)
end

# Stores +domain+ under the "domain" key of the first stack frame of the
# strand, marks the stack column as modified and saves the strand.
#
# @param domain value to store
# @param p_strand strand to modify; defaults to this server's strand
def add_domain_to_stack(domain, p_strand = strand)
  p_strand.stack.first["domain"] = domain
  # The frame is mutated in place, so the column is flagged explicitly
  # before saving.
  p_strand.modified!(:stack)
  p_strand.save_changes
end

# Removes the "domain" key from the first stack frame of the strand, marks the
# stack column as modified and saves the strand.
#
# @param p_strand strand to modify; defaults to this server's strand
def remove_domain_from_stack(p_strand = strand)
  p_strand.stack.first.delete("domain")
  # The frame is mutated in place, so the column is flagged explicitly
  # before saving.
  p_strand.modified!(:stack)
  p_strand.save_changes
end

# Takes over the domain of +other_server+: records that domain in the first
# frame of this server's strand stack, clears it on the other server, removes
# this server's own domain (DNS record and column) when it has one, and then
# calls incr_add_domain.
#
# @param other_server server whose domain is taken over
# @return whatever incr_add_domain returns
def swap_dns(other_server)
  own_strand = strand
  own_strand.stack.first["domain"] = other_server.domain
  own_strand.modified!(:stack)
  own_strand.save_changes
  other_server.update(domain: nil)

  # Nothing of our own to clean up when no domain is set.
  return incr_add_domain unless domain

  destroy_domain
  update(domain: nil)
  incr_add_domain
end

# Tells whether this server's domain resolves to the host of its VM.
#
# @return the falsy domain value itself when no domain is set, otherwise
#   true/false from comparing Resolv.getaddress(domain) with vm.sshable.host
def is_dns_correct?
  configured_domain = domain
  return configured_domain unless configured_domain

  resolved_address = Resolv.getaddress(configured_domain)
  resolved_address == vm.sshable.host
end

# Runs `docker compose down` for the configured compose file on the VM.
# The shell command ends with `|| true`, so a failing `down` does not make
# the command itself fail.
#
# @param timeout value passed to the `-t` option (default 60)
# @return whatever vm.sshable.cmd returns
def stop_container(timeout = 60)
  down_command = "sudo docker compose -f #{Config.compose_file} down -t #{timeout} || true"
  vm.sshable.cmd(down_command)
end

# Runs `docker compose up -d` for the configured compose file on the VM.
#
# @return whatever vm.sshable.cmd returns
def start_container
  up_command = "sudo docker compose -f #{Config.compose_file} up -d"
  vm.sshable.cmd(up_command)
end

# def failover_target
# nil
# end
Expand Down
Loading

0 comments on commit 5971696

Please sign in to comment.