From 3f334eb315ed4d82a2a331578fb20961674b599e Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Tue, 5 Nov 2024 16:31:22 +0400 Subject: [PATCH 01/13] remove gcr creds and use public image --- config.rb | 3 +-- misc/misc_operations.rb | 1 - model/lantern/lantern_server.rb | 1 - prog/lantern/lantern_server_nexus.rb | 5 ----- rhizome/lantern/bin/configure | 2 +- rhizome/lantern/bin/update_docker_image | 3 +-- rhizome/lantern/lib/common.rb | 3 +-- spec/model/lantern/lantern_server_spec.rb | 8 -------- spec/prog/lantern/lantern_server_nexus_spec.rb | 13 ------------- 9 files changed, 4 insertions(+), 35 deletions(-) diff --git a/config.rb b/config.rb index 099a23d5c..1ce786887 100644 --- a/config.rb +++ b/config.rb @@ -136,13 +136,12 @@ def self.e2e_test? # GCP override :gcp_project_id, "lantern-development", string override :gcp_compute_service_account, "339254316100-compute@developer.gserviceaccount.com", string - optional :gcp_creds_gcr_b64, string optional :gcp_creds_logging_b64, string optional :gcp_creds_coredumps_b64, string optional :gcp_creds_walg_b64, string optional :prom_password, string override :gcp_default_image, "projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20240319", string - override :gcr_image, "gcr.io/ringed-griffin-394922/lantern-bitnami" + override :gcr_image, "lanterndata/lantern-self-hosted" # Lantern override :lantern_top_domain, "db.lantern.dev", string diff --git a/misc/misc_operations.rb b/misc/misc_operations.rb index 0891ef886..5b17ef44c 100644 --- a/misc/misc_operations.rb +++ b/misc/misc_operations.rb @@ -222,7 +222,6 @@ def self.create_image(lantern_version: "0.2.7", extras_version: "0.1.5", minor_v rm -rf /tmp/get-docker.sh sudo sed -i 's/ulimit -Hn/ulimit -n/' /etc/init.d/docker sudo service docker restart -echo #{Config.gcp_creds_gcr_b64} | base64 -d | sudo docker login -u _json_key --password-stdin https://gcr.io sudo docker pull #{container_image} sudo docker logout history -cw diff --git a/model/lantern/lantern_server.rb b/model/lantern/lantern_server.rb index 2280ca240..0ca7cd434 100644 --- a/model/lantern/lantern_server.rb +++ b/model/lantern/lantern_server.rb @@ -140,7 +140,6 @@ def configure_hash master_host: resource.representative_server.hostname, master_port: 5432, prom_password: Config.prom_password, - gcp_creds_gcr_b64: Config.gcp_creds_gcr_b64, gcp_creds_coredumps_b64: Config.gcp_creds_coredumps_b64, gcp_creds_logging_b64: Config.gcp_creds_logging_b64, container_image: "#{Config.gcr_image}:lantern-#{lantern_version}-extras-#{extras_version}-minor-#{minor_version}", diff --git a/prog/lantern/lantern_server_nexus.rb b/prog/lantern/lantern_server_nexus.rb index ca1a7b573..7a38920de 100644 --- a/prog/lantern/lantern_server_nexus.rb +++ b/prog/lantern/lantern_server_nexus.rb @@ -119,10 +119,6 @@ def before_run end label def setup_docker_stack - if !Config.gcp_creds_gcr_b64 - raise "GCP_CREDS_GCR_B64 is required to setup docker stack for Lantern" - end - # wait for service account to be created nap 10 if lantern_server.timeline.strand.label == "start" @@ -307,7 +303,6 @@ def before_run hop_update_lantern_extension when "NotStarted" vm.sshable.cmd("common/bin/daemonizer 'sudo lantern/bin/update_docker_image' update_docker_image", stdin: JSON.generate({ - gcp_creds_gcr_b64: Config.gcp_creds_gcr_b64, container_image: lantern_server.container_image })) when "Failed" diff --git a/rhizome/lantern/bin/configure b/rhizome/lantern/bin/configure index 7952a3d07..3825b47ae 100755 --- a/rhizome/lantern/bin/configure +++ b/rhizome/lantern/bin/configure @@ -148,7 +148,7 @@ end if $configure_hash[:skip_deps].nil? install_dependencies puts "dependencies installed" - configure_gcr($configure_hash["gcp_creds_gcr_b64"], $configure_hash["container_image"]) + configure_gcr($configure_hash["container_image"]) puts "GCR repo ready" end diff --git a/rhizome/lantern/bin/update_docker_image b/rhizome/lantern/bin/update_docker_image index 4e3d77293..40a97b4e0 100755 --- a/rhizome/lantern/bin/update_docker_image +++ b/rhizome/lantern/bin/update_docker_image @@ -9,9 +9,8 @@ require_relative "../lib/common" $configure_hash = JSON.parse($stdin.read) container_image = $configure_hash["container_image"] -gcp_creds_gcr_b64 = $configure_hash["gcp_creds_gcr_b64"] -configure_gcr(gcp_creds_gcr_b64, container_image) +configure_gcr(container_image) map = YAML.load_file $compose_file map["services"]["postgresql"]["image"] = container_image diff --git a/rhizome/lantern/lib/common.rb b/rhizome/lantern/lib/common.rb index e1b2dfea5..35e436251 100755 --- a/rhizome/lantern/lib/common.rb +++ b/rhizome/lantern/lib/common.rb @@ -11,8 +11,7 @@ $pg_mount_path = "#{$workdir}/pg" $container_name = "lantern-postgresql-1" -def configure_gcr(gcp_creds_gcr_b64, container_image) - r "echo #{gcp_creds_gcr_b64} | base64 -d | sudo docker login -u _json_key --password-stdin https://gcr.io" +def configure_gcr(container_image) r "sudo docker pull #{container_image}" end diff --git a/spec/model/lantern/lantern_server_spec.rb b/spec/model/lantern/lantern_server_spec.rb index cadebde21..981287a05 100644 --- a/spec/model/lantern/lantern_server_spec.rb +++ b/spec/model/lantern/lantern_server_spec.rb @@ -275,7 +275,6 @@ representative_server: lantern_server, restore_target: nil) expect(Config).to receive(:prom_password).and_return("pwd123").at_least(:once) - expect(Config).to receive(:gcp_creds_gcr_b64).and_return("test-creds").at_least(:once) expect(Config).to receive(:gcp_creds_logging_b64).and_return("test-creds").at_least(:once) expect(timeline).to receive(:generate_walg_config).and_return({gcp_creds_b64: "test-creds-push", walg_gs_prefix: "test-bucket-push"}).at_least(:once) expect(lantern_server).to receive(:resource).and_return(resource).at_least(:once) @@ -306,7 +305,6 @@ master_host: resource.representative_server.hostname, master_port: 5432, prom_password: Config.prom_password, - gcp_creds_gcr_b64: Config.gcp_creds_gcr_b64, gcp_creds_coredumps_b64: Config.gcp_creds_coredumps_b64, gcp_creds_logging_b64: Config.gcp_creds_logging_b64, @@ -343,7 +341,6 @@ representative_server: lantern_server, restore_target: Time.now) expect(Config).to receive(:prom_password).and_return("pwd123").at_least(:once) - expect(Config).to receive(:gcp_creds_gcr_b64).and_return("test-creds").at_least(:once) expect(Config).to receive(:gcp_creds_logging_b64).and_return("test-creds").at_least(:once) expect(timeline).to receive(:latest_backup_label_before_target).and_return("test-label").at_least(:once) expect(timeline).to receive(:generate_walg_config).and_return({gcp_creds_b64: "test-creds-push", walg_gs_prefix: "test-bucket-push"}).at_least(:once) @@ -375,7 +372,6 @@ master_host: resource.representative_server.hostname, master_port: 5432, prom_password: Config.prom_password, - gcp_creds_gcr_b64: Config.gcp_creds_gcr_b64, gcp_creds_coredumps_b64: Config.gcp_creds_coredumps_b64, gcp_creds_logging_b64: Config.gcp_creds_logging_b64, container_image: "#{Config.gcr_image}:lantern-#{lantern_server.lantern_version}-extras-#{lantern_server.extras_version}-minor-#{lantern_server.minor_version}", @@ -411,7 +407,6 @@ representative_server: lantern_server, restore_target: nil) expect(Config).to receive(:prom_password).and_return("pwd123").at_least(:once) - expect(Config).to receive(:gcp_creds_gcr_b64).and_return("test-creds").at_least(:once) expect(Config).to receive(:gcp_creds_logging_b64).and_return("test-creds").at_least(:once) expect(timeline).to receive(:generate_walg_config).and_return({gcp_creds_b64: "test-creds-push", walg_gs_prefix: "test-bucket-push"}).at_least(:once) expect(lantern_server).to receive(:resource).and_return(resource).at_least(:once) @@ -442,7 +437,6 @@ master_host: resource.representative_server.hostname, master_port: 5432, prom_password: Config.prom_password, - gcp_creds_gcr_b64: Config.gcp_creds_gcr_b64, gcp_creds_coredumps_b64: Config.gcp_creds_coredumps_b64, gcp_creds_logging_b64: Config.gcp_creds_logging_b64, container_image: "#{Config.gcr_image}:lantern-#{lantern_server.lantern_version}-extras-#{lantern_server.extras_version}-minor-#{lantern_server.minor_version}", @@ -478,7 +472,6 @@ representative_server: lantern_server, restore_target: Time.now) expect(Config).to receive(:prom_password).and_return("pwd123").at_least(:once) - expect(Config).to receive(:gcp_creds_gcr_b64).and_return("test-creds").at_least(:once) expect(Config).to receive(:gcp_creds_logging_b64).and_return("test-creds").at_least(:once) expect(timeline).to receive(:generate_walg_config).and_return({gcp_creds_b64: "test-creds-push", walg_gs_prefix: "test-bucket-push"}).at_least(:once) @@ -510,7 +503,6 @@ master_host: resource.representative_server.hostname, master_port: 5432, prom_password: Config.prom_password, - gcp_creds_gcr_b64: Config.gcp_creds_gcr_b64, gcp_creds_coredumps_b64: Config.gcp_creds_coredumps_b64, gcp_creds_logging_b64: Config.gcp_creds_logging_b64, diff --git a/spec/prog/lantern/lantern_server_nexus_spec.rb b/spec/prog/lantern/lantern_server_nexus_spec.rb index eb4a66b6c..74fc1f3e3 100644 --- a/spec/prog/lantern/lantern_server_nexus_spec.rb +++ b/spec/prog/lantern/lantern_server_nexus_spec.rb @@ -206,17 +206,10 @@ it "naps if timeline is not ready" do expect(lantern_server.timeline).to receive(:strand).and_return(instance_double(Strand, label: "start")) - expect(Config).to receive(:gcp_creds_gcr_b64).and_return("test-creds") expect { nx.setup_docker_stack }.to nap(10) end - it "raises if gcr credentials are not provided" do - expect(Config).to receive(:gcp_creds_gcr_b64).and_return(nil) - expect { nx.setup_docker_stack }.to raise_error "GCP_CREDS_GCR_B64 is required to setup docker stack for Lantern" - end - it "calls setup if not started" do - expect(Config).to receive(:gcp_creds_gcr_b64).and_return("test-creds") expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --check configure_lantern").and_return("NotStarted") expect(lantern_server).to receive(:configure_hash).and_return("test") expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer 'sudo lantern/bin/configure' configure_lantern", stdin: "test") @@ -224,7 +217,6 @@ end it "calls setup if failed" do - expect(Config).to receive(:gcp_creds_gcr_b64).and_return("test-creds") expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --check configure_lantern").and_return("Failed") expect(lantern_server).to receive(:configure_hash).and_return("test") expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer 'sudo lantern/bin/configure' configure_lantern", stdin: "test") @@ -232,7 +224,6 @@ end it "calls add domain after succeeded" do - expect(Config).to receive(:gcp_creds_gcr_b64).and_return("test-creds") expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --check configure_lantern").and_return("Succeeded") expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --clean configure_lantern") expect(nx).to receive(:frame).and_return({"domain" => "db.lantern.dev"}) @@ -243,7 +234,6 @@ end it "hop to wait_db_available" do - expect(Config).to receive(:gcp_creds_gcr_b64).and_return("test-creds") expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --check configure_lantern").and_return("Succeeded") expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --clean configure_lantern") expect(nx).to receive(:frame).and_return({}) @@ -253,7 +243,6 @@ end it "naps if in progress" do - expect(Config).to receive(:gcp_creds_gcr_b64).and_return("test-creds") expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --check configure_lantern").and_return("InProgress") expect { nx.setup_docker_stack }.to nap(5) end @@ -523,11 +512,9 @@ describe "#update_image" do it "updates image and naps" do - expect(Config).to receive(:gcp_creds_gcr_b64).and_return("test-creds").at_least(:once) expect(lantern_server).to receive(:container_image).and_return("test-image").at_least(:once) expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --check update_docker_image").and_return("NotStarted") expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer 'sudo lantern/bin/update_docker_image' update_docker_image", stdin: JSON.generate({ - gcp_creds_gcr_b64: Config.gcp_creds_gcr_b64, container_image: lantern_server.container_image })) expect { nx.update_image }.to nap(10) From e07baa880ab50cc36db3f661a7ccace7c091809a Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Tue, 5 Nov 2024 17:23:20 +0400 Subject: [PATCH 02/13] fix lantern extras installation instructions --- rhizome/lantern/bin/update_extras | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rhizome/lantern/bin/update_extras b/rhizome/lantern/bin/update_extras index 5b35894e0..c05a32e60 100755 --- a/rhizome/lantern/bin/update_extras +++ b/rhizome/lantern/bin/update_extras @@ -15,8 +15,8 @@ end version = $configure_hash["version"] -r "rm -rf /tmp/lantern-extras* || true" -r "wget https://github.com/lanterndata/lantern_extras/releases/download/#{version}/lantern-extras-#{version}.tar -O /tmp/lantern-extras.tar" -r "cd /tmp && tar xf lantern-extras.tar" -r "cd /tmp/lantern-extras-#{version} && PG_CONFIG=#{$pg_mount_path}/bin/pg_config make install" -r "rm -rf /tmp/lantern-extras*" +r "rm -rf /tmp/lantern-* || true" +r "wget https://github.com/lanterndata/lantern/releases/download/v#{version}/lantern-#{version}.tar -O /tmp/lantern.tar" +r "cd /tmp && tar xf lantern.tar" +r "cd /tmp/lantern-#{version} && PG_CONFIG=#{$pg_mount_path}/bin/pg_config make -C lantern-extras-#{version} install" +r "rm -rf /tmp/lantern*" From f615e29d3fa06c8ebe337b3cdec041a06d9d58ff Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Thu, 7 Nov 2024 13:10:59 +0400 Subject: [PATCH 03/13] keep pg_version information in lantern_resource --- migrate/20241107_lantern_resource_pg_version.rb | 10 ++++++++++ model/lantern/lantern_server.rb | 3 ++- prog/lantern/lantern_resource_nexus.rb | 4 ++-- rhizome/lantern/bin/configure | 4 ++-- rhizome/lantern/bin/update_docker_image | 2 +- rhizome/lantern/lib/common.rb | 4 ++-- spec/model/lantern/lantern_server_spec.rb | 16 ++++++++++++---- 7 files changed, 31 insertions(+), 12 deletions(-) create mode 100644 migrate/20241107_lantern_resource_pg_version.rb diff --git a/migrate/20241107_lantern_resource_pg_version.rb b/migrate/20241107_lantern_resource_pg_version.rb new file mode 100644 index 000000000..ec000df4b --- /dev/null +++ b/migrate/20241107_lantern_resource_pg_version.rb @@ -0,0 +1,10 @@ +# frozen_string_literal: true + +Sequel.migration do + change do + alter_table(:lantern_resource) do + add_column :pg_version, Integer, default: 17 + end + run "UPDATE lantern_resource SET pg_version=15" + end +end diff --git a/model/lantern/lantern_server.rb b/model/lantern/lantern_server.rb index 0ca7cd434..faefe348d 100644 --- a/model/lantern/lantern_server.rb +++ b/model/lantern/lantern_server.rb @@ -149,7 +149,8 @@ def configure_hash gcp_creds_walg_b64: walg_config[:gcp_creds_b64], walg_gs_prefix: walg_config[:walg_gs_prefix], gcp_creds_big_query_b64: resource.gcp_creds_b64, - big_query_dataset: Config.lantern_log_dataset + big_query_dataset: Config.lantern_log_dataset, + pg_version: resource.pg_version }) end diff --git a/prog/lantern/lantern_resource_nexus.rb b/prog/lantern/lantern_resource_nexus.rb index 3981aa33e..9a0a2bc2a 100644 --- a/prog/lantern/lantern_resource_nexus.rb +++ b/prog/lantern/lantern_resource_nexus.rb @@ -13,7 +13,7 @@ class Prog::Lantern::LanternResourceNexus < Prog::Base def self.assemble(project_id:, location:, name:, target_vm_size:, target_storage_size_gib:, ubid: LanternResource.generate_ubid, ha_type: LanternResource::HaType::NONE, parent_id: nil, restore_target: nil, recovery_target_lsn: nil, org_id: nil, db_name: "postgres", db_user: "postgres", db_user_password: nil, superuser_password: nil, repl_password: nil, app_env: Config.rack_env, lantern_version: Config.lantern_default_version, extras_version: Config.lantern_extras_default_version, minor_version: Config.lantern_minor_default_version, domain: nil, enable_debug: false, - label: "", version_upgrade: false, logical_replication: false, max_storage_autoresize_gib: 0) + label: "", version_upgrade: false, logical_replication: false, max_storage_autoresize_gib: 0, pg_version: 17) unless (project = Project[project_id]) fail "No existing project" end @@ -82,7 +82,7 @@ def self.assemble(project_id:, location:, name:, target_vm_size:, target_storage restore_target: restore_target, db_name: db_name, db_user: db_user, db_user_password: db_user_password, repl_user: repl_user, repl_password: repl_password, label: label, doctor_id: lantern_doctor.id, recovery_target_lsn: recovery_target_lsn, version_upgrade: version_upgrade, - logical_replication: logical_replication + logical_replication: logical_replication, pg_version: pg_version ) { _1.id = ubid.to_uuid } lantern_resource.associate_with_project(project) diff --git a/rhizome/lantern/bin/configure b/rhizome/lantern/bin/configure index 3825b47ae..6826255a4 100755 --- a/rhizome/lantern/bin/configure +++ b/rhizome/lantern/bin/configure @@ -112,7 +112,7 @@ def setup_initial_compose_file "ports" => ["5432:5432", "6432:6432"], "volumes" => ["/var/lib/lantern-data:/bitnami/postgresql"], "deploy" => { - "restart_policy" => {"condition" => "always"}, + "restart_policy" => {"condition" => "on-failure"}, "resources" => {"limits" => {"memory" => memory_sizes[:shared_bufs]}} }, "env_file" => $env_file, @@ -159,5 +159,5 @@ setup_env puts ".env setted up" setup_initial_compose_file puts "docker-compose.yaml ready" -run_database($configure_hash["container_image"]) +run_database($configure_hash["container_image"], $configure_hash["pg_version"]) puts "database ready" diff --git a/rhizome/lantern/bin/update_docker_image b/rhizome/lantern/bin/update_docker_image index 40a97b4e0..8d6cae12c 100755 --- a/rhizome/lantern/bin/update_docker_image +++ b/rhizome/lantern/bin/update_docker_image @@ -17,4 +17,4 @@ map["services"]["postgresql"]["image"] = container_image File.open($compose_file, "w") { |f| YAML.dump(map, f) } r "sudo docker compose -f #{$compose_file} down" -run_database(container_image) +run_database(container_image, $configure_hash["pg_version"]) diff --git a/rhizome/lantern/lib/common.rb b/rhizome/lantern/lib/common.rb index 35e436251..892bbf2f2 100755 --- a/rhizome/lantern/lib/common.rb +++ b/rhizome/lantern/lib/common.rb @@ -37,9 +37,9 @@ def wait_for_pg end end -def run_database(container_image) +def run_database(container_image, pg_version) # Run database - volume_mount = "#{$pg_mount_path}:/opt/bitnami/postgresql" + volume_mount = "#{$pg_mount_path}:/usr/lib/postgresql/#{pg_version}" # Copy postgres fs to host to mount r "sudo rm -rf #{$pg_mount_path}" data = YAML.load_file $compose_file diff --git a/spec/model/lantern/lantern_server_spec.rb b/spec/model/lantern/lantern_server_spec.rb index 981287a05..3587e0458 100644 --- a/spec/model/lantern/lantern_server_spec.rb +++ b/spec/model/lantern/lantern_server_spec.rb @@ -273,6 +273,7 @@ gcp_creds_b64: "test-creds", recovery_target_lsn: nil, representative_server: lantern_server, + pg_version: 17, restore_target: nil) expect(Config).to receive(:prom_password).and_return("pwd123").at_least(:once) expect(Config).to receive(:gcp_creds_logging_b64).and_return("test-creds").at_least(:once) @@ -315,7 +316,8 @@ gcp_creds_walg_b64: walg_conf[:gcp_creds_b64], walg_gs_prefix: walg_conf[:walg_gs_prefix], gcp_creds_big_query_b64: resource.gcp_creds_b64, - big_query_dataset: Config.lantern_log_dataset + big_query_dataset: Config.lantern_log_dataset, + pg_version: 17 }) expect(lantern_server.configure_hash).to eq(expected_conf) end @@ -339,6 +341,7 @@ gcp_creds_b64: "test-creds", recovery_target_lsn: nil, representative_server: lantern_server, + pg_version: 17, restore_target: Time.now) expect(Config).to receive(:prom_password).and_return("pwd123").at_least(:once) expect(Config).to receive(:gcp_creds_logging_b64).and_return("test-creds").at_least(:once) @@ -381,7 +384,8 @@ gcp_creds_walg_b64: walg_conf[:gcp_creds_b64], walg_gs_prefix: walg_conf[:walg_gs_prefix], gcp_creds_big_query_b64: resource.gcp_creds_b64, - big_query_dataset: Config.lantern_log_dataset + big_query_dataset: Config.lantern_log_dataset, + pg_version: 17 }) expect(lantern_server.configure_hash).to eq(expected_conf) end @@ -405,6 +409,7 @@ gcp_creds_b64: "test-creds", recovery_target_lsn: "16/B374D848", representative_server: lantern_server, + pg_version: 17, restore_target: nil) expect(Config).to receive(:prom_password).and_return("pwd123").at_least(:once) expect(Config).to receive(:gcp_creds_logging_b64).and_return("test-creds").at_least(:once) @@ -446,7 +451,8 @@ gcp_creds_walg_b64: walg_conf[:gcp_creds_b64], walg_gs_prefix: walg_conf[:walg_gs_prefix], gcp_creds_big_query_b64: resource.gcp_creds_b64, - big_query_dataset: Config.lantern_log_dataset + big_query_dataset: Config.lantern_log_dataset, + pg_version: 17 }) expect(lantern_server.configure_hash).to eq(expected_conf) end @@ -470,6 +476,7 @@ gcp_creds_b64: "test-creds", recovery_target_lsn: "16/B374D848", representative_server: lantern_server, + pg_version: 17, restore_target: Time.now) expect(Config).to receive(:prom_password).and_return("pwd123").at_least(:once) expect(Config).to receive(:gcp_creds_logging_b64).and_return("test-creds").at_least(:once) @@ -513,7 +520,8 @@ gcp_creds_walg_b64: walg_conf[:gcp_creds_b64], walg_gs_prefix: walg_conf[:walg_gs_prefix], gcp_creds_big_query_b64: resource.gcp_creds_b64, - big_query_dataset: Config.lantern_log_dataset + big_query_dataset: Config.lantern_log_dataset, + pg_version: 17 }) expect(lantern_server.configure_hash).to eq(expected_conf) end From b578e79a00292ccbfa31e88892e648873e799d53 Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Thu, 7 Nov 2024 18:29:04 +0400 Subject: [PATCH 04/13] add pg_upgrade script --- model/lantern/lantern_resource.rb | 6 +- model/lantern/lantern_server.rb | 10 +- prog/lantern/lantern_resource_nexus.rb | 5 +- prog/lantern/lantern_server_nexus.rb | 59 ++++++++++- rhizome/lantern/bin/configure | 3 - rhizome/lantern/bin/run_pg_upgrade | 47 ++++++++ .../prog/lantern/lantern_server_nexus_spec.rb | 100 +++++++++++++++++- 7 files changed, 215 insertions(+), 15 deletions(-) create mode 100755 rhizome/lantern/bin/run_pg_upgrade diff --git a/model/lantern/lantern_resource.rb b/model/lantern/lantern_resource.rb index 3cca21879..d4aed99e6 100644 --- a/model/lantern/lantern_resource.rb +++ b/model/lantern/lantern_resource.rb @@ -199,7 +199,7 @@ def disable_logical_subscription representative_server.run_query_all("ALTER SUBSCRIPTION sub_#{ubid} DISABLE") end - def create_logical_replica(lantern_version: nil, extras_version: nil, minor_version: nil) + def create_logical_replica(lantern_version: nil, extras_version: nil, minor_version: nil, pg_upgrade: nil) # TODO:: # 1. If new database will be created during logical replication it won't be added automatically # 2. New timeline will be generated for lantern resource @@ -224,7 +224,9 @@ def create_logical_replica(lantern_version: nil, extras_version: nil, minor_vers logical_replication: true, lantern_version: lantern_version || representative_server.lantern_version, extras_version: extras_version || representative_server.extras_version, - minor_version: minor_version || representative_server.minor_version + minor_version: minor_version || representative_server.minor_version, + pg_version: pg_version, + pg_upgrade: pg_upgrade ) end diff --git a/model/lantern/lantern_server.rb b/model/lantern/lantern_server.rb index faefe348d..e8250d43c 100644 --- a/model/lantern/lantern_server.rb +++ b/model/lantern/lantern_server.rb @@ -99,6 +99,10 @@ def instance_type standby? ? "reader" : "writer" end + def container_image + "#{Config.gcr_image}:lantern-#{lantern_version}-extras-#{extras_version}-minor-#{minor_version}" + end + def configure_hash walg_config = timeline.generate_walg_config backup_label = "" @@ -142,7 +146,7 @@ def configure_hash prom_password: Config.prom_password, gcp_creds_coredumps_b64: Config.gcp_creds_coredumps_b64, gcp_creds_logging_b64: Config.gcp_creds_logging_b64, - container_image: "#{Config.gcr_image}:lantern-#{lantern_version}-extras-#{extras_version}-minor-#{minor_version}", + container_image: container_image, postgresql_recover_from_backup: backup_label, postgresql_recovery_target_time: postgresql_recovery_target_time, postgresql_recovery_target_lsn: postgresql_recovery_target_lsn, @@ -174,10 +178,6 @@ def update_walg_creds ])) end - def container_image - "#{Config.gcr_image}:lantern-#{lantern_version}-extras-#{extras_version}-minor-#{minor_version}" - end - def init_health_monitor_session if strand.label != "wait" fail "server is not ready to initialize session" diff --git a/prog/lantern/lantern_resource_nexus.rb b/prog/lantern/lantern_resource_nexus.rb index 9a0a2bc2a..e2cc8030f 100644 --- a/prog/lantern/lantern_resource_nexus.rb +++ b/prog/lantern/lantern_resource_nexus.rb @@ -13,7 +13,7 @@ class Prog::Lantern::LanternResourceNexus < Prog::Base def self.assemble(project_id:, location:, name:, target_vm_size:, target_storage_size_gib:, ubid: LanternResource.generate_ubid, ha_type: LanternResource::HaType::NONE, parent_id: nil, restore_target: nil, recovery_target_lsn: nil, org_id: nil, db_name: "postgres", db_user: "postgres", db_user_password: nil, superuser_password: nil, repl_password: nil, app_env: Config.rack_env, lantern_version: Config.lantern_default_version, extras_version: Config.lantern_extras_default_version, minor_version: Config.lantern_minor_default_version, domain: nil, enable_debug: false, - label: "", version_upgrade: false, logical_replication: false, max_storage_autoresize_gib: 0, pg_version: 17) + label: "", version_upgrade: false, logical_replication: false, max_storage_autoresize_gib: 0, pg_version: 17, pg_upgrade: nil) unless (project = Project[project_id]) fail "No existing project" end @@ -97,7 +97,8 @@ def self.assemble(project_id:, location:, name:, target_vm_size:, target_storage timeline_id: timeline_id, timeline_access: timeline_access, max_storage_autoresize_gib: max_storage_autoresize_gib, - representative_at: Time.now + representative_at: Time.now, + pg_upgrade: pg_upgrade ) lantern_resource.required_standby_count.times do diff --git a/prog/lantern/lantern_server_nexus.rb b/prog/lantern/lantern_server_nexus.rb index 7a38920de..29abfc174 100644 --- a/prog/lantern/lantern_server_nexus.rb +++ b/prog/lantern/lantern_server_nexus.rb @@ -10,13 +10,13 @@ class Prog::Lantern::LanternServerNexus < Prog::Base extend Forwardable def_delegators :lantern_server, :vm - semaphore :initial_provisioning, :update_user_password, :update_lantern_extension, :update_extras_extension, :update_image, :add_domain, :update_rhizome, :checkup + semaphore :initial_provisioning, :update_user_password, :update_lantern_extension, :update_extras_extension, :update_image, :add_domain, :update_rhizome, :checkup, :run_pg_upgrade semaphore :start_server, :stop_server, :restart_server, :take_over, :destroy, :update_storage_size, :update_vm_size, :update_memory_limits, :init_sql, :restart, :container_stopped, :setup_ssl def self.assemble( resource_id: nil, lantern_version: "0.2.2", extras_version: "0.1.4", minor_version: "1", domain: nil, timeline_access: "push", representative_at: nil, target_vm_size: nil, target_storage_size_gib: 50, timeline_id: nil, - max_storage_autoresize_gib: 0 + max_storage_autoresize_gib: 0, pg_upgrade: nil ) DB.transaction do @@ -54,6 +54,10 @@ def self.assemble( ) stack_frame = domain.nil? ? {} : {domain: domain} + + if pg_upgrade + stack_frame["pg_upgrade"] = pg_upgrade + end Strand.create(prog: "Lantern::LanternServerNexus", label: "start", stack: [stack_frame]) { _1.id = lantern_server.id } end end @@ -198,6 +202,8 @@ def before_run end end + current_frame = strand.stack.first + if !is_in_recovery timeline_id = Prog::Lantern::LanternTimelineNexus.assemble(parent_id: lantern_server.timeline.id).id lantern_server.timeline_id = timeline_id @@ -218,6 +224,8 @@ def before_run incr_update_extras_extension lantern_server.update(extras_version: extras_version) end + elsif !current_frame["pg_upgrade"].nil? + incr_run_pg_upgrade end hop_wait_timeline_available @@ -226,6 +234,49 @@ def before_run nap 5 end + label def run_pg_upgrade + decr_run_pg_upgrade + current_frame = strand.stack.first + resource = lantern_server.resource + pg_upgrade_info = current_frame["pg_upgrade"] + # prepare files + lantern_server.update( + lantern_version: pg_upgrade_info["lantern_version"], + extras_version: pg_upgrade_info["extras_version"], + minor_version: pg_upgrade_info["minor_version"] + ) + vm.sshable.cmd( + "common/bin/daemonizer 'sudo lantern/bin/run_pg_upgrade' pg_upgrade", + stdin: JSON.generate({ + container_image: lantern_server.container_image, + old_pg_version: resource.pg_version + }) + ) + resource.update(pg_version: pg_upgrade_info["pg_version"]) + # run scripts + hop_wait_pg_upgrade + end + + label def wait_pg_upgrade + current_frame = strand.stack.first + case vm.sshable.cmd("common/bin/daemonizer --check pg_upgrade") + when "Succeeded" + current_frame.delete("pg_upgrade") + strand.modified!(:stack) + strand.save_changes + vm.sshable.cmd("common/bin/daemonizer --clean pg_upgrade") + register_deadline(:wait, 40 * 60) + hop_init_sql + when "Failed" + logs = JSON.parse(vm.sshable.cmd("common/bin/daemonizer --logs pg_upgrade")) + Clog.emit("Postgres upgrade failed") { {logs: logs, name: lantern_server.resource.name, lantern_server: lantern_server.id} } + Prog::PageNexus.assemble_with_logs("Postgres update failed on #{lantern_server.resource.name} (#{lantern_server.resource.label})", [lantern_server.resource.ubid, lantern_server.ubid], logs, "critical", "LanternPGUpgradeFailed", lantern_server.ubid) + vm.sshable.cmd("common/bin/daemonizer --clean pg_upgrade") + hop_wait + end + nap 10 + end + label def wait_timeline_available nap 10 if lantern_server.timeline.strand.label == "start" lantern_server.update_walg_creds @@ -419,6 +470,10 @@ def remove_domain_from_stack end end + when_run_pg_upgrade_set? do + hop_run_pg_upgrade + end + when_update_user_password_set? do hop_update_user_password end diff --git a/rhizome/lantern/bin/configure b/rhizome/lantern/bin/configure index 6826255a4..eaae037cf 100755 --- a/rhizome/lantern/bin/configure +++ b/rhizome/lantern/bin/configure @@ -7,9 +7,6 @@ require_relative "../../common/lib/util" require_relative "../lib/common" $configure_hash = JSON.parse($stdin.read) -File.open("test.json", "a") do |f| - f.puts($configure_hash.to_json) -end def install_dependencies # Install dependencies diff --git a/rhizome/lantern/bin/run_pg_upgrade b/rhizome/lantern/bin/run_pg_upgrade new file mode 100755 index 000000000..29b1887ef --- /dev/null +++ b/rhizome/lantern/bin/run_pg_upgrade @@ -0,0 +1,47 @@ +#!/bin/env ruby +# frozen_string_literal: true + +require "json" +require "yaml" +require_relative "../../common/lib/util" +require_relative "../lib/common" + +$configure_hash = JSON.parse($stdin.read) +data = YAML.load_file $compose_file + +container_image = $configure_hash["container_image"] +old_pg_version = $configure_hash["old_pg_version"] +current_container_image = data["services"]["postgresql"]["image"] + +r "sudo docker compose -f #{$compose_file} down -t 10" +r "sudo mv #{$datadir}/data #{$datadir}/old-data-#{old_pg_version}" +r "sudo docker rm -f tc 2>/dev/null || true" +r "sudo docker create --name tc #{current_container_image}" +r "sudo docker cp tc:/usr/lib/postgresql/#{old_pg_version}/lib #{$datadir}/old-lib-#{old_pg_version}" +r "sudo docker cp tc:/usr/lib/postgresql/#{old_pg_version}/bin #{$datadir}/old-bin-#{old_pg_version}" +r "sudo docker cp tc:/usr/share/postgresql/#{old_pg_version} #{$datadir}/old-share-#{old_pg_version}" +r "sudo docker rm tc" +r "sudo chown -R 1001:1001 #{$datadir}" + +append_env([ + ["POSTGRESQL_RUN_PGUPGRADE", "yes"], + ["PGUPGRADE_OLD_VERSION", old_pg_version] +]) + +data["services"]["postgresql"]["image"] = container_image +data["services"]["postgresql"]["user"] = "root" +File.open($compose_file, "w") { |f| YAML.dump(data, f) } + +r "sudo docker compose -f #{$compose_file} up" + +data = YAML.load_file $compose_file +data["services"]["postgresql"].delete("user") +r "sudo rm -rf #{$datadir}/old-lib-#{old_pg_version} #{$datadir}/old-bin-#{old_pg_version} #{$datadir}/old-share-#{old_pg_version}" +r "sudo chown -R 1001:1001 #{$datadir}" +File.open($compose_file, "w") { |f| YAML.dump(data, f) } + +append_env([ + ["POSTGRESQL_RUN_PGUPGRADE", "no"] +]) + +r "sudo docker compose -f #{$compose_file} up -d" diff --git a/spec/prog/lantern/lantern_server_nexus_spec.rb b/spec/prog/lantern/lantern_server_nexus_spec.rb index 74fc1f3e3..640a1dc3b 100644 --- a/spec/prog/lantern/lantern_server_nexus_spec.rb +++ b/spec/prog/lantern/lantern_server_nexus_spec.rb @@ -18,12 +18,14 @@ resource: instance_double(LanternResource, org_id: 0, name: "test", + label: "none", db_name: "postgres", db_user: "postgres", service_account_name: "test-sa", gcp_creds_b64: "test-creds", version_upgrade: false, - superuser_password: "pwd123"), + superuser_password: "pwd123", + pg_version: 15), vm: instance_double( GcpVm, id: "104b0033-b3f6-8214-ae27-0cd3cef18ce4", @@ -76,6 +78,31 @@ expect(lantern_server).not_to be_nil end + it "creates lantern server as primary with upgrade info" do + project = Project.create_with_id(name: "default", provider: "gcp").tap { _1.associate_with_project(_1) } + lantern_resource = instance_double(LanternResource, + name: "test", + project_id: project.id, + location: "us-central1") + + expect(LanternResource).to receive(:[]).and_return(lantern_resource) + + st = described_class.assemble( + resource_id: "6ae7e513-c34a-8039-a72a-7be45b53f2a0", + lantern_version: "0.2.0", + extras_version: "0.1.3", + minor_version: "2", + target_vm_size: "n1-standard-2", + target_storage_size_gib: 50, + representative_at: Time.now, + domain: "db.lantern.dev", + pg_upgrade: {"lantern_version" => "0.5.0", "extras_version" => "0.5.0", "minor_version" => "1", "pg_version" => 17} + ) + + lantern_server = LanternServer[st.id] + expect(lantern_server).not_to be_nil + end + it "creates lantern server as standby" do project = Project.create_with_id(name: "default", provider: "gcp").tap { _1.associate_with_project(_1) } lantern_resource = instance_double(LanternResource, @@ -386,6 +413,20 @@ expect { nx.wait_recovery_completion }.to hop("wait_timeline_available") end + it "run pg_upgrade if frame has pg_upgrade info" do + expect(lantern_server.resource).to receive(:allow_timeline_access_to_bucket) + expect(lantern_server).to receive(:run_query).and_return("f") + expect(lantern_server).to receive(:timeline_id=) + expect(lantern_server).to receive(:timeline_access=).with("push") + expect(lantern_server).to receive(:save_changes) + frame = {"pg_upgrade" => {"lantern_version" => "0.5.0", "extras_version" => "0.5.0", "minor_version" => "1", "pg_version" => 17}} + expect(nx.strand).to receive(:stack).and_return([frame]).at_least(:once) + expect(lantern_server.resource).to receive(:version_upgrade).and_return(true) + expect(Prog::Lantern::LanternTimelineNexus).to receive(:assemble).and_return(instance_double(Strand, id: "104b0033-b3f6-8214-ae27-0cd3cef18ce5")) + expect(nx).to receive(:incr_run_pg_upgrade) + expect { nx.wait_recovery_completion }.to hop("wait_timeline_available") + end + it "nap 5" do expect(lantern_server).to receive(:run_query).and_return("t", "unk") expect { nx.wait_recovery_completion }.to nap(5) @@ -687,6 +728,11 @@ expect { nx.wait }.to hop("restart_server") end + it "hops to run_pg_upgrade" do + nx.incr_run_pg_upgrade + expect { nx.wait }.to hop("run_pg_upgrade") + end + it "hops to start_server" do nx.incr_start_server expect { nx.wait }.to hop("start_server") @@ -1026,4 +1072,56 @@ expect { nx.container_stopped }.to nap(15) end end + + describe "#run_pg_upgrade" do + it "runs pg_upgrade" do + expect(nx).to receive(:decr_run_pg_upgrade) + image = "#{Config.gcr_image}:lantern-0.5.0-extras-0.5.0-minor-1" + frame = {"pg_upgrade" => {"lantern_version" => "0.5.0", "extras_version" => "0.5.0", "minor_version" => "1", "pg_version" => 17}} + expect(nx.strand).to receive(:stack).and_return([frame]).at_least(:once) + expect(lantern_server).to receive(:update).with(extras_version: "0.5.0", lantern_version: "0.5.0", minor_version: "1") + expect(lantern_server.resource).to receive(:update).with(pg_version: 17) + expect(lantern_server).to receive(:container_image).and_return(image).at_least(:once) + expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer 'sudo lantern/bin/run_pg_upgrade' pg_upgrade", stdin: JSON.generate( + container_image: lantern_server.container_image, + old_pg_version: lantern_server.resource.pg_version + )) + + expect { nx.run_pg_upgrade }.to hop("wait_pg_upgrade") + end + end + + describe "#wait_pg_upgrade" do + it "waits pg_upgrade and nap" do + frame = {"pg_upgrade" => {"lantern_version" => "0.5.0", "extras_version" => "0.5.0", "minor_version" => "1", "pg_version" => 17}} + expect(nx.strand).to receive(:stack).and_return([frame]).at_least(:once) + expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --check pg_upgrade").and_return("InProgress") + expect { nx.wait_pg_upgrade }.to nap 10 + end + + it "waits pg_upgrade and fail" do + frame = {"pg_upgrade" => {"lantern_version" => "0.5.0", "extras_version" => "0.5.0", "minor_version" => "1", "pg_version" => 17}} + expect(nx.strand).to receive(:stack).and_return([frame]).at_least(:once) + expect(lantern_server.resource).to receive(:ubid).and_return("test").at_least(:once) + expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --check pg_upgrade").and_return("Failed") + + logs = {"stdout" => "", "stderr" => "error happened"} + expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --logs pg_upgrade").and_return(JSON.generate(logs)) + expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --clean pg_upgrade") + expect(Prog::PageNexus).to receive(:assemble_with_logs).with("Postgres update failed on #{lantern_server.resource.name} (#{lantern_server.resource.label})", [lantern_server.resource.ubid, lantern_server.ubid], logs, "critical", "LanternPGUpgradeFailed", lantern_server.ubid) + expect { nx.wait_pg_upgrade }.to hop("wait") + end + + it "waits pg_upgrade and succeed" do + frame = {"pg_upgrade" => {"lantern_version" => "0.5.0", "extras_version" => "0.5.0", "minor_version" => "1", "pg_version" => 17}} + expect(nx.strand).to receive(:stack).and_return([frame]).at_least(:once) + expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --check pg_upgrade").and_return("Succeeded") + expect(frame).to receive(:delete).with("pg_upgrade") + expect(nx.strand).to receive(:modified!).with(:stack) + expect(nx.strand).to receive(:save_changes) + expect(nx).to receive(:register_deadline).with(:wait, 40 * 60) + expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --clean pg_upgrade") + expect { nx.wait_pg_upgrade }.to hop("init_sql") + end + end end From 4fc60f9fa6e7ed76de813b5862003c77e19981af Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Fri, 8 Nov 2024 18:58:49 +0400 Subject: [PATCH 05/13] fix pg_upgrade script, add dns failover mechanism and set it as default --- config.rb | 1 + model/lantern/lantern_resource.rb | 18 ++- model/lantern/lantern_server.rb | 26 +++- prog/lantern/lantern_resource_nexus.rb | 59 ++++++++- prog/lantern/lantern_server_nexus.rb | 50 +++++--- prog/lantern/lantern_timeline_nexus.rb | 2 +- rhizome/lantern/bin/run_pg_upgrade | 5 +- rhizome/lantern/lib/common.rb | 2 +- spec/model/lantern/lantern_resource_spec.rb | 23 +++- spec/model/lantern/lantern_server_spec.rb | 54 +++++++- .../lantern/lantern_resource_nexus_spec.rb | 115 +++++++++++++++--- .../prog/lantern/lantern_server_nexus_spec.rb | 38 +++++- .../lantern/lantern_timeline_nexus_spec.rb | 1 + 13 files changed, 342 insertions(+), 52 deletions(-) diff --git a/config.rb b/config.rb index 1ce786887..8e8ae96bc 100644 --- a/config.rb +++ b/config.rb @@ -152,6 +152,7 @@ def self.e2e_test? override :lantern_backup_bucket, "walg-dev-backups" override :e2e_test, "0" override :backup_retention_days, 7, int + override :backup_retention_days_after_deletion, 0, int override :lantern_log_dataset, "lantern_logs", string override :compose_file, "/var/lib/lantern/docker-compose.yaml", string diff --git a/model/lantern/lantern_resource.rb b/model/lantern/lantern_resource.rb index d4aed99e6..800196663 100644 --- a/model/lantern/lantern_resource.rb +++ b/model/lantern/lantern_resource.rb @@ -21,7 +21,7 @@ class LanternResource < Sequel::Model include Authorization::HyperTagMethods include Authorization::TaggableMethods - semaphore :destroy, :swap_leaders_with_parent + semaphore :destroy, :swap_leaders_with_parent, :switchover_with_parent plugin :column_encryption do |enc| enc.column :superuser_password @@ -74,8 +74,13 @@ def dissociate_forks def setup_service_account api = Hosting::GcpApis.new service_account = api.create_service_account("lt-#{ubid}", "Service Account for Lantern #{name}") - key = api.export_service_account_key(service_account["email"]) - update(gcp_creds_b64: key, service_account_name: service_account["email"]) + update(service_account_name: service_account["email"]) + end + + def export_service_account_key + api = Hosting::GcpApis.new + key = api.export_service_account_key(service_account_name) + update(gcp_creds_b64: key) end def allow_timeline_access_to_bucket @@ -130,6 +135,13 @@ def create_ddl_log representative_server.run_query_all(commands) end + def drop_ddl_log_trigger + commands = < "on-failure"} r "sudo rm -rf #{$datadir}/old-lib-#{old_pg_version} #{$datadir}/old-bin-#{old_pg_version} #{$datadir}/old-share-#{old_pg_version}" r "sudo chown -R 1001:1001 #{$datadir}" File.open($compose_file, "w") { |f| YAML.dump(data, f) } @@ -44,4 +47,4 @@ append_env([ ["POSTGRESQL_RUN_PGUPGRADE", "no"] ]) -r "sudo docker compose -f #{$compose_file} up -d" +run_database(container_image, pg_version) diff --git a/rhizome/lantern/lib/common.rb b/rhizome/lantern/lib/common.rb index 892bbf2f2..b3b192eca 100755 --- a/rhizome/lantern/lib/common.rb +++ b/rhizome/lantern/lib/common.rb @@ -47,7 +47,7 @@ def run_database(container_image, pg_version) File.open($compose_file, "w") { |f| YAML.dump(data, f) } r "sudo docker rm -f tc 2>/dev/null || true" r "sudo docker create --name tc #{container_image}" - r "sudo docker cp tc:/opt/bitnami/postgresql #{$pg_mount_path}" + r "sudo docker cp tc:/usr/lib/postgresql/#{pg_version} #{$pg_mount_path}" r "sudo docker rm tc" r "sudo chown -R 1001:1001 #{$pg_mount_path}" # Mount extension dir, so we can make automatic updates from host diff --git a/spec/model/lantern/lantern_resource_spec.rb b/spec/model/lantern/lantern_resource_spec.rb index 72b5c4e5b..e2a2e2c4d 100644 --- a/spec/model/lantern/lantern_resource_spec.rb +++ b/spec/model/lantern/lantern_resource_spec.rb @@ -92,12 +92,22 @@ it "sets up service account and updates resource" do api = instance_double(Hosting::GcpApis) allow(Hosting::GcpApis).to receive(:new).and_return(api) - allow(api).to receive_messages(create_service_account: {"email" => "test-sa"}, export_service_account_key: "test-key") - expect(lantern_resource).to receive(:update).with(gcp_creds_b64: "test-key", service_account_name: "test-sa") + allow(api).to receive_messages(create_service_account: {"email" => "test-sa"}) + expect(lantern_resource).to receive(:update).with(service_account_name: "test-sa") expect { lantern_resource.setup_service_account }.not_to raise_error end end + describe "#export_service_account_key" do + it "exports service account key and updates resource" do + api = instance_double(Hosting::GcpApis) + allow(Hosting::GcpApis).to receive(:new).and_return(api) + allow(api).to receive_messages(export_service_account_key: "test-key") + expect(lantern_resource).to receive(:update).with(gcp_creds_b64: "test-key") + expect { lantern_resource.export_service_account_key }.not_to raise_error + end + end + describe "#create_logging_table" do it "create bigquery table and gives access" do instance_double(LanternTimeline, ubid: "test") @@ -144,6 +154,15 @@ end end + describe "#drop_ddl_log_trigger" do + it "drops ddl log trigger" do + representative_server = instance_double(LanternServer) + expect(lantern_resource).to receive(:representative_server).and_return(representative_server).at_least(:once) + expect(lantern_resource.representative_server).to receive(:run_query_all).with(a_string_matching(/DROP .* log_ddl_trigger/)) + expect { lantern_resource.drop_ddl_log_trigger }.not_to raise_error + end + end + describe "#listen_ddl_log" do it "listends ddl log table" do representative_server = instance_double(LanternServer) diff --git a/spec/model/lantern/lantern_server_spec.rb b/spec/model/lantern/lantern_server_spec.rb index 3587e0458..6d4702736 100644 --- a/spec/model/lantern/lantern_server_spec.rb +++ b/spec/model/lantern/lantern_server_spec.rb @@ -146,19 +146,19 @@ end it "runs query on vm" do - expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo docker compose -f /var/lib/lantern/docker-compose.yaml exec -T postgresql psql -q -U postgres -t --csv postgres", stdin: "SELECT 1").and_return("1\n") + expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo docker compose -f /var/lib/lantern/docker-compose.yaml exec -T postgresql psql -q -U postgres -t --csv -v ON_ERROR_STOP=1 postgres", stdin: "SELECT 1").and_return("1\n") expect(lantern_server.run_query("SELECT 1")).to eq("1") end it "runs query on vm with different user and db" do - expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo docker compose -f /var/lib/lantern/docker-compose.yaml exec -T postgresql psql -q -U lantern -t --csv db2", stdin: "SELECT 1").and_return("1\n") + expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo docker compose -f /var/lib/lantern/docker-compose.yaml exec -T postgresql psql -q -U lantern -t --csv -v ON_ERROR_STOP=1 db2", stdin: "SELECT 1").and_return("1\n") expect(lantern_server.run_query("SELECT 1", db: "db2", user: "lantern")).to eq("1") end it "runs query on vm for all databases" do expect(lantern_server).to receive(:list_all_databases).and_return(["postgres", "db2"]) - expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo docker compose -f /var/lib/lantern/docker-compose.yaml exec -T postgresql psql -q -U postgres -t --csv postgres", stdin: "SELECT 1").and_return("1\n") - expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo docker compose -f /var/lib/lantern/docker-compose.yaml exec -T postgresql psql -q -U postgres -t --csv db2", stdin: "SELECT 1").and_return("2\n") + expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo docker compose -f /var/lib/lantern/docker-compose.yaml exec -T postgresql psql -q -U postgres -t --csv -v ON_ERROR_STOP=1 postgres", stdin: "SELECT 1").and_return("1\n") + expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo docker compose -f /var/lib/lantern/docker-compose.yaml exec -T postgresql psql -q -U postgres -t --csv -v ON_ERROR_STOP=1 db2", stdin: "SELECT 1").and_return("2\n") expect(lantern_server.run_query_all("SELECT 1")).to eq( [ ["postgres", "1"], @@ -783,4 +783,50 @@ expect(lantern_server.query_string).to be_nil end end + + describe "#swap_dns" do + it "swaps domains with another server" do + frame = {} + other = instance_double(described_class) + strand = instance_double(Strand) + expect(lantern_server).to receive(:strand).and_return(strand).at_least(:once) + expect(lantern_server.strand).to receive(:stack).and_return([frame]).at_least(:once) + expect(lantern_server.strand).to receive(:modified!).with(:stack) + expect(lantern_server.strand).to receive(:save_changes) + expect(lantern_server).to receive(:incr_add_domain) + expect(other).to receive(:domain).and_return("test") + expect(other).to receive(:update).with(domain: nil) + expect { lantern_server.swap_dns(other) }.not_to raise_error + end + end + + describe "#is_dns_correct?" do + it "returns true if host matches ip" do + expect(lantern_server).to receive(:domain).and_return("test-domain").at_least(:once) + expect(vm.sshable).to receive(:host).and_return("127.0.0.1").at_least(:once) + expect(Resolv).to receive(:getaddress).with("test-domain").and_return("127.0.0.1").at_least(:once) + expect(lantern_server.is_dns_correct?).to be(true) + end + + it "returns false if host does not match the ip" do + expect(lantern_server).to receive(:domain).and_return("test-domain").at_least(:once) + expect(vm.sshable).to receive(:host).and_return("127.0.0.1").at_least(:once) + expect(Resolv).to receive(:getaddress).with("test-domain").and_return("127.0.1.1").at_least(:once) + expect(lantern_server.is_dns_correct?).to be(false) + end + end + + describe "#stop_container" do + it "stops docker container" do + expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo docker compose -f #{Config.compose_file} down -t 60 || true") + expect { lantern_server.stop_container }.not_to raise_error + end + end + + describe "#start_container" do + it "starts docker container" do + expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo docker compose -f #{Config.compose_file} up -d") + expect { lantern_server.start_container }.not_to raise_error + end + end end diff --git a/spec/prog/lantern/lantern_resource_nexus_spec.rb b/spec/prog/lantern/lantern_resource_nexus_spec.rb index 33437db16..74f799e08 100644 --- a/spec/prog/lantern/lantern_resource_nexus_spec.rb +++ b/spec/prog/lantern/lantern_resource_nexus_spec.rb @@ -126,31 +126,54 @@ end describe "#start" do - it "sets up gcp service account and allows bucket usage" do - expect(lantern_resource).to receive(:setup_service_account) - expect(lantern_resource).to receive(:create_logging_table) + it "hops to setup_service_account" do + expect { nx.start }.to hop("setup_service_account") + end + + # it "buds trigger_pg_current_xact_id_on_parent if it has parent" do + # expect(lantern_resource.representative_server.vm).to receive(:strand).and_return(instance_double(Strand, label: "wait")) + # expect(nx).to receive(:register_deadline) + # expect(lantern_resource).to receive(:parent).and_return(instance_double(LanternResource)) + # expect(nx).to receive(:bud).with(described_class, {}, :trigger_pg_current_xact_id_on_parent) + # expect { nx.start }.to hop("wait_servers") + # end + end + + describe "#setup_timeline_access" do + it "allows bucket usage" do expect(lantern_resource).to receive(:parent_id).and_return("test-parent") expect(lantern_resource).not_to receive(:allow_timeline_access_to_bucket) expect(nx).to receive(:register_deadline) - expect { nx.start }.to hop("wait_servers") + expect { nx.setup_timeline_access }.to hop("wait_servers") end it "sets up gcp service account" do - expect(lantern_resource).to receive(:setup_service_account) - expect(lantern_resource).to receive(:create_logging_table) expect(lantern_resource).to receive(:parent_id).and_return(nil) expect(lantern_resource).to receive(:allow_timeline_access_to_bucket) expect(nx).to receive(:register_deadline) - expect { nx.start }.to hop("wait_servers") + expect { nx.setup_timeline_access }.to hop("wait_servers") end + end - # it "buds trigger_pg_current_xact_id_on_parent if it has parent" do - # expect(lantern_resource.representative_server.vm).to receive(:strand).and_return(instance_double(Strand, label: "wait")) - # expect(nx).to receive(:register_deadline) - # expect(lantern_resource).to receive(:parent).and_return(instance_double(LanternResource)) - # expect(nx).to receive(:bud).with(described_class, {}, :trigger_pg_current_xact_id_on_parent) - # expect { nx.start }.to hop("wait_servers") - # end + describe "#create_logging_table" do + it "hops to setup_timeline_access" do + expect(lantern_resource).to receive(:create_logging_table) + expect { nx.create_logging_table }.to hop("setup_timeline_access") + end + end + + describe "#setup_service_account" do + it "hops to export_service_account_key" do + expect(lantern_resource).to receive(:setup_service_account) + expect { nx.setup_service_account }.to hop("export_service_account_key") + end + end + + describe "#export_service_account_key" do + it "hops to create_logging_table" do + expect(lantern_resource).to receive(:export_service_account_key) + expect { nx.export_service_account_key }.to hop("create_logging_table") + end end # describe "#wait_trigger_pg_current_xact_id_on_parent" do @@ -226,6 +249,16 @@ expect { nx.wait }.to nap(30) end + it "naps if no parent on swap_dns" do + expect(lantern_resource).to receive(:required_standby_count).and_return(0) + expect(lantern_resource).to receive(:display_state).and_return(nil) + expect(lantern_resource).to receive(:servers).and_return([instance_double(LanternServer, strand: instance_double(Strand, label: "wait"))]).at_least(:once) + expect(nx).to receive(:when_switchover_with_parent_set?).and_yield + expect(lantern_resource).to receive(:parent).and_return(nil) + expect(nx).to receive(:decr_switchover_with_parent) + expect { nx.wait }.to nap(30) + end + it "hops to swap_leaders" do expect(lantern_resource).to receive(:required_standby_count).and_return(0) expect(lantern_resource).to receive(:display_state).and_return(nil) @@ -237,6 +270,18 @@ expect(lantern_resource).to receive(:update).with(display_state: "failover") expect { nx.wait }.to hop("swap_leaders_with_parent") end + + it "hops to swap_dns" do + expect(lantern_resource).to receive(:required_standby_count).and_return(0) + expect(lantern_resource).to receive(:display_state).and_return(nil) + expect(lantern_resource).to receive(:servers).and_return([instance_double(LanternServer, strand: instance_double(Strand, label: "wait"))]).at_least(:once) + expect(nx).to receive(:when_switchover_with_parent_set?).and_yield + parent = instance_double(LanternResource) + expect(lantern_resource).to receive(:parent).and_return(parent).at_least(:once) + expect(parent).to receive(:update).with(display_state: "failover") + expect(lantern_resource).to receive(:update).with(display_state: "failover") + expect { nx.wait }.to hop("switchover_with_parent") + end end describe "#destroy" do @@ -336,4 +381,46 @@ expect { nx.update_hosts }.to hop("wait") end end + + describe "#switchover_with_parent" do + it "sets parent to readonly and hop" do + parent = instance_double(LanternResource) + expect(lantern_resource).to receive(:parent).and_return(parent) + expect(parent).to receive(:set_to_readonly) + expect(nx).to receive(:decr_switchover_with_parent) + + expect { nx.switchover_with_parent }.to hop("disable_logical_subscription") + end + end + + describe "#disable_logical_subscription" do + it "disables susbcription and hop" do + expect(lantern_resource).to receive(:disable_logical_subscription) + expect { nx.disable_logical_subscription }.to hop("sync_sequences_with_parent") + end + end + + describe "#sync_sequences_with_parent" do + it "syncs sequences and hop" do + expect(lantern_resource).to receive(:sync_sequences_with_parent) + expect { nx.sync_sequences_with_parent }.to hop("switch_dns_with_parent") + end + end + + describe "#switch_dns_with_parent" do + it "hops to wait_servers" do + parent = instance_double(LanternResource, representative_server: instance_double(LanternServer, domain: nil)) + expect(lantern_resource).to receive(:parent).and_return(parent).at_least(:once) + expect(lantern_resource.parent.representative_server).to receive(:stop_container) + expect { nx.switch_dns_with_parent }.to hop("wait_servers") + end + + it "switches dns with parent and hop to wait_servers" do + parent = instance_double(LanternResource, representative_server: instance_double(LanternServer, domain: "test-domain")) + expect(lantern_resource).to receive(:parent).and_return(parent).at_least(:once) + expect(lantern_resource.parent.representative_server).to receive(:stop_container) + expect(lantern_resource.representative_server).to receive(:swap_dns).with(parent.representative_server) + expect { nx.switch_dns_with_parent }.to hop("wait_servers") + end + end end diff --git a/spec/prog/lantern/lantern_server_nexus_spec.rb b/spec/prog/lantern/lantern_server_nexus_spec.rb index 640a1dc3b..5d65eb600 100644 --- a/spec/prog/lantern/lantern_server_nexus_spec.rb +++ b/spec/prog/lantern/lantern_server_nexus_spec.rb @@ -1024,7 +1024,7 @@ expect(current_master.vm.sshable).to receive(:cmd) expect(current_master).to receive(:incr_container_stopped) - expect { nx.take_over }.to hop("swap_ip") + expect { nx.take_over }.to hop("swap_dns") end it "swap ips" do @@ -1079,11 +1079,11 @@ image = "#{Config.gcr_image}:lantern-0.5.0-extras-0.5.0-minor-1" frame = {"pg_upgrade" => {"lantern_version" => "0.5.0", "extras_version" => "0.5.0", "minor_version" => "1", "pg_version" => 17}} expect(nx.strand).to receive(:stack).and_return([frame]).at_least(:once) - expect(lantern_server).to receive(:update).with(extras_version: "0.5.0", lantern_version: "0.5.0", minor_version: "1") - expect(lantern_server.resource).to receive(:update).with(pg_version: 17) expect(lantern_server).to receive(:container_image).and_return(image).at_least(:once) + expect(lantern_server.resource).to receive(:drop_ddl_log_trigger) expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer 'sudo lantern/bin/run_pg_upgrade' pg_upgrade", stdin: JSON.generate( container_image: lantern_server.container_image, + pg_version: 17, old_pg_version: lantern_server.resource.pg_version )) @@ -1107,7 +1107,6 @@ logs = {"stdout" => "", "stderr" => "error happened"} expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --logs pg_upgrade").and_return(JSON.generate(logs)) - expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --clean pg_upgrade") expect(Prog::PageNexus).to receive(:assemble_with_logs).with("Postgres update failed on #{lantern_server.resource.name} (#{lantern_server.resource.label})", [lantern_server.resource.ubid, lantern_server.ubid], logs, "critical", "LanternPGUpgradeFailed", lantern_server.ubid) expect { nx.wait_pg_upgrade }.to hop("wait") end @@ -1116,12 +1115,41 @@ frame = {"pg_upgrade" => {"lantern_version" => "0.5.0", "extras_version" => "0.5.0", "minor_version" => "1", "pg_version" => 17}} expect(nx.strand).to receive(:stack).and_return([frame]).at_least(:once) expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --check pg_upgrade").and_return("Succeeded") + expect(lantern_server).to receive(:update).with(extras_version: "0.5.0", lantern_version: "0.5.0", minor_version: "1") + expect(lantern_server.resource).to receive(:update).with(pg_version: 17) expect(frame).to receive(:delete).with("pg_upgrade") expect(nx.strand).to receive(:modified!).with(:stack) expect(nx.strand).to receive(:save_changes) expect(nx).to receive(:register_deadline).with(:wait, 40 * 60) - expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --clean pg_upgrade") expect { nx.wait_pg_upgrade }.to hop("init_sql") end end + + describe "#swap_dns" do + it "calls swap dns with representative_server" do + leader = instance_double(LanternServer) + expect(lantern_server.resource).to receive(:representative_server).and_return(leader) + expect(lantern_server).to receive(:swap_dns).with(leader) + expect { nx.swap_dns }.to hop("wait") + end + end + + describe "#wait_swap_dns" do + it "naps 10" do + expect(lantern_server).to receive(:is_dns_correct?).and_return(false) + expect { nx.wait_swap_dns }.to nap 5 + end + + it "naps 5" do + expect(lantern_server).to receive(:is_dns_correct?).and_return(true) + expect(lantern_server).to receive(:run_query).and_raise "test" + expect { nx.wait_swap_dns }.to nap 5 + end + + it "hops to promote" do + expect(lantern_server).to receive(:is_dns_correct?).and_return(true) + expect(lantern_server).to receive(:run_query).and_return("1") + expect { nx.wait_swap_dns }.to hop("promote_server") + end + end end diff --git a/spec/prog/lantern/lantern_timeline_nexus_spec.rb b/spec/prog/lantern/lantern_timeline_nexus_spec.rb index 5a7029ef5..9edc3f6d0 100644 --- a/spec/prog/lantern/lantern_timeline_nexus_spec.rb +++ b/spec/prog/lantern/lantern_timeline_nexus_spec.rb @@ -187,6 +187,7 @@ describe "#destroy" do it "naps for one month" do expect(nx).to receive(:when_destroy_set?).and_yield + expect(Config).to receive(:backup_retention_days_after_deletion).and_return(30) expect { nx.destroy }.to nap(60 * 60 * 24 * 30) end From 94281d6644d212e04ebfd649784645dd8b1f2f80 Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Fri, 8 Nov 2024 19:39:01 +0400 Subject: [PATCH 06/13] improve dns switchover --- model/lantern/lantern_resource.rb | 2 +- model/lantern/lantern_server.rb | 13 +++++- prog/lantern/lantern_resource_nexus.rb | 30 +++++++++----- prog/lantern/lantern_server_nexus.rb | 7 +--- spec/model/lantern/lantern_resource_spec.rb | 4 +- spec/model/lantern/lantern_server_spec.rb | 27 ++++++++++++ .../lantern/lantern_resource_nexus_spec.rb | 41 +++++++++++++++---- .../prog/lantern/lantern_server_nexus_spec.rb | 12 +----- 8 files changed, 98 insertions(+), 38 deletions(-) diff --git a/model/lantern/lantern_resource.rb b/model/lantern/lantern_resource.rb index 800196663..a6bf79f9a 100644 --- a/model/lantern/lantern_resource.rb +++ b/model/lantern/lantern_resource.rb @@ -183,7 +183,7 @@ def sync_sequences_with_parent "SELECT setval('#{values[0]}.#{values[1]}', #{values[2]});" end - representative_server.run_query(statements, db: db) + representative_server.run_query(statements.join("\n"), db: db) end end diff --git a/model/lantern/lantern_server.rb b/model/lantern/lantern_server.rb index 690d82cc9..0a2d5a9f4 100644 --- a/model/lantern/lantern_server.rb +++ b/model/lantern/lantern_server.rb @@ -257,16 +257,27 @@ def autoresize_disk incr_update_storage_size end + def destroy_domain + cf_client = Dns::Cloudflare.new + cf_client.delete_dns_record(domain) + end + def swap_dns(other_server) strand.stack.first["domain"] = other_server.domain strand.modified!(:stack) strand.save_changes other_server.update(domain: nil) + + if domain + destroy_domain + update(domain: nil) + end + incr_add_domain end def is_dns_correct? - Resolv.getaddress(domain) == vm.sshable.host + domain && Resolv.getaddress(domain) == vm.sshable.host end def stop_container diff --git a/prog/lantern/lantern_resource_nexus.rb b/prog/lantern/lantern_resource_nexus.rb index 44e8b124b..e36c80543 100644 --- a/prog/lantern/lantern_resource_nexus.rb +++ b/prog/lantern/lantern_resource_nexus.rb @@ -230,14 +230,7 @@ def before_run nap 30 end - label def update_hosts - current_master = lantern_resource.parent.representative_server - current_master_domain = current_master.domain - new_master_domain = lantern_resource.representative_server.domain - - lantern_resource.representative_server.update(domain: current_master_domain) - current_master.update(domain: new_master_domain) - + label def finish_take_over # update display_states lantern_resource.update(display_state: nil) lantern_resource.parent.update(display_state: nil) @@ -248,6 +241,17 @@ def before_run hop_wait end + label def update_hosts + current_master = lantern_resource.parent.representative_server + current_master_domain = current_master.domain + new_master_domain = lantern_resource.representative_server.domain + + lantern_resource.representative_server.update(domain: current_master_domain) + current_master.update(domain: new_master_domain) + + hop_finish_take_over + end + label def wait_swap_ip ready = false begin @@ -290,13 +294,19 @@ def before_run label def switch_dns_with_parent lantern_resource.parent.representative_server.stop_container + lantern_resource.update(logical_replication: false) if lantern_resource.parent.representative_server.domain.nil? - hop_wait_servers + hop_finish_take_over end lantern_resource.representative_server.swap_dns(lantern_resource.parent.representative_server) - hop_wait_servers + hop_wait_switch_dns + end + + label def wait_switch_dns + nap 10 if !lantern_resource.representative_server.is_dns_correct? + hop_finish_take_over end label def destroy diff --git a/prog/lantern/lantern_server_nexus.rb b/prog/lantern/lantern_server_nexus.rb index dea8c0b60..c693a7f84 100644 --- a/prog/lantern/lantern_server_nexus.rb +++ b/prog/lantern/lantern_server_nexus.rb @@ -391,11 +391,6 @@ def before_run hop_setup_ssl end - def destroy_domain - cf_client = Dns::Cloudflare.new - cf_client.delete_dns_record(lantern_server.domain) - end - def add_domain_to_stack(domain) current_frame = strand.stack.first current_frame["domain"] = domain @@ -659,7 +654,7 @@ def remove_domain_from_stack strand.children.each { _1.destroy } if !lantern_server.domain.nil? - destroy_domain + lantern_server.destroy_domain end if lantern_server.primary? diff --git a/spec/model/lantern/lantern_resource_spec.rb b/spec/model/lantern/lantern_resource_spec.rb index e2a2e2c4d..527fb3332 100644 --- a/spec/model/lantern/lantern_resource_spec.rb +++ b/spec/model/lantern/lantern_resource_spec.rb @@ -312,8 +312,8 @@ ] statements_db2 = statements_db1 # identical statements for the test - expect(representative_server).to receive(:run_query).with(statements_db1, db: "db1") - expect(representative_server).to receive(:run_query).with(statements_db2, db: "db2") + expect(representative_server).to receive(:run_query).with(statements_db1.join("\n"), db: "db1") + expect(representative_server).to receive(:run_query).with(statements_db2.join("\n"), db: "db2") expect { lantern_resource.sync_sequences_with_parent }.not_to raise_error end diff --git a/spec/model/lantern/lantern_server_spec.rb b/spec/model/lantern/lantern_server_spec.rb index 6d4702736..adc4e040d 100644 --- a/spec/model/lantern/lantern_server_spec.rb +++ b/spec/model/lantern/lantern_server_spec.rb @@ -785,6 +785,23 @@ end describe "#swap_dns" do + it "swaps domains with another server and removes domain" do + frame = {} + other = instance_double(described_class) + strand = instance_double(Strand) + expect(lantern_server).to receive(:strand).and_return(strand).at_least(:once) + expect(lantern_server).to receive(:domain).and_return("old-domain").at_least(:once) + expect(lantern_server).to receive(:update).with(domain: nil) + expect(lantern_server.strand).to receive(:stack).and_return([frame]).at_least(:once) + expect(lantern_server.strand).to receive(:modified!).with(:stack) + expect(lantern_server.strand).to receive(:save_changes) + expect(lantern_server).to receive(:incr_add_domain) + expect(lantern_server).to receive(:destroy_domain) + expect(other).to receive(:domain).and_return("test") + expect(other).to receive(:update).with(domain: nil) + expect { lantern_server.swap_dns(other) }.not_to raise_error + end + it "swaps domains with another server" do frame = {} other = instance_double(described_class) @@ -829,4 +846,14 @@ expect { lantern_server.start_container }.not_to raise_error end end + + describe "#destroy_domain" do + it "destroys domain" do + cf_client = instance_double(Dns::Cloudflare) + expect(Dns::Cloudflare).to receive(:new).and_return(cf_client) + expect(lantern_server).to receive(:domain).and_return("example.com") + expect(cf_client).to receive(:delete_dns_record).with("example.com") + lantern_server.destroy_domain + end + end end diff --git a/spec/prog/lantern/lantern_resource_nexus_spec.rb b/spec/prog/lantern/lantern_resource_nexus_spec.rb index 74f799e08..864ca3ecf 100644 --- a/spec/prog/lantern/lantern_resource_nexus_spec.rb +++ b/spec/prog/lantern/lantern_resource_nexus_spec.rb @@ -359,11 +359,10 @@ end describe "#update_hosts" do - it "updates the domains of the current and new master, updates display states, and removes fork association" do + it "updates the domains of the current and new master" do parent = instance_double(LanternResource) current_master = instance_double(LanternServer, domain: "current-master-domain.com") new_master = instance_double(LanternServer, domain: "new-master-domain.com") - timeline = instance_double(LanternTimeline) expect(lantern_resource).to receive(:parent).and_return(parent).at_least(:once) expect(parent).to receive(:representative_server).and_return(current_master).at_least(:once) @@ -371,6 +370,17 @@ expect(new_master).to receive(:update).with(domain: "current-master-domain.com") expect(current_master).to receive(:update).with(domain: "new-master-domain.com") + expect { nx.update_hosts }.to hop("finish_take_over") + end + end + + describe "#finish_take_over" do + it "updates display states, and removes fork association" do + parent = instance_double(LanternResource) + timeline = instance_double(LanternTimeline) + + expect(lantern_resource).to receive(:parent).and_return(parent).at_least(:once) + expect(lantern_resource).to receive(:update).with(display_state: nil) expect(parent).to receive(:update).with(display_state: nil) @@ -378,7 +388,7 @@ expect(lantern_resource).to receive(:timeline).and_return(timeline) expect(timeline).to receive(:update).with(parent_id: nil) - expect { nx.update_hosts }.to hop("wait") + expect { nx.finish_take_over }.to hop("wait") end end @@ -408,19 +418,36 @@ end describe "#switch_dns_with_parent" do - it "hops to wait_servers" do + it "hops to finish_take_over" do parent = instance_double(LanternResource, representative_server: instance_double(LanternServer, domain: nil)) expect(lantern_resource).to receive(:parent).and_return(parent).at_least(:once) expect(lantern_resource.parent.representative_server).to receive(:stop_container) - expect { nx.switch_dns_with_parent }.to hop("wait_servers") + expect { nx.switch_dns_with_parent }.to hop("finish_take_over") end - it "switches dns with parent and hop to wait_servers" do + it "switches dns with parent and hop to wait_switch_dns" do parent = instance_double(LanternResource, representative_server: instance_double(LanternServer, domain: "test-domain")) expect(lantern_resource).to receive(:parent).and_return(parent).at_least(:once) expect(lantern_resource.parent.representative_server).to receive(:stop_container) expect(lantern_resource.representative_server).to receive(:swap_dns).with(parent.representative_server) - expect { nx.switch_dns_with_parent }.to hop("wait_servers") + expect(lantern_resource).to receive(:update).with(logical_replication: false) + expect { nx.switch_dns_with_parent }.to hop("wait_switch_dns") + end + end + + describe "#wait_switch_dns" do + it "naps if dns is not ready" do + representative_server = instance_double(LanternServer) + expect(lantern_resource).to receive(:representative_server).and_return(representative_server).at_least(:once) + expect(representative_server).to receive(:is_dns_correct?).and_return(false) + expect { nx.wait_switch_dns }.to nap 10 + end + + it "hops to finish_take_over" do + representative_server = instance_double(LanternServer) + expect(lantern_resource).to receive(:representative_server).and_return(representative_server).at_least(:once) + expect(representative_server).to receive(:is_dns_correct?).and_return(true) + expect { nx.wait_switch_dns }.to hop("finish_take_over") end end end diff --git a/spec/prog/lantern/lantern_server_nexus_spec.rb b/spec/prog/lantern/lantern_server_nexus_spec.rb index 5d65eb600..e90b69ec5 100644 --- a/spec/prog/lantern/lantern_server_nexus_spec.rb +++ b/spec/prog/lantern/lantern_server_nexus_spec.rb @@ -614,16 +614,6 @@ end end - describe "#destroy_domain" do - it "destroys domain" do - cf_client = instance_double(Dns::Cloudflare) - expect(Dns::Cloudflare).to receive(:new).and_return(cf_client) - expect(lantern_server).to receive(:domain).and_return("example.com") - expect(cf_client).to receive(:delete_dns_record).with("example.com") - nx.destroy_domain - end - end - describe "#add_domain_to_stack" do it "adds domain to current frame" do domain = "db.lantern.dev" @@ -831,7 +821,7 @@ expect(lantern_server).to receive(:primary?).and_return(true) expect(lantern_server.timeline).to receive(:incr_destroy).at_least(:once) expect(lantern_server).to receive(:domain).and_return("example.com") - expect(nx).to receive(:destroy_domain) + expect(lantern_server).to receive(:destroy_domain) expect(lantern_server).to receive(:destroy) expect { nx.destroy }.to exit({"msg" => "lantern server was deleted"}) end From fc6a25b06923ce3d44a037d3d2c9b507e2faaf5b Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Fri, 8 Nov 2024 20:25:35 +0400 Subject: [PATCH 07/13] reduce wait time when stopping container on switchover --- model/lantern/lantern_server.rb | 4 ++-- prog/lantern/lantern_resource_nexus.rb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/model/lantern/lantern_server.rb b/model/lantern/lantern_server.rb index 0a2d5a9f4..b0f5dc809 100644 --- a/model/lantern/lantern_server.rb +++ b/model/lantern/lantern_server.rb @@ -280,8 +280,8 @@ def is_dns_correct? domain && Resolv.getaddress(domain) == vm.sshable.host end - def stop_container - vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} down -t 60 || true") + def stop_container(timeout=60) + vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} down -t #{timeout} || true") end def start_container diff --git a/prog/lantern/lantern_resource_nexus.rb b/prog/lantern/lantern_resource_nexus.rb index e36c80543..b3fe79d84 100644 --- a/prog/lantern/lantern_resource_nexus.rb +++ b/prog/lantern/lantern_resource_nexus.rb @@ -293,7 +293,7 @@ def before_run end label def switch_dns_with_parent - lantern_resource.parent.representative_server.stop_container + lantern_resource.parent.representative_server.stop_container(1) lantern_resource.update(logical_replication: false) if lantern_resource.parent.representative_server.domain.nil? From 7ec2605257525a82e930dbc5bb1694b67c62a980 Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Mon, 11 Nov 2024 14:09:39 +0400 Subject: [PATCH 08/13] correctly drop replication slot and publications after replica destroyed --- model/lantern/lantern_resource.rb | 8 +++-- model/lantern/lantern_server.rb | 2 +- prog/lantern/lantern_resource_nexus.rb | 17 +++++++--- prog/lantern/lantern_server_nexus.rb | 7 +---- spec/model/lantern/lantern_resource_spec.rb | 27 ++++++++++------ .../lantern/lantern_resource_nexus_spec.rb | 31 +++++++++++++++---- .../prog/lantern/lantern_server_nexus_spec.rb | 3 -- 7 files changed, 64 insertions(+), 31 deletions(-) diff --git a/model/lantern/lantern_resource.rb b/model/lantern/lantern_resource.rb index a6bf79f9a..39a1fbce7 100644 --- a/model/lantern/lantern_resource.rb +++ b/model/lantern/lantern_resource.rb @@ -168,6 +168,10 @@ def create_publication(name) representative_server.run_query_all("CREATE PUBLICATION #{name} FOR ALL TABLES") end + def delete_publication(name) + representative_server.run_query_all("DROP PUBLICATION IF EXISTS #{name}") + end + def sync_sequences_with_parent representative_server.list_all_databases.each do |db| res = parent.representative_server.run_query(" @@ -207,8 +211,8 @@ def create_and_enable_subscription end end - def disable_logical_subscription - representative_server.run_query_all("ALTER SUBSCRIPTION sub_#{ubid} DISABLE") + def delete_logical_subscription(name) + representative_server.run_query_all("DROP SUBSCRIPTION IF EXISTS #{name}") end def create_logical_replica(lantern_version: nil, extras_version: nil, minor_version: nil, pg_upgrade: nil) diff --git a/model/lantern/lantern_server.rb b/model/lantern/lantern_server.rb index b0f5dc809..ff9be6bf4 100644 --- a/model/lantern/lantern_server.rb +++ b/model/lantern/lantern_server.rb @@ -280,7 +280,7 @@ def is_dns_correct? domain && Resolv.getaddress(domain) == vm.sshable.host end - def stop_container(timeout=60) + def stop_container(timeout = 60) vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} down -t #{timeout} || true") end diff --git a/prog/lantern/lantern_resource_nexus.rb b/prog/lantern/lantern_resource_nexus.rb index b3fe79d84..2e172d4a3 100644 --- a/prog/lantern/lantern_resource_nexus.rb +++ b/prog/lantern/lantern_resource_nexus.rb @@ -270,7 +270,7 @@ def before_run label def swap_leaders_with_parent decr_swap_leaders_with_parent lantern_resource.parent.set_to_readonly - lantern_resource.disable_logical_subscription + lantern_resource.delete_logical_subscription("sub_#{lantern_resource.ubid}") lantern_resource.sync_sequences_with_parent lantern_resource.representative_server.vm.swap_ip(lantern_resource.parent.representative_server.vm) hop_wait_swap_ip @@ -279,11 +279,11 @@ def before_run label def switchover_with_parent decr_switchover_with_parent lantern_resource.parent.set_to_readonly - hop_disable_logical_subscription + hop_delete_logical_subscription end - label def disable_logical_subscription - lantern_resource.disable_logical_subscription + label def delete_logical_subscription + lantern_resource.delete_logical_subscription("sub_#{lantern_resource.ubid}") hop_sync_sequences_with_parent end @@ -314,6 +314,15 @@ def before_run decr_destroy + if lantern_resource.parent + begin + lantern_resource.delete_logical_subscription("sub_#{lantern_resource.ubid}") + lantern_resource.parent.delete_publication("pub_#{lantern_resource.ubid}") + lantern_resource.parent.delete_replication_slot("slot_#{lantern_resource.ubid}") + rescue + end + end + strand.children.each { _1.destroy } unless servers.empty? servers.each(&:incr_destroy) diff --git a/prog/lantern/lantern_server_nexus.rb b/prog/lantern/lantern_server_nexus.rb index c693a7f84..0f74bc15b 100644 --- a/prog/lantern/lantern_server_nexus.rb +++ b/prog/lantern/lantern_server_nexus.rb @@ -172,7 +172,6 @@ def before_run nap 30 if lag.empty? || lag.to_i > 80 * 1024 * 1024 # 80 MB or ~5 WAL files lantern_server.update(synchronization_status: "ready") - lantern_server.resource.delete_replication_slot(lantern_server.ubid) if !lantern_server.domain && !lantern_server.resource.representative_server.domain.nil? add_domain_to_stack(lantern_server.resource.representative_server.domain) @@ -659,12 +658,8 @@ def remove_domain_from_stack if lantern_server.primary? lantern_server.timeline.incr_destroy - else - begin - lantern_server.resource.delete_replication_slot(lantern_server.ubid) - rescue - end end + lantern_server.destroy vm.incr_destroy diff --git a/spec/model/lantern/lantern_resource_spec.rb b/spec/model/lantern/lantern_resource_spec.rb index 527fb3332..81c41ca68 100644 --- a/spec/model/lantern/lantern_resource_spec.rb +++ b/spec/model/lantern/lantern_resource_spec.rb @@ -213,6 +213,24 @@ end end + describe "#delete_publication" do + it "drops replication slot" do + representative_server = instance_double(LanternServer) + expect(lantern_resource).to receive(:representative_server).and_return(representative_server).at_least(:once) + expect(lantern_resource.representative_server).to receive(:run_query_all).with("DROP PUBLICATION IF EXISTS test") + expect { lantern_resource.delete_publication("test") }.not_to raise_error + end + end + + describe "#delete_logical_subscription" do + it "drops subscription" do + representative_server = instance_double(LanternServer) + expect(lantern_resource).to receive(:representative_server).and_return(representative_server).at_least(:once) + expect(lantern_resource.representative_server).to receive(:run_query_all).with("DROP SUBSCRIPTION IF EXISTS test") + expect { lantern_resource.delete_logical_subscription("test") }.not_to raise_error + end + end + describe "#create_publication" do it "creates new publication" do representative_server = instance_double(LanternServer) @@ -235,15 +253,6 @@ end end - describe "#disable_logical_subscription" do - it "disables subscription" do - representative_server = instance_double(LanternServer) - expect(lantern_resource).to receive(:representative_server).and_return(representative_server).at_least(:once) - expect(lantern_resource.representative_server).to receive(:run_query_all).with("ALTER SUBSCRIPTION sub_#{lantern_resource.ubid} DISABLE") - expect { lantern_resource.disable_logical_subscription }.not_to raise_error - end - end - describe "#create_logical_replica" do it "create logical replica with current version" do representative_server = instance_double(LanternServer, diff --git a/spec/prog/lantern/lantern_resource_nexus_spec.rb b/spec/prog/lantern/lantern_resource_nexus_spec.rb index 864ca3ecf..347035842 100644 --- a/spec/prog/lantern/lantern_resource_nexus_spec.rb +++ b/spec/prog/lantern/lantern_resource_nexus_spec.rb @@ -298,6 +298,25 @@ expect { nx.destroy }.to exit({"msg" => "lantern resource is deleted"}) end + it "deletes replication slot and publications on parent" do + expect(lantern_resource.servers).to all(receive(:incr_destroy)) + expect { nx.destroy }.to nap(5) + + parent_reosurce = instance_double(LanternResource) + expect(lantern_resource).to receive(:ubid).and_return("test-ubid").at_least(:once) + expect(parent_reosurce).to receive(:delete_replication_slot).with("slot_#{lantern_resource.ubid}") + expect(parent_reosurce).to receive(:delete_publication).with("pub_#{lantern_resource.ubid}") + expect(lantern_resource).to receive(:delete_logical_subscription).with("sub_#{lantern_resource.ubid}") + expect(lantern_resource).to receive(:parent).and_return(parent_reosurce).at_least(:once) + expect(lantern_resource).to receive(:servers).and_return([]) + expect(lantern_resource).to receive(:dissociate_with_project) + expect(lantern_resource).to receive(:destroy) + expect(lantern_resource).to receive(:doctor).and_return(nil) + expect(lantern_resource).to receive(:service_account_name).and_return(nil) + + expect { nx.destroy }.to exit({"msg" => "lantern resource is deleted"}) + end + it "triggers server deletion and deletes doctor" do expect(lantern_resource.servers).to all(receive(:incr_destroy)) expect { nx.destroy }.to nap(5) @@ -332,7 +351,7 @@ vm = instance_double(GcpVm) expect(parent).to receive(:representative_server).and_return(representative_server) expect(lantern_resource).to receive(:representative_server).and_return(representative_server).at_least(:once) - expect(lantern_resource).to receive(:disable_logical_subscription) + expect(lantern_resource).to receive(:delete_logical_subscription).with("sub_#{lantern_resource.ubid}") expect(lantern_resource).to receive(:sync_sequences_with_parent) expect(representative_server).to receive(:vm).and_return(vm).at_least(:once) expect(vm).to receive(:swap_ip) @@ -399,14 +418,14 @@ expect(parent).to receive(:set_to_readonly) expect(nx).to receive(:decr_switchover_with_parent) - expect { nx.switchover_with_parent }.to hop("disable_logical_subscription") + expect { nx.switchover_with_parent }.to hop("delete_logical_subscription") end end - describe "#disable_logical_subscription" do - it "disables susbcription and hop" do - expect(lantern_resource).to receive(:disable_logical_subscription) - expect { nx.disable_logical_subscription }.to hop("sync_sequences_with_parent") + describe "#delete_logical_subscription" do + it "deletes susbcription and hop" do + expect(lantern_resource).to receive(:delete_logical_subscription).with("sub_#{lantern_resource.ubid}") + expect { nx.delete_logical_subscription }.to hop("sync_sequences_with_parent") end end diff --git a/spec/prog/lantern/lantern_server_nexus_spec.rb b/spec/prog/lantern/lantern_server_nexus_spec.rb index e90b69ec5..b26137048 100644 --- a/spec/prog/lantern/lantern_server_nexus_spec.rb +++ b/spec/prog/lantern/lantern_server_nexus_spec.rb @@ -328,7 +328,6 @@ expect(lantern_server).to receive(:update).with({synchronization_status: "ready"}) expect(lantern_server.resource).to receive(:representative_server).and_return(leader).at_least(:once) expect(lantern_server.resource).to receive(:ha_type).and_return(LanternResource::HaType::SYNC) - expect(lantern_server.resource).to receive(:delete_replication_slot).with(lantern_server.ubid) expect(leader).to receive(:run_query).and_return((1 * 1024 * 1024).to_s) expect { nx.wait_catch_up }.to hop("wait_synchronization") end @@ -338,7 +337,6 @@ expect(lantern_server).to receive(:update).with({synchronization_status: "ready"}) expect(lantern_server.resource).to receive(:representative_server).and_return(leader).at_least(:once) expect(lantern_server.resource).to receive(:ha_type).and_return(LanternResource::HaType::ASYNC) - expect(lantern_server.resource).to receive(:delete_replication_slot).with(lantern_server.ubid) expect(leader).to receive(:run_query).and_return((1 * 1024 * 1024).to_s) expect { nx.wait_catch_up }.to hop("wait") end @@ -812,7 +810,6 @@ expect(lantern_server).to receive(:primary?).and_return(false) expect(lantern_server).to receive(:domain).and_return(nil) expect(lantern_server).to receive(:destroy) - expect(lantern_server.resource).to receive(:delete_replication_slot).with(lantern_server.ubid) expect { nx.destroy }.to exit({"msg" => "lantern server was deleted"}) end From e9c929e8d84803aa5a779e641bd3903f2144b956 Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Mon, 11 Nov 2024 15:31:15 +0400 Subject: [PATCH 09/13] wait for logical replication lag to be 0 before doing switchover --- model/lantern/lantern_resource.rb | 4 ++++ prog/lantern/lantern_resource_nexus.rb | 5 +++++ spec/model/lantern/lantern_resource_spec.rb | 9 +++++++++ .../lantern/lantern_resource_nexus_spec.rb | 20 ++++++++++++++++++- 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/model/lantern/lantern_resource.rb b/model/lantern/lantern_resource.rb index 39a1fbce7..435f648d8 100644 --- a/model/lantern/lantern_resource.rb +++ b/model/lantern/lantern_resource.rb @@ -109,6 +109,10 @@ def delete_replication_slot(name) representative_server.run_query("SELECT pg_drop_replication_slot(slot_name) FROM pg_replication_slots WHERE slot_name='#{name}';") end + def get_logical_replication_lag(slot_name) + representative_server.run_query("SELECT (pg_current_wal_lsn() - confirmed_flush_lsn) FROM pg_catalog.pg_replication_slots WHERE slot_name = '#{slot_name}'").chomp.to_i + end + def create_ddl_log commands = < Date: Mon, 11 Nov 2024 17:27:36 +0400 Subject: [PATCH 10/13] setup ssl on fork to not waste time on switchover --- model/lantern/lantern_server.rb | 14 +++++ prog/lantern/lantern_resource_nexus.rb | 6 ++ prog/lantern/lantern_server_nexus.rb | 28 +++------ rhizome/lantern/lib/common.rb | 37 +++++++---- spec/model/lantern/lantern_server_spec.rb | 28 +++++++++ .../lantern/lantern_resource_nexus_spec.rb | 13 ++++ .../prog/lantern/lantern_server_nexus_spec.rb | 63 +++++++++++-------- 7 files changed, 132 insertions(+), 57 deletions(-) diff --git a/model/lantern/lantern_server.rb b/model/lantern/lantern_server.rb index ff9be6bf4..3966d77a4 100644 --- a/model/lantern/lantern_server.rb +++ b/model/lantern/lantern_server.rb @@ -262,6 +262,20 @@ def destroy_domain cf_client.delete_dns_record(domain) end + def add_domain_to_stack(domain) + current_frame = strand.stack.first + current_frame["domain"] = domain + strand.modified!(:stack) + strand.save_changes + end + + def remove_domain_from_stack + current_frame = strand.stack.first + current_frame.delete("domain") + strand.modified!(:stack) + strand.save_changes + end + def swap_dns(other_server) strand.stack.first["domain"] = other_server.domain strand.modified!(:stack) diff --git a/prog/lantern/lantern_resource_nexus.rb b/prog/lantern/lantern_resource_nexus.rb index ba28cf6d3..5f00e93df 100644 --- a/prog/lantern/lantern_resource_nexus.rb +++ b/prog/lantern/lantern_resource_nexus.rb @@ -311,6 +311,12 @@ def before_run label def wait_switch_dns nap 10 if !lantern_resource.representative_server.is_dns_correct? + begin + connection = Sequel.connect(lantern_resource.connection_string) + connection["SELECT 1"].first + rescue + nap 10 + end hop_finish_take_over end diff --git a/prog/lantern/lantern_server_nexus.rb b/prog/lantern/lantern_server_nexus.rb index 0f74bc15b..2815d5df0 100644 --- a/prog/lantern/lantern_server_nexus.rb +++ b/prog/lantern/lantern_server_nexus.rb @@ -174,7 +174,7 @@ def before_run lantern_server.update(synchronization_status: "ready") if !lantern_server.domain && !lantern_server.resource.representative_server.domain.nil? - add_domain_to_stack(lantern_server.resource.representative_server.domain) + lantern_server.add_domain_to_stack(lantern_server.resource.representative_server.domain) incr_setup_ssl end @@ -227,6 +227,12 @@ def before_run incr_run_pg_upgrade end + if lantern_server.resource.logical_replication && !lantern_server.resource.parent.representative_server.domain.nil? + # prepare for fast switchover + lantern_server.add_domain_to_stack(lantern_server.resource.parent.representative_server.domain) + incr_setup_ssl + end + hop_wait_timeline_available end @@ -383,33 +389,19 @@ def before_run lantern_server.update(domain: frame["domain"]) - remove_domain_from_stack + lantern_server.remove_domain_from_stack decr_add_domain register_deadline(:wait, 5 * 60) hop_setup_ssl end - def add_domain_to_stack(domain) - current_frame = strand.stack.first - current_frame["domain"] = domain - strand.modified!(:stack) - strand.save_changes - end - - def remove_domain_from_stack - current_frame = strand.stack.first - current_frame.delete("domain") - strand.modified!(:stack) - strand.save_changes - end - label def setup_ssl case vm.sshable.cmd("common/bin/daemonizer --check setup_ssl") when "Succeeded" vm.sshable.cmd("common/bin/daemonizer --clean setup_ssl") decr_setup_ssl - remove_domain_from_stack + lantern_server.remove_domain_from_stack hop_wait_db_available when "NotStarted" vm.sshable.cmd("common/bin/daemonizer 'sudo lantern/bin/setup_ssl' setup_ssl", stdin: JSON.generate({ @@ -423,7 +415,7 @@ def remove_domain_from_stack Clog.emit("Lantern SSL Setup Failed for #{lantern_server.resource.name}") { {logs: logs, name: lantern_server.resource.name, lantern_server: lantern_server.id} } Prog::PageNexus.assemble_with_logs("Lantern SSL Setup Failed for #{lantern_server.resource.name}", [lantern_server.resource.ubid, lantern_server.ubid], logs, "error", "LanternSSLSetupFailed", lantern_server.ubid) vm.sshable.cmd("common/bin/daemonizer --clean setup_ssl") - remove_domain_from_stack + lantern_server.remove_domain_from_stack decr_setup_ssl hop_wait end diff --git a/rhizome/lantern/lib/common.rb b/rhizome/lantern/lib/common.rb index b3b192eca..83af0caa3 100755 --- a/rhizome/lantern/lib/common.rb +++ b/rhizome/lantern/lib/common.rb @@ -78,21 +78,34 @@ def append_env(env_arr) File.open($env_file, "w") { |f| combined_env.each { |key, value| f.puts "#{key}=#{value}" } } end +def tls_already_configured?(domain) + is_domain_configured = !r("test -f /root/.acme.sh/acme.sh && /root/.acme.sh/acme.sh --list -d #{domain}").chomp.empty? + + if !is_domain_configured + return false + end + + !r("(test -f #{$datadir}/server.key && test -f #{$datadir}/server.crt && echo 1) || echo ''").chomp.empty? +end + def configure_tls(domain, email, dns_token, dns_zone_id, provider) puts "Configuring TLS for domain #{domain}" - r "curl -s https://get.acme.sh | sh -s email=#{email}" - env = if provider == "dns_cf" - "CF_Token='#{dns_token}' CF_Zone_ID='#{dns_zone_id}'" - else - "GOOGLEDOMAINS_ACCESS_TOKEN='#{dns_token}'" - end - r "#{env} /root/.acme.sh/acme.sh --server letsencrypt --issue --dns #{provider} -d #{domain}" - reload_cmd = "sudo docker compose -f #{$compose_file} exec postgresql psql -U postgres -c 'SELECT pg_reload_conf()' && sudo docker compose -f #{$compose_file} exec postgresql psql -p6432 -U postgres pgbouncer -c RELOAD" - r "/root/.acme.sh/acme.sh --install-cert -d #{domain} --key-file #{$datadir}/server.key --fullchain-file #{$datadir}/server.crt --reloadcmd \"#{reload_cmd}\"" - r "sudo chown 1001:1001 #{$datadir}/server.key" - r "sudo chown 1001:1001 #{$datadir}/server.crt" - r "sudo chmod 600 #{$datadir}/server.key" + if !tls_already_configured? + r "curl -s https://get.acme.sh | sh -s email=#{email}" + env = if provider == "dns_cf" + "CF_Token='#{dns_token}' CF_Zone_ID='#{dns_zone_id}'" + else + "GOOGLEDOMAINS_ACCESS_TOKEN='#{dns_token}'" + end + + r "#{env} /root/.acme.sh/acme.sh --server letsencrypt --issue --dns #{provider} -d #{domain}" + reload_cmd = "sudo docker compose -f #{$compose_file} exec postgresql psql -U postgres -c 'SELECT pg_reload_conf()' && sudo docker compose -f #{$compose_file} exec postgresql psql -p6432 -U postgres pgbouncer -c RELOAD" + r "/root/.acme.sh/acme.sh --install-cert -d #{domain} --key-file #{$datadir}/server.key --fullchain-file #{$datadir}/server.crt --reloadcmd \"#{reload_cmd}\"" + r "sudo chown 1001:1001 #{$datadir}/server.key" + r "sudo chown 1001:1001 #{$datadir}/server.crt" + r "sudo chmod 600 #{$datadir}/server.key" + end append_env([ ["POSTGRESQL_ENABLE_TLS", "yes"], diff --git a/spec/model/lantern/lantern_server_spec.rb b/spec/model/lantern/lantern_server_spec.rb index adc4e040d..61f3d8843 100644 --- a/spec/model/lantern/lantern_server_spec.rb +++ b/spec/model/lantern/lantern_server_spec.rb @@ -856,4 +856,32 @@ lantern_server.destroy_domain end end + + describe "#add_domain_to_stack" do + it "adds domain to current frame" do + domain = "db.lantern.dev" + frame = {} + strand = instance_double(Strand) + expect(lantern_server).to receive(:strand).and_return(strand).at_least(:once) + expect(strand).to receive(:stack).and_return([frame]).at_least(:once) + expect(frame).to receive(:[]=).with("domain", domain) + expect(strand).to receive(:modified!).with(:stack) + expect(strand).to receive(:save_changes) + expect { lantern_server.add_domain_to_stack(domain) }.not_to raise_error + end + end + + describe "#remove_domain_from_stack" do + it "removes domain from current frame" do + domain = "db.lantern.dev" + frame = {"domain" => domain} + strand = instance_double(Strand) + expect(lantern_server).to receive(:strand).and_return(strand).at_least(:once) + expect(strand).to receive(:stack).and_return([frame]).at_least(:once) + expect(frame).to receive(:delete).with("domain") + expect(strand).to receive(:modified!).with(:stack) + expect(strand).to receive(:save_changes) + expect { lantern_server.remove_domain_from_stack }.not_to raise_error + end + end end diff --git a/spec/prog/lantern/lantern_resource_nexus_spec.rb b/spec/prog/lantern/lantern_resource_nexus_spec.rb index 3d88a062c..edc63ae70 100644 --- a/spec/prog/lantern/lantern_resource_nexus_spec.rb +++ b/spec/prog/lantern/lantern_resource_nexus_spec.rb @@ -480,8 +480,21 @@ expect { nx.wait_switch_dns }.to nap 10 end + it "waits if db is not ready" do + representative_server = instance_double(LanternServer) + expect(Sequel).to receive(:connect).and_return(DB) + expect(DB).to receive(:[]).with("SELECT 1").and_raise + expect(lantern_resource).to receive(:representative_server).and_return(representative_server).at_least(:once) + expect(representative_server).to receive(:is_dns_correct?).and_return(true) + expect { nx.wait_switch_dns }.to nap 10 + end + it "hops to finish_take_over" do representative_server = instance_double(LanternServer) + expect(Sequel).to receive(:connect).and_return(DB) + res = instance_double(Sequel::Dataset) + expect(res).to receive(:first) + expect(DB).to receive(:[]).with("SELECT 1").and_return(res) expect(lantern_resource).to receive(:representative_server).and_return(representative_server).at_least(:once) expect(representative_server).to receive(:is_dns_correct?).and_return(true) expect { nx.wait_switch_dns }.to hop("finish_take_over") diff --git a/spec/prog/lantern/lantern_server_nexus_spec.rb b/spec/prog/lantern/lantern_server_nexus_spec.rb index b26137048..522bc5139 100644 --- a/spec/prog/lantern/lantern_server_nexus_spec.rb +++ b/spec/prog/lantern/lantern_server_nexus_spec.rb @@ -322,7 +322,7 @@ it "hops to wait_synchronization" do leader = instance_double(LanternServer, domain: "db.lantern.dev") - expect(nx).to receive(:add_domain_to_stack).with(leader.domain) + expect(lantern_server).to receive(:add_domain_to_stack).with(leader.domain) expect(nx).to receive(:incr_setup_ssl) expect(lantern_server).to receive(:domain).and_return(nil) expect(lantern_server).to receive(:update).with({synchronization_status: "ready"}) @@ -368,6 +368,7 @@ describe "#wait_recovery_completion" do it "hop to wait if recovery finished" do expect(lantern_server.resource).to receive(:allow_timeline_access_to_bucket) + expect(lantern_server.resource).to receive(:logical_replication).and_return(false) expect(lantern_server).to receive(:run_query).and_return("t", "paused", "t", lantern_server.lantern_version, lantern_server.extras_version) expect(lantern_server).to receive(:timeline_id=) expect(lantern_server).to receive(:timeline_access=).with("push") @@ -378,6 +379,7 @@ it "hop to wait if not in recovery" do expect(lantern_server.resource).to receive(:allow_timeline_access_to_bucket) + expect(lantern_server.resource).to receive(:logical_replication).and_return(false) expect(lantern_server).to receive(:run_query).and_return("f", lantern_server.lantern_version, lantern_server.extras_version) expect(lantern_server).to receive(:timeline_id=) expect(lantern_server).to receive(:timeline_access=).with("push") @@ -388,6 +390,7 @@ it "do not update extension on upgrade" do expect(lantern_server.resource).to receive(:allow_timeline_access_to_bucket) + expect(lantern_server.resource).to receive(:logical_replication).and_return(false) expect(lantern_server).to receive(:run_query).and_return("f") expect(lantern_server).to receive(:timeline_id=) expect(lantern_server).to receive(:timeline_access=).with("push") @@ -398,6 +401,7 @@ end it "update extension on version mismatch" do + expect(lantern_server.resource).to receive(:logical_replication).and_return(false) expect(lantern_server.resource).to receive(:allow_timeline_access_to_bucket) expect(lantern_server).to receive(:run_query).and_return("t", "paused", "t", "0.2.4", "0.1.4") expect(lantern_server).to receive(:timeline_id=) @@ -411,8 +415,36 @@ expect { nx.wait_recovery_completion }.to hop("wait_timeline_available") end + it "does not setup ssl if parent has no domain" do + parent_reosurce = instance_double(LanternResource) + representative_server = instance_double(LanternServer) + expect(parent_reosurce).to receive(:representative_server).and_return(representative_server).at_least(:once) + expect(representative_server).to receive(:domain).and_return(nil).at_least(:once) + expect(lantern_server.resource).to receive(:parent).and_return(parent_reosurce).at_least(:once) + expect(lantern_server.resource).to receive(:logical_replication).and_return(true) + expect(lantern_server.resource).to receive(:allow_timeline_access_to_bucket) + expect(lantern_server).to receive(:run_query).and_return("f") + expect(lantern_server).to receive(:timeline_id=) + expect(lantern_server).to receive(:timeline_access=).with("push") + expect(lantern_server).to receive(:save_changes) + frame = {"pg_upgrade" => {"lantern_version" => "0.5.0", "extras_version" => "0.5.0", "minor_version" => "1", "pg_version" => 17}} + expect(nx.strand).to receive(:stack).and_return([frame]).at_least(:once) + expect(lantern_server.resource).to receive(:version_upgrade).and_return(true) + expect(Prog::Lantern::LanternTimelineNexus).to receive(:assemble).and_return(instance_double(Strand, id: "104b0033-b3f6-8214-ae27-0cd3cef18ce5")) + expect(nx).to receive(:incr_run_pg_upgrade) + expect { nx.wait_recovery_completion }.to hop("wait_timeline_available") + end + it "run pg_upgrade if frame has pg_upgrade info" do + parent_reosurce = instance_double(LanternResource) + representative_server = instance_double(LanternServer) + expect(parent_reosurce).to receive(:representative_server).and_return(representative_server).at_least(:once) + expect(representative_server).to receive(:domain).and_return("example.com").at_least(:once) + expect(lantern_server.resource).to receive(:parent).and_return(parent_reosurce).at_least(:once) + expect(lantern_server.resource).to receive(:logical_replication).and_return(true) expect(lantern_server.resource).to receive(:allow_timeline_access_to_bucket) + expect(lantern_server).to receive(:add_domain_to_stack).with(parent_reosurce.representative_server.domain) + expect(nx).to receive(:incr_setup_ssl) expect(lantern_server).to receive(:run_query).and_return("f") expect(lantern_server).to receive(:timeline_id=) expect(lantern_server).to receive(:timeline_access=).with("push") @@ -607,35 +639,12 @@ expect(nx).to receive(:frame).and_return({"domain" => "test.lantern.dev"}).at_least(:once) expect(lantern_server).to receive(:update).with({domain: "test.lantern.dev"}) + expect(lantern_server).to receive(:remove_domain_from_stack) expect(cf_client).to receive(:upsert_dns_record).with("test.lantern.dev", "1.1.1.1") expect { nx.add_domain }.to hop("setup_ssl") end end - describe "#add_domain_to_stack" do - it "adds domain to current frame" do - domain = "db.lantern.dev" - frame = {} - expect(nx.strand).to receive(:stack).and_return([frame]).at_least(:once) - expect(frame).to receive(:[]=).with("domain", domain) - expect(nx.strand).to receive(:modified!).with(:stack) - expect(nx.strand).to receive(:save_changes) - expect { nx.add_domain_to_stack(domain) }.not_to raise_error - end - end - - describe "#remove_domain_from_stack" do - it "removes domain from current frame" do - domain = "db.lantern.dev" - frame = {"domain" => domain} - expect(nx.strand).to receive(:stack).and_return([frame]).at_least(:once) - expect(frame).to receive(:delete).with("domain") - expect(nx.strand).to receive(:modified!).with(:stack) - expect(nx.strand).to receive(:save_changes) - expect { nx.remove_domain_from_stack }.not_to raise_error - end - end - describe "#setup_ssl" do it "calls setup ssl with domain from frame and naps" do expect(nx).to receive(:frame).and_return({"domain" => "db.lantern.dev"}) @@ -675,7 +684,7 @@ it "sets up ssl and hops to wait_db_available" do expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --check setup_ssl").and_return("Succeeded") expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --clean setup_ssl") - expect(nx).to receive(:remove_domain_from_stack) + expect(lantern_server).to receive(:remove_domain_from_stack) expect { nx.setup_ssl }.to hop("wait_db_available") end @@ -685,7 +694,7 @@ logs = {"stdout" => "", "stderr" => "oom"} expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --logs setup_ssl").and_return(JSON.generate(logs)) expect(lantern_server.vm.sshable).to receive(:cmd).with("common/bin/daemonizer --clean setup_ssl") - expect(nx).to receive(:remove_domain_from_stack) + expect(lantern_server).to receive(:remove_domain_from_stack) expect(Prog::PageNexus).to receive(:assemble_with_logs).with("Lantern SSL Setup Failed for test", [lantern_server.resource.ubid, lantern_server.ubid], logs, "error", "LanternSSLSetupFailed", lantern_server.ubid) expect { nx.setup_ssl }.to hop("wait") end From 394b1a114cd370e7537ba06a34434c2196196189 Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Mon, 11 Nov 2024 17:51:42 +0400 Subject: [PATCH 11/13] fix ssl cert check issue --- rhizome/lantern/lib/common.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rhizome/lantern/lib/common.rb b/rhizome/lantern/lib/common.rb index 83af0caa3..2bbc11827 100755 --- a/rhizome/lantern/lib/common.rb +++ b/rhizome/lantern/lib/common.rb @@ -91,7 +91,7 @@ def tls_already_configured?(domain) def configure_tls(domain, email, dns_token, dns_zone_id, provider) puts "Configuring TLS for domain #{domain}" - if !tls_already_configured? + if !tls_already_configured?(domain) r "curl -s https://get.acme.sh | sh -s email=#{email}" env = if provider == "dns_cf" "CF_Token='#{dns_token}' CF_Zone_ID='#{dns_zone_id}'" From bf1aab5a6f78883997bae37f21724805dc7be2ff Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Mon, 11 Nov 2024 19:31:09 +0400 Subject: [PATCH 12/13] fixes and improvements on adding domain and ssl setup --- model/lantern/lantern_resource.rb | 7 +++++- model/lantern/lantern_server.rb | 16 +++++++------- prog/lantern/lantern_server_nexus.rb | 10 ++++----- rhizome/lantern/lib/common.rb | 2 +- routes/api/project/location/lantern.rb | 7 +----- routes/web/project/location/lantern.rb | 6 +---- spec/model/lantern/lantern_resource_spec.rb | 4 ++-- spec/model/lantern/lantern_server_spec.rb | 22 +++++++++++++++++++ .../prog/lantern/lantern_server_nexus_spec.rb | 4 ++-- .../api/project/location/lantern_spec.rb | 2 +- 10 files changed, 49 insertions(+), 31 deletions(-) diff --git a/model/lantern/lantern_resource.rb b/model/lantern/lantern_resource.rb index 435f648d8..9fb4fb15a 100644 --- a/model/lantern/lantern_resource.rb +++ b/model/lantern/lantern_resource.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true +require "uri" require_relative "../../model" class LanternResource < Sequel::Model @@ -159,6 +160,7 @@ def listen_ddl_log END; $$ LANGUAGE plpgsql; + DROP TRIGGER IF EXISTS execute_ddl_after_insert ON ddl_log; CREATE TRIGGER execute_ddl_after_insert AFTER INSERT ON ddl_log FOR EACH ROW @@ -197,9 +199,12 @@ def sync_sequences_with_parent def create_and_enable_subscription representative_server.list_all_databases.each do |db| + uri = URI.parse(parent.connection_string(port: 5432)) + new_query_ar = URI.decode_www_form(String(uri.query)) << ["dbname", db] + uri.query = URI.encode_www_form(new_query_ar) commands = < domain} + strand = instance_double(Strand) + expect(strand).to receive(:stack).and_return([frame]).at_least(:once) + expect(frame).to receive(:delete).with("domain") + expect(strand).to receive(:modified!).with(:stack) + expect(strand).to receive(:save_changes) + expect { lantern_server.remove_domain_from_stack(strand) }.not_to raise_error + end end end diff --git a/spec/prog/lantern/lantern_server_nexus_spec.rb b/spec/prog/lantern/lantern_server_nexus_spec.rb index 522bc5139..b7cfe5220 100644 --- a/spec/prog/lantern/lantern_server_nexus_spec.rb +++ b/spec/prog/lantern/lantern_server_nexus_spec.rb @@ -322,7 +322,7 @@ it "hops to wait_synchronization" do leader = instance_double(LanternServer, domain: "db.lantern.dev") - expect(lantern_server).to receive(:add_domain_to_stack).with(leader.domain) + expect(lantern_server).to receive(:add_domain_to_stack).with(leader.domain, nx.strand) expect(nx).to receive(:incr_setup_ssl) expect(lantern_server).to receive(:domain).and_return(nil) expect(lantern_server).to receive(:update).with({synchronization_status: "ready"}) @@ -443,7 +443,7 @@ expect(lantern_server.resource).to receive(:parent).and_return(parent_reosurce).at_least(:once) expect(lantern_server.resource).to receive(:logical_replication).and_return(true) expect(lantern_server.resource).to receive(:allow_timeline_access_to_bucket) - expect(lantern_server).to receive(:add_domain_to_stack).with(parent_reosurce.representative_server.domain) + expect(lantern_server).to receive(:add_domain_to_stack).with(parent_reosurce.representative_server.domain, nx.strand) expect(nx).to receive(:incr_setup_ssl) expect(lantern_server).to receive(:run_query).and_return("f") expect(lantern_server).to receive(:timeline_id=) diff --git a/spec/routes/api/project/location/lantern_spec.rb b/spec/routes/api/project/location/lantern_spec.rb index fc76c40d1..2a1a009d0 100644 --- a/spec/routes/api/project/location/lantern_spec.rb +++ b/spec/routes/api/project/location/lantern_spec.rb @@ -216,7 +216,7 @@ it "adds domain" do post "/api/project/#{project.ubid}/location/#{pg.location}/lantern/instance-1/add-domain", {domain: "example.com"} server = LanternServer.where(id: pg.representative_server.id).first - expect(server.domain).to eq("example.com") + expect(server.strand.stack.first["domain"]).to eq("example.com") expect(last_response.status).to eq(200) end end From 31f168ffbde3a7e9466c9df23620f5c57d05101c Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Tue, 12 Nov 2024 17:05:20 +0400 Subject: [PATCH 13/13] add rollback switchover functionality in misc operations --- misc/misc_operations.rb | 25 +++++++++++++++++++ prog/lantern/lantern_server_nexus.rb | 8 +----- .../prog/lantern/lantern_server_nexus_spec.rb | 9 ------- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/misc/misc_operations.rb b/misc/misc_operations.rb index 5b17ef44c..4cebf2eaf 100644 --- a/misc/misc_operations.rb +++ b/misc/misc_operations.rb @@ -240,4 +240,29 @@ def self.create_image(lantern_version: "0.2.7", extras_version: "0.1.5", minor_v puts "Image created" vm.incr_destroy end + + def self.rollback_switchover(current_resource, old_resource) + # stop current one and start old one + begin + current_resource.representative_server.stop_container(1) + rescue + end + + old_resource.representative_server.start_container + + # update dns + cf_client = Dns::Cloudflare.new + cf_client.upsert_dns_record(current_resource.representative_server.domain, old_resource.representative_server.vm.sshable.host) + old_resource.representative_server.update(domain: current_resource.representative_server.domain) + current_resource.representative_server.update(domain: nil) + + # disable readonly as soon as it is started + loop do + old_resource.representative_server.run_query("SELECT 1") + old_resource.set_to_readonly(status: "off") + break + rescue + sleep 10 + end + end end diff --git a/prog/lantern/lantern_server_nexus.rb b/prog/lantern/lantern_server_nexus.rb index fa8829ddb..313694955 100644 --- a/prog/lantern/lantern_server_nexus.rb +++ b/prog/lantern/lantern_server_nexus.rb @@ -379,13 +379,7 @@ def before_run end cf_client = Dns::Cloudflare.new - begin - cf_client.upsert_dns_record(frame["domain"], lantern_server.vm.sshable.host) - rescue => e - Clog.emit("Error while adding domain") { {error: e} } - decr_add_domain - hop_wait - end + cf_client.upsert_dns_record(frame["domain"], lantern_server.vm.sshable.host) lantern_server.update(domain: frame["domain"]) diff --git a/spec/prog/lantern/lantern_server_nexus_spec.rb b/spec/prog/lantern/lantern_server_nexus_spec.rb index b7cfe5220..0487788d1 100644 --- a/spec/prog/lantern/lantern_server_nexus_spec.rb +++ b/spec/prog/lantern/lantern_server_nexus_spec.rb @@ -623,15 +623,6 @@ expect { nx.add_domain }.to raise_error "no domain in stack" end - it "fails to add domain" do - expect(nx).to receive(:frame).and_return({"domain" => "db.lantern.dev"}).at_least(:once) - expect(lantern_server.vm.sshable).to receive(:host).and_return("1.1.1.1") - cf_client = instance_double(Dns::Cloudflare) - expect(Dns::Cloudflare).to receive(:new).and_return(cf_client) - expect(cf_client).to receive(:upsert_dns_record).and_raise - expect { nx.add_domain }.to hop("wait") - end - it "adds domain and setup ssl" do expect(lantern_server.vm.sshable).to receive(:host).and_return("1.1.1.1") cf_client = instance_double(Dns::Cloudflare)