diff --git a/config.rb b/config.rb index 7b2cf75f3..b8e15f461 100644 --- a/config.rb +++ b/config.rb @@ -154,6 +154,7 @@ def self.e2e_test? override :e2e_test, "0" override :backup_retention_days, 7, int override :lantern_log_dataset, "lantern_logs", string + override :compose_file, "/var/lib/lantern/docker-compose.yaml", string # Cloudflare optional :cf_token, string diff --git a/model/lantern/lantern_resource.rb b/model/lantern/lantern_resource.rb index c6fc64df6..a60eee885 100644 --- a/model/lantern/lantern_resource.rb +++ b/model/lantern/lantern_resource.rb @@ -192,6 +192,10 @@ def disable_logical_subscription end def create_logical_replica(lantern_version: nil, extras_version: nil, minor_version: nil) + # TODO: + # 1. If a new database is created during logical replication it won't be added automatically + # 2. A new timeline will be generated for the lantern resource + # 3. We need a rollback mechanism (basically that will be an IP swap again) ubid = LanternResource.generate_ubid create_ddl_log create_publication("pub_#{ubid}") diff --git a/model/lantern/lantern_server.rb b/model/lantern/lantern_server.rb index b42b1aa9c..dc6300998 100644 --- a/model/lantern/lantern_server.rb +++ b/model/lantern/lantern_server.rb @@ -17,7 +17,7 @@ class LanternServer < Sequel::Model include SemaphoreMethods semaphore :initial_provisioning, :update_user_password, :update_lantern_extension, :update_extras_extension, :update_image, :setup_ssl, :add_domain, :update_rhizome, :checkup - semaphore :start_server, :stop_server, :restart_server, :take_over, :destroy, :update_storage_size, :update_vm_size, :update_memory_limits, :init_sql, :restart + semaphore :start_server, :stop_server, :restart_server, :take_over, :destroy, :update_storage_size, :update_vm_size, :update_memory_limits, :init_sql, :restart, :container_stopped def self.ubid_to_name(id) id.to_s[0..7] end @@ -48,7 +48,7 @@ def connection_string(port: 6432) end def run_query(query, db: "postgres", user: "postgres") - 
vm.sshable.cmd("sudo docker compose -f /var/lib/lantern/docker-compose.yaml exec -T postgresql psql -q -U #{user} -t --csv #{db}", stdin: query).chomp + vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} exec -T postgresql psql -q -U #{user} -t --csv #{db}", stdin: query).chomp end def run_query_all(query) @@ -57,7 +57,7 @@ def run_query_all(query) def display_state return "deleting" if destroy_set? || strand.label == "destroy" - return "stopped" if vm.display_state == "stopped" + return "stopped" if vm.display_state == "stopped" || strand.label == "container_stopped" return "stopping" if vm.display_state == "stopping" return "starting" if vm.display_state == "starting" return "failed" if vm.display_state == "failed" @@ -132,14 +132,15 @@ def configure_hash }) end - def change_replication_mode(replication_mode, lazy: true) + def change_replication_mode(replication_mode, update_env: true) update(timeline_access: (replication_mode == "master") ? "push" : "fetch", representative_at: (replication_mode == "master") ? Time.new : nil) - cmd = lazy ? "lazy_update_env" : "update_env" - vm.sshable.cmd("sudo lantern/bin/#{cmd}", stdin: JSON.generate([ - ["POSTGRESQL_REPLICATION_MODE", replication_mode], - ["INSTANCE_TYPE", (replication_mode == "master") ? "writer" : "reader"], - ["POSTGRESQL_RECOVER_FROM_BACKUP", ""] - ])) + if update_env + vm.sshable.cmd("sudo lantern/bin/update_env", stdin: JSON.generate([ + ["POSTGRESQL_REPLICATION_MODE", replication_mode], + ["INSTANCE_TYPE", (replication_mode == "master") ? 
"writer" : "reader"], + ["POSTGRESQL_RECOVER_FROM_BACKUP", ""] + ])) + end end def update_walg_creds @@ -219,7 +220,7 @@ def prewarm_indexes_query end def list_all_databases - vm.sshable.cmd("sudo docker compose -f /var/lib/lantern/docker-compose.yaml exec postgresql psql -U postgres -P \"footer=off\" -c 'SELECT datname from pg_database' | tail -n +3 | grep -v 'template0' | grep -v 'template1'") + vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} exec postgresql psql -U postgres -P \"footer=off\" -c 'SELECT datname from pg_database' | tail -n +3 | grep -v 'template0' | grep -v 'template1'") .chomp .strip .split("\n") diff --git a/prog/lantern/lantern_server_nexus.rb b/prog/lantern/lantern_server_nexus.rb index 2166bbb3d..7cea3030f 100644 --- a/prog/lantern/lantern_server_nexus.rb +++ b/prog/lantern/lantern_server_nexus.rb @@ -11,7 +11,7 @@ class Prog::Lantern::LanternServerNexus < Prog::Base def_delegators :lantern_server, :vm semaphore :initial_provisioning, :update_user_password, :update_lantern_extension, :update_extras_extension, :update_image, :add_domain, :update_rhizome, :checkup - semaphore :start_server, :stop_server, :restart_server, :take_over, :destroy, :update_storage_size, :update_vm_size, :update_memory_limits, :init_sql, :restart + semaphore :start_server, :stop_server, :restart_server, :take_over, :destroy, :update_storage_size, :update_vm_size, :update_memory_limits, :init_sql, :restart, :container_stopped def self.assemble( resource_id: nil, lantern_version: "0.2.2", extras_version: "0.1.4", minor_version: "1", domain: nil, @@ -446,9 +446,23 @@ def destroy_domain hop_take_over end + when_container_stopped_set? do + hop_container_stopped + end + nap 30 end + label def container_stopped + decr_container_stopped + when_take_over_set? 
do + vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} up -d") + hop_take_over + end + + nap 15 + end + label def promote_server current_master = lantern_server.resource.representative_server current_master_domain = current_master.domain @@ -458,9 +472,11 @@ def destroy_domain current_master.update(domain: new_master_domain) lantern_server.run_query("SELECT pg_promote(true, 120);") - lantern_server.resource.set_to_readonly(status: "off") - current_master.change_replication_mode("slave") - lantern_server.change_replication_mode("master", lazy: false) + # we will mark the old server as slave, + # but don't change the docker env, so in case of emergency + # we could roll back to that instance + current_master.change_replication_mode("slave", update_env: false) + lantern_server.change_replication_mode("master") hop_wait end @@ -468,8 +484,7 @@ def destroy_domain label def wait_swap_ip # wait until ip change will propogate begin - is_in_recovery = lantern_server.run_query("SELECT pg_is_in_recovery()").chomp == "t" - nap 5 if !is_in_recovery + lantern_server.run_query("SELECT 1") rescue nap 5 end @@ -483,9 +498,15 @@ def destroy_domain hop_wait end - lantern_server.resource.set_to_readonly(status: "on") - lantern_server.vm.swap_ip(lantern_server.resource.representative_server.vm) + lantern_server.resource.representative_server.vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} down -t 60") + # put the old server in container_stopped mode, so no healthcheck will be done + lantern_server.resource.representative_server.incr_container_stopped + hop_swap_ip + end + + label def swap_ip + lantern_server.vm.swap_ip(lantern_server.resource.representative_server.vm) register_deadline(:promote_server, 5 * 60) hop_wait_swap_ip end diff --git a/spec/model/lantern/lantern_server_spec.rb b/spec/model/lantern/lantern_server_spec.rb index bca99b032..84bfa4549 100644 --- a/spec/model/lantern/lantern_server_spec.rb +++ b/spec/model/lantern/lantern_server_spec.rb @@ 
-110,6 +110,12 @@ expect(lantern_server.display_state).to eq("stopped") end + it "shows stopped (container)" do + expect(lantern_server.vm).to receive(:display_state).and_return("running").at_least(:once) + expect(lantern_server).to receive(:strand).and_return(instance_double(Strand, label: "container_stopped")).at_least(:once) + expect(lantern_server.display_state).to eq("stopped") + end + it "shows failed" do expect(lantern_server.vm).to receive(:display_state).and_return("failed").at_least(:once) expect(lantern_server).to receive(:strand).and_return(instance_double(Strand, label: "unknown")).at_least(:once) @@ -709,6 +715,13 @@ end describe "#change_replication_mode" do + it "changes to master without env" do + time = Time.new + expect(Time).to receive(:new).and_return(time) + expect(lantern_server).to receive(:update).with(timeline_access: "push", representative_at: time) + lantern_server.change_replication_mode("master", update_env: false) + end + it "changes to master" do time = Time.new expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo lantern/bin/update_env", stdin: JSON.generate([ @@ -718,11 +731,11 @@ ])) expect(Time).to receive(:new).and_return(time) expect(lantern_server).to receive(:update).with(timeline_access: "push", representative_at: time) - lantern_server.change_replication_mode("master", lazy: false) + lantern_server.change_replication_mode("master", update_env: true) end it "changes to slave" do - expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo lantern/bin/lazy_update_env", stdin: JSON.generate([ + expect(lantern_server.vm.sshable).to receive(:cmd).with("sudo lantern/bin/update_env", stdin: JSON.generate([ ["POSTGRESQL_REPLICATION_MODE", "slave"], ["INSTANCE_TYPE", "reader"], ["POSTGRESQL_RECOVER_FROM_BACKUP", ""] diff --git a/spec/prog/lantern/lantern_server_nexus_spec.rb b/spec/prog/lantern/lantern_server_nexus_spec.rb index 41007ea6e..b6afde820 100644 --- a/spec/prog/lantern/lantern_server_nexus_spec.rb +++ 
b/spec/prog/lantern/lantern_server_nexus_spec.rb @@ -708,6 +708,11 @@ expect { nx.wait }.to hop("take_over") end + it "hops to container_stopped" do + nx.incr_container_stopped + expect { nx.wait }.to hop("container_stopped") + end + it "decrements checkup" do nx.incr_checkup expect(nx).to receive(:available?).and_return(true) @@ -923,34 +928,37 @@ describe "#take_over" do it "returns if primary" do expect(lantern_server).to receive(:standby?).and_return(false) - expect(lantern_server).not_to receive(:run_query) expect { nx.take_over }.to hop("wait") end - it "swap ips" do + it "stop old master" do expect(lantern_server).to receive(:standby?).and_return(true) current_master = instance_double(LanternServer, domain: "db1.lantern.dev", vm: instance_double(GcpVm, sshable: instance_double(Sshable, host: "127.0.0.1"), name: "old-master", location: "us-east1", address_name: "old-addr")) expect(lantern_server.resource).to receive(:representative_server).and_return(current_master).at_least(:once) - expect(lantern_server.vm).to receive(:swap_ip).with(current_master.vm) - expect(lantern_server.resource).to receive(:set_to_readonly).with(status: "on") + expect(current_master.vm.sshable).to receive(:cmd) + expect(current_master).to receive(:incr_container_stopped) - expect { nx.take_over }.to hop("wait_swap_ip") + expect { nx.take_over }.to hop("swap_ip") end - it "waits until vm available" do - expect(lantern_server).to receive(:run_query).with("SELECT pg_is_in_recovery()").and_raise "test" - expect { nx.wait_swap_ip }.to nap 5 + it "swap ips" do + current_master = instance_double(LanternServer, domain: "db1.lantern.dev", vm: instance_double(GcpVm, sshable: instance_double(Sshable, host: "127.0.0.1"), name: "old-master", location: "us-east1", address_name: "old-addr")) + expect(lantern_server.resource).to receive(:representative_server).and_return(current_master).at_least(:once) + + expect(lantern_server.vm).to receive(:swap_ip).with(current_master.vm) + + expect { nx.swap_ip 
}.to hop("wait_swap_ip") end - it "waits until ip swap done" do - expect(lantern_server).to receive(:run_query).with("SELECT pg_is_in_recovery()").and_return("f") + it "waits until vm available" do + expect(lantern_server).to receive(:run_query).with("SELECT 1").and_raise "test" expect { nx.wait_swap_ip }.to nap 5 end it "hops to promote" do - expect(lantern_server).to receive(:run_query).with("SELECT pg_is_in_recovery()").and_return("t") + expect(lantern_server).to receive(:run_query).with("SELECT 1") expect { nx.wait_swap_ip }.to hop("promote_server") end @@ -958,14 +966,25 @@ current_master = instance_double(LanternServer, domain: "db1.lantern.dev", vm: instance_double(GcpVm, sshable: instance_double(Sshable, host: "127.0.0.1"), name: "old-master", location: "us-east1", address_name: "old-addr")) expect(lantern_server.resource).to receive(:representative_server).and_return(current_master).at_least(:once) - expect(lantern_server.resource).to receive(:set_to_readonly).with(status: "off") expect(current_master).to receive(:update).with(domain: lantern_server.domain).at_least(:once) expect(lantern_server).to receive(:update).with(domain: current_master.domain).at_least(:once) expect(lantern_server).to receive(:run_query).with("SELECT pg_promote(true, 120);") - expect(current_master).to receive(:change_replication_mode).with("slave") - expect(lantern_server).to receive(:change_replication_mode).with("master", lazy: false) + expect(current_master).to receive(:change_replication_mode).with("slave", update_env: false) + expect(lantern_server).to receive(:change_replication_mode).with("master") expect { nx.promote_server }.to hop("wait") end end + + describe "#container_stopped" do + it "hops to take_over" do + nx.incr_take_over + expect(lantern_server.vm.sshable).to receive(:cmd) + expect { nx.container_stopped }.to hop("take_over") + end + + it "naps 15" do + expect { nx.container_stopped }.to nap(15) + end + end end