From b312eefad306814dc08b0570330394dcf26a0560 Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Thu, 2 May 2024 15:54:31 -0700 Subject: [PATCH] Fix lantern server display status based on vm status, fix deadlines, dont check pulse if not running --- lib/option.rb | 6 +- ...20240502_lantern_resource_display_state.rb | 9 +++ model/gcp_vm.rb | 5 -- model/lantern/lantern_resource.rb | 2 +- model/lantern/lantern_server.rb | 10 +++ prog/gcp_vm/nexus.rb | 33 +++++++--- prog/lantern/lantern_resource_nexus.rb | 11 +++- prog/lantern/lantern_server_nexus.rb | 65 +++++++++++++++---- prog/lantern/lantern_timeline_nexus.rb | 8 +-- routes/web/project/location/lantern.rb | 3 - spec/model/gcp_vm_spec.rb | 5 -- spec/model/lantern/lantern_server_spec.rb | 50 ++++++++++++++ spec/prog/gcp_vm/nexus_spec.rb | 30 +++++++-- .../lantern/lantern_resource_nexus_spec.rb | 22 +++++++ .../prog/lantern/lantern_server_nexus_spec.rb | 62 ++++++++++++++---- .../lantern/lantern_timeline_nexus_spec.rb | 17 +---- 16 files changed, 255 insertions(+), 83 deletions(-) create mode 100644 migrate/20240502_lantern_resource_display_state.rb diff --git a/lib/option.rb b/lib/option.rb index 48446655f..7c1f72cfa 100644 --- a/lib/option.rb +++ b/lib/option.rb @@ -30,7 +30,7 @@ def self.lantern_locations_for_provider(provider) VmSize = Struct.new(:name, :family, :vcpu, :memory, :storage_size_gib) do alias_method :display_name, :name end - VmSizes = [2, 4, 8, 16, 32, 64].map { + VmSizes = [1, 2, 4, 8, 16, 32, 64].map { VmSize.new("n1-standard-#{_1}", "n1-standard", _1, _1 * 4, (_1 / 2) * 25) }.freeze @@ -38,8 +38,8 @@ def self.lantern_locations_for_provider(provider) alias_method :display_name, :name end - LanternSizes = [2, 4, 8, 16, 32, 64].map { - LanternSize.new("n1-standard-#{_1}", "n1-standard-#{_1}", "n1-standard", _1, _1 * 4, (_1 / 2) * 128) + LanternSizes = [1, 2, 4, 8, 16, 32, 64].map { + LanternSize.new("n1-standard-#{_1}", "n1-standard-#{_1}", "n1-standard", _1, _1 * 4, _1 * 64) }.freeze LanternHaOption = Struct.new(:name, :standby_count, :title, :explanation) diff --git a/migrate/20240502_lantern_resource_display_state.rb b/migrate/20240502_lantern_resource_display_state.rb new file mode 100644 index 000000000..99442370d --- /dev/null +++ b/migrate/20240502_lantern_resource_display_state.rb @@ -0,0 +1,9 @@ +# frozen_string_literal: true + +Sequel.migration do + change do + alter_table(:lantern_resource) do + add_column :display_state, :text, null: true + end + end +end diff --git a/model/gcp_vm.rb b/model/gcp_vm.rb index a887640f8..dcbf25d3b 100644 --- a/model/gcp_vm.rb +++ b/model/gcp_vm.rb @@ -31,11 +31,6 @@ def host sshable&.host end - def display_state - return "deleting" if destroy_set? - super - end - def mem_gib_ratio return 3.2 if arch == "arm64" 8 diff --git a/model/lantern/lantern_resource.rb b/model/lantern/lantern_resource.rb index cfe14edf9..423240c35 100644 --- a/model/lantern/lantern_resource.rb +++ b/model/lantern/lantern_resource.rb @@ -40,7 +40,7 @@ def path end def display_state - representative_server&.display_state || "unavailable" + super || representative_server&.display_state || "unavailable" end def connection_string diff --git a/model/lantern/lantern_server.rb b/model/lantern/lantern_server.rb index 0c1ba544f..924098055 100644 --- a/model/lantern/lantern_server.rb +++ b/model/lantern/lantern_server.rb @@ -59,6 +59,11 @@ def display_state return "domain setup" if strand.label.include?("domain") return "ssl setup" if strand.label.include?("setup_ssl") return "updating" if strand.label.include?("update") + return "updating" if strand.label.include?("init_sql") + return "stopped" if vm.display_state.include?("stopped") + return "stopping" if vm.display_state.include?("stopping") + return "starting" if vm.display_state.include?("starting") + return "failed" if vm.display_state.include?("failed") return "unavailable" if strand.label.include?("wait_db_available") return "running" if ["wait"].include?(strand.label) return "deleting" if destroy_set? || strand.label == "destroy" @@ -147,6 +152,11 @@ def init_health_monitor_session end def check_pulse(session:, previous_pulse:) + if display_state != "running" + # if there's an operation ongoing, do not check the pulse + return previous_pulse + end + reading = begin session[:db_connection] ||= Sequel.connect(connection_string) lsn_function = primary? ? "pg_current_wal_lsn()" : "pg_last_wal_receive_lsn()" diff --git a/prog/gcp_vm/nexus.rb b/prog/gcp_vm/nexus.rb index a26daa482..a8253bd11 100644 --- a/prog/gcp_vm/nexus.rb +++ b/prog/gcp_vm/nexus.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true +require "forwardable" require "netaddr" require "json" require "shellwords" @@ -10,6 +11,10 @@ class Prog::GcpVm::Nexus < Prog::Base subject_is :gcp_vm + + extend Forwardable + def_delegators :gcp_vm + semaphore :destroy, :start_vm, :stop_vm, :update_storage, :update_size def self.assemble(public_key, project_id, name: nil, size: "n1-standard-2", @@ -85,10 +90,14 @@ def host end label def start + register_deadline(:failed_provisioning, 10 * 60) + hop_create_vm + end + + label def create_vm gcp_client = Hosting::GcpApis.new labels = frame["labels"] gcp_client.create_vm(gcp_vm.name, "#{gcp_vm.location}-a", gcp_vm.boot_image, gcp_vm.public_key, gcp_vm.unix_user, "#{gcp_vm.family}-#{gcp_vm.cores}", gcp_vm.storage_size_gib, labels: labels) - register_deadline(:wait, 10 * 60) # remove labels from stack current_frame = strand.stack.first @@ -99,6 +108,11 @@ def host hop_wait_create_vm end + label def failed_provisioning + gcp_vm.update(display_state: "failed") + hop_wait + end + label def wait_sshable addr = gcp_vm.sshable.host @@ -123,23 +137,30 @@ def host label def wait when_stop_vm_set? do register_deadline(:wait, 5 * 60) + gcp_vm.update(display_state: "stopping") hop_stop_vm end when_start_vm_set? do register_deadline(:wait, 5 * 60) + gcp_vm.update(display_state: "starting") hop_start_vm end when_destroy_set? do + gcp_vm.update(display_state: "deleting") hop_destroy end when_update_size_set? do + register_deadline(:wait, 5 * 60) + gcp_vm.update(display_state: "updating") hop_update_size end when_update_storage_set? do + register_deadline(:wait, 5 * 60) + gcp_vm.update(display_state: "updating") hop_update_storage end @@ -157,8 +178,6 @@ def host end label def stop_vm - gcp_vm.update(display_state: "stopping") - gcp_client = Hosting::GcpApis.new gcp_client.stop_vm(gcp_vm.name, "#{gcp_vm.location}-a") @@ -168,13 +187,9 @@ def host end label def start_vm - gcp_vm.update(display_state: "starting") - gcp_client = Hosting::GcpApis.new gcp_client.start_vm(gcp_vm.name, "#{gcp_vm.location}-a") - gcp_vm.update(display_state: "running") - decr_start_vm hop_wait_sshable @@ -185,7 +200,6 @@ def host hop_stop_vm end decr_update_storage - register_deadline(:wait, 5 * 60) gcp_vm.update(display_state: "updating") gcp_client = Hosting::GcpApis.new zone = "#{gcp_vm.location}-a" @@ -205,12 +219,12 @@ def host hop_stop_vm end decr_update_size - register_deadline(:wait, 5 * 60) gcp_vm.update(display_state: "updating") gcp_client = Hosting::GcpApis.new gcp_client.update_vm_type(gcp_vm.name, "#{gcp_vm.location}-a", gcp_vm.display_size) when_update_storage_set? do + register_deadline(:wait, 5 * 60) hop_update_storage end @@ -219,7 +233,6 @@ def host label def destroy DB.transaction do - gcp_vm.update(display_state: "deleting") gcp_client = Hosting::GcpApis.new gcp_client.delete_vm(gcp_vm.name, "#{gcp_vm.location}-a") if gcp_vm.has_static_ipv4 diff --git a/prog/lantern/lantern_resource_nexus.rb b/prog/lantern/lantern_resource_nexus.rb index 02c7243c9..1a9c97ee3 100644 --- a/prog/lantern/lantern_resource_nexus.rb +++ b/prog/lantern/lantern_resource_nexus.rb @@ -112,13 +112,18 @@ def before_run label def start nap 5 unless representative_server.vm.strand.label == "wait" - register_deadline(:wait, 10 * 60) + register_deadline(:failed_provisioning, 10 * 60) # bud self.class, frame, :trigger_pg_current_xact_id_on_parent if lantern_resource.parent # hop_wait_trigger_pg_current_xact_id_on_parent hop_wait_servers end + label def failed_provisioning + lantern_resource.update(display_state: "failed") + hop_wait + end + # TODO:: check why is this needed # label def trigger_pg_current_xact_id_on_parent # lantern_resource.parent.representative_server.run_query("SELECT pg_current_xact_id()") @@ -142,6 +147,10 @@ def before_run Prog::Lantern::LanternServerNexus.assemble(resource_id: lantern_resource.id, timeline_id: lantern_resource.timeline.id, timeline_access: "fetch") end + if lantern_resource.display_state == "failed" && servers.any? { _1.strand.label == "wait" } + lantern_resource.update(display_state: nil) + end + nap 30 end diff --git a/prog/lantern/lantern_server_nexus.rb b/prog/lantern/lantern_server_nexus.rb index d729ab390..2412dbb6a 100644 --- a/prog/lantern/lantern_server_nexus.rb +++ b/prog/lantern/lantern_server_nexus.rb @@ -111,7 +111,10 @@ def before_run label def wait_bootstrap_rhizome reap - hop_setup_docker_stack if leaf? + if leaf? + register_deadline(:wait, 10 * 60) + hop_setup_docker_stack + end donate end @@ -120,8 +123,6 @@ def before_run raise "GCP_CREDS_GCR_B64 is required to setup docker stack for Lantern" end - register_deadline(:wait, 10 * 60) - case vm.sshable.cmd("common/bin/daemonizer --check configure_lantern") when "Succeeded" vm.sshable.cmd("common/bin/daemonizer --clean configure_lantern") @@ -137,8 +138,6 @@ def before_run end label def init_sql - register_deadline(:wait, 40 * 60) - case vm.sshable.cmd("common/bin/daemonizer --check init_sql") when "Succeeded" vm.sshable.cmd("common/bin/daemonizer --clean init_sql") @@ -190,21 +189,42 @@ def before_run lantern_server.timeline_access = "push" lantern_server.save_changes - lantern_server.update_walg_creds + lantern_version = lantern_server.run_query("SELECT extversion FROM pg_extension WHERE extname='lantern'") + extras_version = lantern_server.run_query("SELECT extversion FROM pg_extension WHERE extname='lantern_extras'") - hop_wait + if lantern_version != lantern_server.lantern_version + incr_update_lantern_extension + lantern_server.update(lantern_version: lantern_version) + end + + if extras_version != lantern_server.extras_version + incr_update_extras_extension + lantern_server.update(extras_version: extras_version) + end + + hop_wait_timeline_available end nap 5 end + label def wait_timeline_available + nap 10 if lantern_server.timeline.strand.label != "wait_leader" + lantern_server.update_walg_creds + decr_initial_provisioning + hop_wait_db_available + end + label def wait_db_available nap 10 if !available? when_initial_provisioning_set? do decr_initial_provisioning - hop_init_sql if lantern_server.primary? + if lantern_server.primary? + register_deadline(:wait, 40 * 60) + hop_init_sql + end hop_wait_catch_up if lantern_server.standby? hop_wait_recovery_completion end @@ -222,6 +242,7 @@ def before_run when "Succeeded" vm.sshable.cmd("common/bin/daemonizer --clean update_lantern") decr_update_lantern_extension + register_deadline(:wait, 40 * 60) hop_init_sql when "NotStarted" vm.sshable.cmd("common/bin/daemonizer 'sudo lantern/bin/update_lantern' update_lantern", stdin: JSON.generate({version: lantern_server.lantern_version})) @@ -241,6 +262,7 @@ def before_run when "Succeeded" vm.sshable.cmd("common/bin/daemonizer --clean update_extras") decr_update_extras_extension + register_deadline(:wait, 40 * 60) hop_init_sql when "NotStarted" vm.sshable.cmd("common/bin/daemonizer 'sudo lantern/bin/update_extras' update_extras", stdin: JSON.generate({version: lantern_server.extras_version})) @@ -290,6 +312,7 @@ def before_run end decr_add_domain + register_deadline(:wait, 5 * 60) hop_setup_ssl end @@ -299,7 +322,6 @@ def destroy_domain end label def setup_ssl - register_deadline(:wait, 5 * 60) case vm.sshable.cmd("common/bin/daemonizer --check setup_ssl") when "Succeeded" vm.sshable.cmd("common/bin/daemonizer --clean setup_ssl") @@ -351,8 +373,11 @@ def destroy_domain reap when_checkup_set? do - hop_unavailable if !available? decr_checkup + if !available? + register_deadline(:wait, 5 * 60) + hop_unavailable + end end when_update_user_password_set? do @@ -379,6 +404,20 @@ def destroy_domain hop_add_domain end + # We will always update rhizome before updating extensions + # In case something is changed in rhizome scripts + when_update_lantern_extension_set? do + hop_update_rhizome + end + + when_update_extras_extension_set? do + hop_update_rhizome + end + + when_update_image_set? do + hop_update_rhizome + end + when_update_rhizome_set? do hop_update_rhizome end @@ -395,8 +434,6 @@ def destroy_domain end label def unavailable - register_deadline(:wait, 5 * 60) - # TODO # if postgres_server.primary? && (standby = postgres_server.failover_target) # standby.incr_take_over @@ -442,7 +479,9 @@ def destroy_domain destroy_domain end - lantern_server.timeline.incr_destroy + if lantern_server.primary? + lantern_server.timeline.incr_destroy + end lantern_server.destroy vm.incr_destroy diff --git a/prog/lantern/lantern_timeline_nexus.rb b/prog/lantern/lantern_timeline_nexus.rb index 22d851e7f..d0282d800 100644 --- a/prog/lantern/lantern_timeline_nexus.rb +++ b/prog/lantern/lantern_timeline_nexus.rb @@ -54,7 +54,7 @@ def before_run if lantern_timeline.need_cleanup? retain_after = (Time.new - (24 * 60 * 60 * Config.backup_retention_days)).strftime("%Y-%m-%dT%H:%M:%S.%LZ") - cmd = "docker compose -f /var/lib/lantern/docker-compos.yaml exec -T -u root postgresql bash -c \"GOOGLE_APPLICATION_CREDENTIALS=/tmp/google-application-credentials-wal-g.json /opt/bitnami/postgresql/bin/wal-g delete retain FULL 7 --after #{retain_after} --confirm\"" + cmd = "docker compose -f /var/lib/lantern/docker-compose.yaml exec -T -u root postgresql bash -c \"GOOGLE_APPLICATION_CREDENTIALS=/tmp/google-application-credentials-wal-g.json /opt/bitnami/postgresql/bin/wal-g delete retain FULL 7 --after #{retain_after} --confirm\"" lantern_timeline.leader.vm.sshable.cmd("common/bin/daemonizer '#{cmd}' delete_old_backups") end @@ -85,12 +85,6 @@ def before_run label def destroy decr_destroy destroy_blob_storage - if !lantern_timeline.children.empty? - lantern_timeline.children.map do |timeline| - timeline.parent_id = nil - timeline.save_changes - end - end lantern_timeline.destroy pop "lantern timeline is deleted" end diff --git a/routes/web/project/location/lantern.rb b/routes/web/project/location/lantern.rb index 98d395d6f..398dc53b0 100644 --- a/routes/web/project/location/lantern.rb +++ b/routes/web/project/location/lantern.rb @@ -40,8 +40,6 @@ class CloverWeb r.post "update-extension" do Authorization.authorize(@current_user.id, "Postgres:edit", pg.id) - pg.representative_server.incr_update_rhizome - if r.params["lantern_version"] != pg.representative_server.lantern_version pg.representative_server.update(lantern_version: r.params["lantern_version"]) pg.representative_server.incr_update_lantern_extension @@ -59,7 +57,6 @@ class CloverWeb Authorization.authorize(@current_user.id, "Postgres:edit", pg.id) pg.representative_server.update(lantern_version: r.params["img_lantern_version"] || pg.representative_server.lantern_version, extras_version: r.params["img_extras_version"] || pg.representative_server.extras_version, minor_version: r.params["img_minor_version"] || pg.representative_server.minor_version) - pg.representative_server.incr_update_rhizome pg.representative_server.incr_update_image r.redirect "#{@project.path}#{pg.path}" end diff --git a/spec/model/gcp_vm_spec.rb b/spec/model/gcp_vm_spec.rb index d223b1da4..ec1425d9a 100644 --- a/spec/model/gcp_vm_spec.rb +++ b/spec/model/gcp_vm_spec.rb @@ -34,11 +34,6 @@ expect(gcp_vm.display_state).to be_nil end - it ".display_state when destroy set" do - expect(gcp_vm).to receive(:destroy_set?).and_return(true) - expect(gcp_vm.display_state).to eq("deleting") - end - it ".mem_gib_ratio x64" do expect(gcp_vm).to receive(:arch).and_return("amd64") expect(gcp_vm.mem_gib_ratio).to eq(8) diff --git a/spec/model/lantern/lantern_server_spec.rb b/spec/model/lantern/lantern_server_spec.rb index 720566aad..0991eddbd 100644 --- a/spec/model/lantern/lantern_server_spec.rb +++ b/spec/model/lantern/lantern_server_spec.rb @@ -47,32 +47,66 @@ expect(lantern_server.display_state).to eq("updating") end + it "shows updating if init_sql" do + expect(lantern_server).to receive(:strand).and_return(instance_double(Strand, label: "init_sql")).at_least(:once) + expect(lantern_server.display_state).to eq("updating") + end + it "shows running" do + expect(lantern_server.vm).to receive(:display_state).and_return("running").at_least(:once) expect(lantern_server).to receive(:strand).and_return(instance_double(Strand, label: "wait")).at_least(:once) expect(lantern_server.display_state).to eq("running") end it "shows deleting" do + expect(lantern_server.vm).to receive(:display_state).and_return("running").at_least(:once) expect(lantern_server).to receive(:destroy_set?).and_return(false).at_least(:once) expect(lantern_server).to receive(:strand).and_return(instance_double(Strand, label: "destroy")).at_least(:once) expect(lantern_server.display_state).to eq("deleting") end it "shows deleting if destroy set" do + expect(lantern_server.vm).to receive(:display_state).and_return("running").at_least(:once) expect(lantern_server).to receive(:destroy_set?).and_return(true).at_least(:once) expect(lantern_server).to receive(:strand).and_return(instance_double(Strand, label: "unknown")).at_least(:once) expect(lantern_server.display_state).to eq("deleting") end it "shows unavailable" do + expect(lantern_server.vm).to receive(:display_state).and_return("running").at_least(:once) expect(lantern_server).to receive(:strand).and_return(instance_double(Strand, label: "wait_db_available")).at_least(:once) expect(lantern_server.display_state).to eq("unavailable") end it "shows creating" do + expect(lantern_server.vm).to receive(:display_state).and_return("running").at_least(:once) expect(lantern_server).to receive(:strand).and_return(instance_double(Strand, label: "unknown")).at_least(:once) expect(lantern_server.display_state).to eq("creating") end + + it "shows starting" do + expect(lantern_server.vm).to receive(:display_state).and_return("starting").at_least(:once) + expect(lantern_server).to receive(:strand).and_return(instance_double(Strand, label: "unknown")).at_least(:once) + expect(lantern_server.display_state).to eq("starting") + end + + it "shows stopping" do + expect(lantern_server.vm).to receive(:display_state).and_return("stopping").at_least(:once) + expect(lantern_server).to receive(:strand).and_return(instance_double(Strand, label: "unknown")).at_least(:once) + expect(lantern_server.display_state).to eq("stopping") + end + + it "shows stopped" do + expect(lantern_server.vm).to receive(:display_state).and_return("stopped").at_least(:once) + expect(lantern_server).to receive(:strand).and_return(instance_double(Strand, label: "unknown")).at_least(:once) + expect(lantern_server.display_state).to eq("stopped") + end + + it "shows failed" do + expect(lantern_server.vm).to receive(:display_state).and_return("failed").at_least(:once) + expect(lantern_server).to receive(:strand).and_return(instance_double(Strand, label: "unknown")).at_least(:once) + expect(lantern_server.display_state).to eq("failed") + end end it "returns name from ubid" do @@ -456,6 +490,7 @@ reading_chg: Time.now - 30 } + expect(lantern_server).to receive(:display_state).and_return("running") expect(lantern_server).not_to receive(:incr_checkup) lantern_server.check_pulse(session: session, previous_pulse: pulse) end @@ -470,6 +505,7 @@ reading_chg: Time.now - 30 } + expect(lantern_server).to receive(:display_state).and_return("running") expect(lantern_server).to receive(:primary?).and_return(true) expect(lantern_server).not_to receive(:incr_checkup) lantern_server.check_pulse(session: session, previous_pulse: pulse) @@ -485,10 +521,24 @@ reading_chg: Time.now - 30 } + expect(lantern_server).to receive(:display_state).and_return("running") expect(session[:db_connection]).to receive(:[]).and_raise(Sequel::DatabaseConnectionError) expect(lantern_server).to receive(:incr_checkup) lantern_server.check_pulse(session: session, previous_pulse: pulse) end + + it "does not check the pulse if not running" do + session = { + db_connection: instance_double(Sequel::Postgres::Database) + } + pulse = { + reading: "down", + reading_rpt: 5, + reading_chg: Time.now - 30 + } + expect(lantern_server).to receive(:display_state).and_return("stopped") + lantern_server.check_pulse(session: session, previous_pulse: pulse) + end end describe "#prewarm_indexes" do diff --git a/spec/prog/gcp_vm/nexus_spec.rb b/spec/prog/gcp_vm/nexus_spec.rb index 4f98d4202..ada09e9e9 100644 --- a/spec/prog/gcp_vm/nexus_spec.rb +++ b/spec/prog/gcp_vm/nexus_spec.rb @@ -62,7 +62,12 @@ end describe "#create_vm" do - it "Hops to wait_create_vm on start" do + it "Hops to create_vm on start" do + expect(nx).to receive(:register_deadline).with(:failed_provisioning, 10 * 60) + expect { nx.start }.to hop("create_vm") + end + + it "Hops to wait_create_vm" do gcp_api = instance_double(Hosting::GcpApis) expect(Hosting::GcpApis).to receive(:new).and_return(gcp_api) frame = {"labels" => {"parent" => "test-label"}} @@ -71,7 +76,7 @@ expect(nx.strand).to receive(:stack).and_return([frame]).at_least(:once) expect(nx.strand).to receive(:modified!).with(:stack).at_least(:once) expect(nx.strand).to receive(:save_changes).at_least(:once) - expect { nx.start }.to hop("wait_create_vm") + expect { nx.create_vm }.to hop("wait_create_vm") end it "Naps 10 seconds if vm is not running" do @@ -129,8 +134,6 @@ describe "#start_vm" do it "hops to wait_sshable after run" do - expect(gcp_vm).to receive(:update).with({display_state: "starting"}) - expect(gcp_vm).to receive(:update).with({display_state: "running"}) gcp_api = instance_double(Hosting::GcpApis) expect(Hosting::GcpApis).to receive(:new).and_return(gcp_api) expect(gcp_api).to receive(:start_vm).with("dummy-vm", "us-central1-a").and_return({"status" => "DONE"}) @@ -140,7 +143,6 @@ describe "#stop_vm" do it "hops to wait after stop" do - expect(gcp_vm).to receive(:update).with({display_state: "stopping"}) expect(gcp_vm).to receive(:update).with({display_state: "stopped"}) gcp_api = instance_double(Hosting::GcpApis) expect(Hosting::GcpApis).to receive(:new).and_return(gcp_api).at_least(:once) @@ -156,7 +158,6 @@ describe "#destroy" do it "exits after run destroy" do expect(gcp_vm).to receive(:has_static_ipv4).and_return(false) - expect(gcp_vm).to receive(:update).with({display_state: "deleting"}) expect(gcp_vm).to receive(:destroy) gcp_api = instance_double(Hosting::GcpApis) expect(Hosting::GcpApis).to receive(:new).and_return(gcp_api) @@ -166,7 +167,6 @@ it "release ip4 if exists" do expect(gcp_vm).to receive(:has_static_ipv4).and_return(true) - expect(gcp_vm).to receive(:update).with({display_state: "deleting"}) expect(gcp_vm).to receive(:destroy) gcp_api = instance_double(Hosting::GcpApis) expect(Hosting::GcpApis).to receive(:new).and_return(gcp_api) @@ -179,25 +179,34 @@ describe "#wait" do it "hops to stop_vm" do expect(nx).to receive(:when_stop_vm_set?).and_yield + expect(nx).to receive(:register_deadline).with(:wait, 5 * 60) + expect(gcp_vm).to receive(:update).with(display_state: "stopping") expect { nx.wait }.to hop("stop_vm") end it "hops to start_vm" do + expect(nx).to receive(:register_deadline).with(:wait, 5 * 60) + expect(gcp_vm).to receive(:update).with(display_state: "starting") expect(nx).to receive(:when_start_vm_set?).and_yield expect { nx.wait }.to hop("start_vm") end it "hops to destroy" do + expect(gcp_vm).to receive(:update).with(display_state: "deleting") expect(nx).to receive(:when_destroy_set?).and_yield expect { nx.wait }.to hop("destroy") end it "hops to update_storage" do + expect(nx).to receive(:register_deadline).with(:wait, 5 * 60) + expect(gcp_vm).to receive(:update).with(display_state: "updating") expect(nx).to receive(:when_update_storage_set?).and_yield expect { nx.wait }.to hop("update_storage") end it "hops to update_size" do + expect(nx).to receive(:register_deadline).with(:wait, 5 * 60) + expect(gcp_vm).to receive(:update).with(display_state: "updating") expect(nx).to receive(:when_update_size_set?).and_yield expect { nx.wait }.to hop("update_size") end @@ -267,5 +276,12 @@ expect(gcp_vm).to receive(:sshable).and_return(sshable) expect(nx.host).to eq("1.1.1.1") end + + describe "#failed_provisioning" do + it "updates display state" do + expect(gcp_vm).to receive(:update).with(display_state: "failed") + expect { nx.failed_provisioning }.to hop("wait") + end + end end end diff --git a/spec/prog/lantern/lantern_resource_nexus_spec.rb b/spec/prog/lantern/lantern_resource_nexus_spec.rb index bba129676..e54d3607b 100644 --- a/spec/prog/lantern/lantern_resource_nexus_spec.rb +++ b/spec/prog/lantern/lantern_resource_nexus_spec.rb @@ -168,6 +168,21 @@ expect(Prog::Lantern::LanternServerNexus).to receive(:assemble) expect { nx.wait }.to nap(30) end + + it "updates display_state" do + expect(lantern_resource).to receive(:required_standby_count).and_return(0) + expect(lantern_resource).to receive(:display_state).and_return("failed") + expect(lantern_resource).to receive(:servers).and_return([instance_double(LanternServer, strand: instance_double(Strand, label: "wait"))]).at_least(:once) + expect(lantern_resource).to receive(:update).with(display_state: nil) + expect { nx.wait }.to nap(30) + end + + it "does not updates display_state" do + expect(lantern_resource).to receive(:required_standby_count).and_return(0) + expect(lantern_resource).to receive(:display_state).and_return("failed") + expect(lantern_resource).to receive(:servers).and_return([instance_double(LanternServer, strand: instance_double(Strand, label: "unavailable"))]).at_least(:once) + expect { nx.wait }.to nap(30) + end end describe "#destroy" do @@ -182,4 +197,11 @@ expect { nx.destroy }.to exit({"msg" => "lantern resource is deleted"}) end end + + describe "#failed_provisioning" do + it "updates display state" do + expect(lantern_resource).to receive(:update).with(display_state: "failed") + expect { nx.failed_provisioning }.to hop("wait") + end + end end diff --git a/spec/prog/lantern/lantern_server_nexus_spec.rb b/spec/prog/lantern/lantern_server_nexus_spec.rb index b232c9b04..17ba94589 100644 --- a/spec/prog/lantern/lantern_server_nexus_spec.rb +++ b/spec/prog/lantern/lantern_server_nexus_spec.rb @@ -13,6 +13,8 @@ ubid: "6ae7e513-c34a-8039-a72a-7be45b53f2a0", id: "6ae7e513-c34a-8039-a72a-7be45b53f2a0", domain: nil, + lantern_version: "0.2.5", + extras_version: "0.1.5", resource: instance_double(LanternResource, org_id: 0, name: "test", @@ -329,23 +331,34 @@ describe "#wait_recovery_completion" do it "hop to wait if recovery finished" do - expect(lantern_server).to receive(:run_query).and_return("t", "paused", "") + expect(lantern_server).to receive(:run_query).and_return("t", "paused", "t", lantern_server.lantern_version, lantern_server.extras_version) expect(lantern_server).to receive(:timeline_id=) expect(lantern_server).to receive(:timeline_access=).with("push") expect(lantern_server).to receive(:save_changes) - expect(lantern_server).to receive(:update_walg_creds) expect(Prog::Lantern::LanternTimelineNexus).to receive(:assemble).and_return(instance_double(Strand, id: "104b0033-b3f6-8214-ae27-0cd3cef18ce5")) - expect { nx.wait_recovery_completion }.to hop("wait") + expect { nx.wait_recovery_completion }.to hop("wait_timeline_available") end it "hop to wait if not in recovery" do - expect(lantern_server).to receive(:run_query).and_return("f") + expect(lantern_server).to receive(:run_query).and_return("f", lantern_server.lantern_version, lantern_server.extras_version) expect(lantern_server).to receive(:timeline_id=) expect(lantern_server).to receive(:timeline_access=).with("push") expect(lantern_server).to receive(:save_changes) - expect(lantern_server).to receive(:update_walg_creds) expect(Prog::Lantern::LanternTimelineNexus).to receive(:assemble).and_return(instance_double(Strand, id: "104b0033-b3f6-8214-ae27-0cd3cef18ce5")) - expect { nx.wait_recovery_completion }.to hop("wait") + expect { nx.wait_recovery_completion }.to hop("wait_timeline_available") + end + + it "update extension on version mismatch" do + expect(lantern_server).to receive(:run_query).and_return("t", "paused", "t", "0.2.4", "0.1.4") + expect(lantern_server).to receive(:timeline_id=) + expect(lantern_server).to receive(:timeline_access=).with("push") + expect(lantern_server).to receive(:save_changes) + expect(lantern_server).to receive(:update).with(lantern_version: "0.2.4") + expect(lantern_server).to receive(:update).with(extras_version: "0.1.4") + expect(nx).to receive(:incr_update_lantern_extension) + expect(nx).to receive(:incr_update_extras_extension) + expect(Prog::Lantern::LanternTimelineNexus).to receive(:assemble).and_return(instance_double(Strand, id: "104b0033-b3f6-8214-ae27-0cd3cef18ce5")) + expect { nx.wait_recovery_completion }.to hop("wait_timeline_available") end it "nap 5" do @@ -623,6 +636,21 @@ expect { nx.wait }.to hop("update_rhizome") end + it "hops to update_rhizome if update lantern set" do + nx.incr_update_lantern_extension + expect { nx.wait }.to hop("update_rhizome") + end + + it "hops to update_rhizome if update extras set" do + nx.incr_update_extras_extension + expect { nx.wait }.to hop("update_rhizome") + end + + it "hops to update_rhizome if update image set" do + nx.incr_update_image + expect { nx.wait }.to hop("update_rhizome") + end + it "hops to destroy" do nx.incr_destroy expect { nx.wait }.to hop("destroy") @@ -659,7 +687,7 @@ describe "#destroy" do it "destroys lantern_server and vm" do expect(lantern_server.vm).to receive(:incr_destroy).at_least(:once) - expect(lantern_server.timeline).to receive(:incr_destroy).at_least(:once) + expect(lantern_server).to receive(:primary?).and_return(false) expect(lantern_server).to receive(:domain).and_return(nil) expect(lantern_server).to receive(:destroy) expect { nx.destroy }.to exit({"msg" => "lantern server was deleted"}) @@ -667,6 +695,7 @@ it "destroys lantern_server, vm and domain" do expect(lantern_server.vm).to receive(:incr_destroy).at_least(:once) + expect(lantern_server).to receive(:primary?).and_return(true) expect(lantern_server.timeline).to receive(:incr_destroy).at_least(:once) expect(lantern_server).to receive(:domain).and_return("example.com") expect(nx).to receive(:destroy_domain) @@ -751,14 +780,12 @@ describe "#unavailable" do it "naps if restarting" do - expect(nx).to receive(:register_deadline) expect(nx).to receive(:reap) expect(nx.strand).to receive(:children).and_return([instance_double(Strand, prog: "Lantern::LanternServerNexus", label: "restart")]) expect { nx.unavailable }.to nap(5) end it "hops to wait if available" do - expect(nx).to receive(:register_deadline) expect(nx).to receive(:reap) expect(nx).to receive(:available?).and_return(true) expect(nx).to receive(:decr_checkup) @@ -766,7 +793,6 @@ end it "hops to wait if available and resolves page" do - expect(nx).to receive(:register_deadline) expect(nx).to receive(:reap) expect(nx).to receive(:available?).and_return(true) expect(nx).to receive(:decr_checkup) @@ -777,7 +803,6 @@ end it "buds restart" do - expect(nx).to receive(:register_deadline) expect(nx).to receive(:reap) expect(nx).to receive(:available?).and_return(false) expect(nx).to receive(:bud).with(described_class, {}, :restart) @@ -787,7 +812,6 @@ end it "naps if already alerted" do - expect(nx).to receive(:register_deadline) expect(nx).to receive(:reap) expect(nx).to receive(:available?).and_return(false) page = instance_double(Page) @@ -826,4 +850,18 @@ expect { nx.prewarm_indexes }.to exit({"msg" => "lantern index prewarm failed"}) end end + + describe "#wait_timeline_available" do + it "naps if timeline is not ready" do + expect(lantern_server).to receive(:timeline).and_return(instance_double(LanternTimeline, strand: instance_double(Strand, label: "start"))) + expect { nx.wait_timeline_available }.to nap(10) + end + + it "hops to wait_db_available" do + expect(lantern_server).to receive(:timeline).and_return(instance_double(LanternTimeline, strand: instance_double(Strand, label: "wait_leader"))) + expect(lantern_server).to receive(:update_walg_creds) + expect(nx).to receive(:decr_initial_provisioning) + expect { nx.wait_timeline_available }.to hop("wait_db_available") + end + end end diff --git a/spec/prog/lantern/lantern_timeline_nexus_spec.rb b/spec/prog/lantern/lantern_timeline_nexus_spec.rb index 4bceec3f3..1fca3fd0b 100644 --- a/spec/prog/lantern/lantern_timeline_nexus_spec.rb +++ b/spec/prog/lantern/lantern_timeline_nexus_spec.rb @@ -96,7 +96,7 @@ expect(lantern_server.timeline).to receive(:need_cleanup?).and_return(true) expect(timeline).to receive(:leader).and_return(lantern_server) (Time.new - (24 * 60 * 60 * Config.backup_retention_days)).strftime("%Y-%m-%dT%H:%M:%S.%LZ") - expect(lantern_server.vm.sshable).to receive(:cmd).with(a_string_matching("common/bin/daemonizer 'docker compose -f /var/lib/lantern/docker-compos.yaml exec -T -u root postgresql bash -c")) + expect(lantern_server.vm.sshable).to receive(:cmd).with(a_string_matching("common/bin/daemonizer 'docker compose -f /var/lib/lantern/docker-compose.yaml exec -T -u root postgresql bash -c")) expect(lantern_server.timeline).to receive(:backups).and_return([{last_modified: Time.now - 1 * 24 * 60 * 60}]) expect(lantern_server.timeline).to receive(:leader).and_return(lantern_server) expect { nx.wait }.to nap(20 * 60) @@ -166,22 +166,8 @@ end describe "#destroy" do - it "removes parent from children and exit with message" do - expect(lantern_server.timeline).to receive(:destroy) - child = instance_double(LanternTimeline, parent_id: "test") - children = [child] - api = instance_double(Hosting::GcpApis) - allow(api).to receive(:remove_service_account) - allow(Hosting::GcpApis).to receive(:new).and_return(api) - expect(child).to receive(:parent_id=).with(nil) - expect(child).to receive(:save_changes) - expect(lantern_server.timeline).to receive(:children).and_return(children).twice - expect { nx.destroy }.to exit({"msg" => "lantern timeline is deleted"}) - end - it "exits with message without deleting sa" do expect(lantern_server.timeline).to receive(:destroy) - expect(lantern_server.timeline).to receive(:children).and_return([]) expect(lantern_server.timeline).to receive(:service_account_name).and_return(nil) expect { nx.destroy }.to exit({"msg" => "lantern timeline is deleted"}) end @@ -191,7 +177,6 @@ allow(api).to receive(:remove_service_account) allow(Hosting::GcpApis).to receive(:new).and_return(api) expect(lantern_server.timeline).to receive(:destroy) - expect(lantern_server.timeline).to receive(:children).and_return([]) expect { nx.destroy }.to exit({"msg" => "lantern timeline is deleted"}) end end