diff --git a/model/lantern/lantern_resource.rb b/model/lantern/lantern_resource.rb index 1a246d63f..6f4a8bccb 100644 --- a/model/lantern/lantern_resource.rb +++ b/model/lantern/lantern_resource.rb @@ -311,8 +311,9 @@ def rollback_switchover current_resource.representative_server.stop_container(1) rescue end + current_resource.representative_server.incr_container_stopped - representative_server.start_container + representative_server.incr_take_over # update dns cf_client = Dns::Cloudflare.new diff --git a/model/sshable.rb b/model/sshable.rb index 14d9b394e..cceb0f9dc 100644 --- a/model/sshable.rb +++ b/model/sshable.rb @@ -30,16 +30,17 @@ def keys } end - def cmd(cmd, stdin: nil, log: true) + def cmd(command, stdin: nil, log: true) start = Time.now stdout = StringIO.new stderr = StringIO.new exit_code = nil exit_signal = nil + has_cached_session = !Thread.current[:clover_ssh_cache].nil? && !Thread.current[:clover_ssh_cache][[host, unix_user]].nil? begin connect.open_channel do |ch| - ch.exec(cmd) do |ch, success| + ch.exec(command) do |ch, success| ch.on_data do |ch, data| $stderr.write(data) if REPL stdout.write(data) @@ -64,6 +65,12 @@ def cmd(cmd, stdin: nil, log: true) end.wait rescue invalidate_cache_entry + + if has_cached_session + # if the session was cached previously + # we will retry command as ssh session may be closed + return cmd(command, stdin: stdin, log: log) + end raise end @@ -74,7 +81,7 @@ def cmd(cmd, stdin: nil, log: true) Clog.emit("ssh cmd execution") do finish = Time.now embed = {start: start, finish: finish, duration: finish - start, - cmd: cmd, + cmd: command, exit_code: exit_code, exit_signal: exit_signal} # Suppress large outputs to avoid annoyance in duplication @@ -92,7 +99,7 @@ def cmd(cmd, stdin: nil, log: true) end end - fail SshError.new(cmd, stdout_str, stderr.string.freeze, exit_code, exit_signal) unless exit_code.zero? + fail SshError.new(command, stdout_str, stderr.string.freeze, exit_code, exit_signal) unless exit_code.zero? stdout_str end diff --git a/prog/lantern/lantern_resource_nexus.rb b/prog/lantern/lantern_resource_nexus.rb index e432f86c8..65f763494 100644 --- a/prog/lantern/lantern_resource_nexus.rb +++ b/prog/lantern/lantern_resource_nexus.rb @@ -327,6 +327,7 @@ def before_run label def switch_dns_with_parent lantern_resource.parent.representative_server.stop_container(1) lantern_resource.update(logical_replication: false) + lantern_resource.parent.representative_server.incr_container_stopped if lantern_resource.parent.representative_server.domain.nil? hop_finish_take_over diff --git a/prog/lantern/lantern_server_nexus.rb b/prog/lantern/lantern_server_nexus.rb index 313694955..9d8df48a7 100644 --- a/prog/lantern/lantern_server_nexus.rb +++ b/prog/lantern/lantern_server_nexus.rb @@ -523,11 +523,11 @@ def before_run label def container_stopped decr_container_stopped when_take_over_set? do - vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} up -d") + lantern_server.start_container hop_take_over end - nap 15 + nap 10 end label def promote_server diff --git a/spec/model/lantern/lantern_resource_spec.rb b/spec/model/lantern/lantern_resource_spec.rb index c1bf04ddd..7ecbe6cbf 100644 --- a/spec/model/lantern/lantern_resource_spec.rb +++ b/spec/model/lantern/lantern_resource_spec.rb @@ -357,7 +357,7 @@ expect(current_resource.representative_server).to receive(:stop_container).with(1).and_return(true).at_least(:once) - expect(old_representative_server).to receive(:start_container) + expect(old_representative_server).to receive(:incr_take_over) cf_client = instance_double(Dns::Cloudflare) allow(Dns::Cloudflare).to receive(:new).and_return(cf_client) @@ -368,6 +368,7 @@ expect(old_representative_server).to receive(:update).with(domain: current_resource.representative_server.domain) expect(current_resource.representative_server).to receive(:update).with(domain: nil) + expect(current_resource.representative_server).to receive(:incr_container_stopped) expect(lantern_resource).to receive(:update).with(rollback_target: nil) diff --git a/spec/model/sshable_spec.rb b/spec/model/sshable_spec.rb index 36395d6f4..09b855482 100644 --- a/spec/model/sshable_spec.rb +++ b/spec/model/sshable_spec.rb @@ -157,5 +157,15 @@ def simulate(cmd:, exit_status:, exit_signal:, stdout:, stderr:) expect(sa).to receive(:invalidate_cache_entry) expect { sa.cmd("irrelevant") }.to raise_error err end + + it "invalidates the cache if the session retries and raises on second try" do + err = IOError.new("the party is over") + expect(session).to receive(:open_channel).and_raise(err).at_least(:once) + cache = instance_double(Hash) + expect(Thread.current).to receive(:[]).with(:clover_ssh_cache).and_return(cache).at_least(:once) + expect(cache).to receive(:[]).with(["test.localhost", "testuser"]).and_return(session, nil).at_least(:once) + expect(sa).to receive(:invalidate_cache_entry).at_least(:once) + expect { sa.cmd("irrelevant") }.to raise_error err + end end end diff --git a/spec/prog/lantern/lantern_resource_nexus_spec.rb b/spec/prog/lantern/lantern_resource_nexus_spec.rb index 00b2879fe..4364b89af 100644 --- a/spec/prog/lantern/lantern_resource_nexus_spec.rb +++ b/spec/prog/lantern/lantern_resource_nexus_spec.rb @@ -468,6 +468,7 @@ parent = instance_double(LanternResource, representative_server: instance_double(LanternServer, domain: nil)) expect(lantern_resource).to receive(:parent).and_return(parent).at_least(:once) expect(lantern_resource.parent.representative_server).to receive(:stop_container) + expect(lantern_resource.parent.representative_server).to receive(:incr_container_stopped) expect { nx.switch_dns_with_parent }.to hop("finish_take_over") end @@ -475,6 +476,7 @@ parent = instance_double(LanternResource, representative_server: instance_double(LanternServer, domain: "test-domain")) expect(lantern_resource).to receive(:parent).and_return(parent).at_least(:once) expect(lantern_resource.parent.representative_server).to receive(:stop_container) + expect(lantern_resource.parent.representative_server).to receive(:incr_container_stopped) expect(lantern_resource.representative_server).to receive(:swap_dns).with(parent.representative_server) expect(lantern_resource).to receive(:update).with(logical_replication: false) expect { nx.switch_dns_with_parent }.to hop("wait_switch_dns") diff --git a/spec/prog/lantern/lantern_server_nexus_spec.rb b/spec/prog/lantern/lantern_server_nexus_spec.rb index 0487788d1..fdcc3b818 100644 --- a/spec/prog/lantern/lantern_server_nexus_spec.rb +++ b/spec/prog/lantern/lantern_server_nexus_spec.rb @@ -1051,12 +1051,12 @@ describe "#container_stopped" do it "hops to take_over" do nx.incr_take_over - expect(lantern_server.vm.sshable).to receive(:cmd) + expect(lantern_server).to receive(:start_container) expect { nx.container_stopped }.to hop("take_over") end - it "naps 15" do - expect { nx.container_stopped }.to nap(15) + it "naps 10" do + expect { nx.container_stopped }.to nap(10) end end