Skip to content

Commit

Permalink
Fix lantern server display status based on vm status, fix deadlines, …
Browse files Browse the repository at this point in the history
…don't check pulse if not running
  • Loading branch information
var77 committed May 2, 2024
1 parent 4c212d9 commit b312eef
Show file tree
Hide file tree
Showing 16 changed files with 255 additions and 83 deletions.
6 changes: 3 additions & 3 deletions lib/option.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,16 @@ def self.lantern_locations_for_provider(provider)
# Describes one VM size option: name, family, vcpu, memory and storage_size_gib.
VmSize = Struct.new(*%i[name family vcpu memory storage_size_gib]) do
  # display_name reads the same member as name.
  alias_method(:display_name, :name)
end
VmSizes = [2, 4, 8, 16, 32, 64].map {
# Frozen list of VM size options, one per vCPU count; memory is vcpu * 4.
# Storage multiplies before halving: (vcpu / 2) * 25 performs the integer
# division first, which yields 0 GiB for the 1-vCPU entry. Even vCPU counts
# keep their previous values (25, 50, 100, ...); 1 vCPU floors to 12.
VmSizes = [1, 2, 4, 8, 16, 32, 64].map { |vcpu|
  VmSize.new("n1-standard-#{vcpu}", "n1-standard", vcpu, vcpu * 4, (vcpu * 25) / 2)
}.freeze

# Describes one Lantern size option: name, vm_size, family, vcpu, memory and
# storage_size_gib.
LanternSize = Struct.new(*%i[name vm_size family vcpu memory storage_size_gib]) do
  # display_name reads the same member as name.
  alias_method(:display_name, :name)
end

LanternSizes = [2, 4, 8, 16, 32, 64].map {
LanternSize.new("n1-standard-#{_1}", "n1-standard-#{_1}", "n1-standard", _1, _1 * 4, (_1 / 2) * 128)
# Frozen list of Lantern size options, one per vCPU count. name and vm_size
# carry the same "n1-standard-N" text; memory is vcpu * 4 and storage is
# vcpu * 64.
LanternSizes = [1, 2, 4, 8, 16, 32, 64].map do |vcpu|
  LanternSize.new("n1-standard-#{vcpu}", "n1-standard-#{vcpu}", "n1-standard", vcpu, vcpu * 4, vcpu * 64)
end.freeze

# Struct describing one LanternHaOption choice: name, standby_count, title
# and explanation.
LanternHaOption = Struct.new(:name, :standby_count, :title, :explanation)
Expand Down
9 changes: 9 additions & 0 deletions migrate/20240502_lantern_resource_display_state.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# frozen_string_literal: true

# Adds a `display_state` text column to the lantern_resource table.
Sequel.migration do
# `change` declares the migration in its forward direction only.
change do
alter_table(:lantern_resource) do
# null: true, so the column may be left unset.
add_column :display_state, :text, null: true
end
end
end
5 changes: 0 additions & 5 deletions model/gcp_vm.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,6 @@ def host
sshable&.host
end

def display_state
return "deleting" if destroy_set?
super
end

def mem_gib_ratio
return 3.2 if arch == "arm64"
8
Expand Down
2 changes: 1 addition & 1 deletion model/lantern/lantern_resource.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def path
end

def display_state
representative_server&.display_state || "unavailable"
super || representative_server&.display_state || "unavailable"
end

def connection_string
Expand Down
10 changes: 10 additions & 0 deletions model/lantern/lantern_server.rb
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ def display_state
return "domain setup" if strand.label.include?("domain")
return "ssl setup" if strand.label.include?("setup_ssl")
return "updating" if strand.label.include?("update")
return "updating" if strand.label.include?("init_sql")
return "stopped" if vm.display_state.include?("stopped")
return "stopping" if vm.display_state.include?("stopping")
return "starting" if vm.display_state.include?("starting")
return "failed" if vm.display_state.include?("failed")
return "unavailable" if strand.label.include?("wait_db_available")
return "running" if ["wait"].include?(strand.label)
return "deleting" if destroy_set? || strand.label == "destroy"
Expand Down Expand Up @@ -147,6 +152,11 @@ def init_health_monitor_session
end

def check_pulse(session:, previous_pulse:)
if display_state != "running"
# if there's an operation ongoing, do not check the pulse
return previous_pulse
end

reading = begin
session[:db_connection] ||= Sequel.connect(connection_string)
lsn_function = primary? ? "pg_current_wal_lsn()" : "pg_last_wal_receive_lsn()"
Expand Down
33 changes: 23 additions & 10 deletions prog/gcp_vm/nexus.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true

require "forwardable"
require "netaddr"
require "json"
require "shellwords"
Expand All @@ -10,6 +11,10 @@

class Prog::GcpVm::Nexus < Prog::Base
subject_is :gcp_vm

extend Forwardable
def_delegators :gcp_vm

semaphore :destroy, :start_vm, :stop_vm, :update_storage, :update_size

def self.assemble(public_key, project_id, name: nil, size: "n1-standard-2",
Expand Down Expand Up @@ -85,10 +90,14 @@ def host
end

# Registers a deadline naming :failed_provisioning with a value of 10 * 60
# (presumably seconds - confirm against register_deadline), then hops to the
# create_vm label.
label def start
register_deadline(:failed_provisioning, 10 * 60)
hop_create_vm
end

label def create_vm
gcp_client = Hosting::GcpApis.new
labels = frame["labels"]
gcp_client.create_vm(gcp_vm.name, "#{gcp_vm.location}-a", gcp_vm.boot_image, gcp_vm.public_key, gcp_vm.unix_user, "#{gcp_vm.family}-#{gcp_vm.cores}", gcp_vm.storage_size_gib, labels: labels)
register_deadline(:wait, 10 * 60)

# remove labels from stack
current_frame = strand.stack.first
Expand All @@ -99,6 +108,11 @@ def host
hop_wait_create_vm
end

# Sets the VM's display_state to "failed", then hops to the wait label.
# NOTE(review): appears to be the target named by the :failed_provisioning
# deadline registered in `start` - confirm how register_deadline dispatches.
label def failed_provisioning
gcp_vm.update(display_state: "failed")
hop_wait
end

label def wait_sshable
addr = gcp_vm.sshable.host

Expand All @@ -123,23 +137,30 @@ def host
label def wait
when_stop_vm_set? do
register_deadline(:wait, 5 * 60)
gcp_vm.update(display_state: "stopping")
hop_stop_vm
end

when_start_vm_set? do
register_deadline(:wait, 5 * 60)
gcp_vm.update(display_state: "starting")
hop_start_vm
end

when_destroy_set? do
gcp_vm.update(display_state: "deleting")
hop_destroy
end

when_update_size_set? do
register_deadline(:wait, 5 * 60)
gcp_vm.update(display_state: "updating")
hop_update_size
end

when_update_storage_set? do
register_deadline(:wait, 5 * 60)
gcp_vm.update(display_state: "updating")
hop_update_storage
end

Expand All @@ -157,8 +178,6 @@ def host
end

label def stop_vm
gcp_vm.update(display_state: "stopping")

gcp_client = Hosting::GcpApis.new
gcp_client.stop_vm(gcp_vm.name, "#{gcp_vm.location}-a")

Expand All @@ -168,13 +187,9 @@ def host
end

label def start_vm
gcp_vm.update(display_state: "starting")

gcp_client = Hosting::GcpApis.new
gcp_client.start_vm(gcp_vm.name, "#{gcp_vm.location}-a")

gcp_vm.update(display_state: "running")

decr_start_vm

hop_wait_sshable
Expand All @@ -185,7 +200,6 @@ def host
hop_stop_vm
end
decr_update_storage
register_deadline(:wait, 5 * 60)
gcp_vm.update(display_state: "updating")
gcp_client = Hosting::GcpApis.new
zone = "#{gcp_vm.location}-a"
Expand All @@ -205,12 +219,12 @@ def host
hop_stop_vm
end
decr_update_size
register_deadline(:wait, 5 * 60)
gcp_vm.update(display_state: "updating")
gcp_client = Hosting::GcpApis.new
gcp_client.update_vm_type(gcp_vm.name, "#{gcp_vm.location}-a", gcp_vm.display_size)

when_update_storage_set? do
register_deadline(:wait, 5 * 60)
hop_update_storage
end

Expand All @@ -219,7 +233,6 @@ def host

label def destroy
DB.transaction do
gcp_vm.update(display_state: "deleting")
gcp_client = Hosting::GcpApis.new
gcp_client.delete_vm(gcp_vm.name, "#{gcp_vm.location}-a")
if gcp_vm.has_static_ipv4
Expand Down
11 changes: 10 additions & 1 deletion prog/lantern/lantern_resource_nexus.rb
Original file line number Diff line number Diff line change
Expand Up @@ -112,13 +112,18 @@ def before_run

label def start
nap 5 unless representative_server.vm.strand.label == "wait"
register_deadline(:wait, 10 * 60)
register_deadline(:failed_provisioning, 10 * 60)
# bud self.class, frame, :trigger_pg_current_xact_id_on_parent if lantern_resource.parent

# hop_wait_trigger_pg_current_xact_id_on_parent
hop_wait_servers
end

# Sets the resource's display_state to "failed", then hops to the wait label.
# NOTE(review): appears to be the target named by the :failed_provisioning
# deadline registered in `start` - confirm how register_deadline dispatches.
label def failed_provisioning
lantern_resource.update(display_state: "failed")
hop_wait
end

# TODO: check why this is needed
# label def trigger_pg_current_xact_id_on_parent
# lantern_resource.parent.representative_server.run_query("SELECT pg_current_xact_id()")
Expand All @@ -142,6 +147,10 @@ def before_run
Prog::Lantern::LanternServerNexus.assemble(resource_id: lantern_resource.id, timeline_id: lantern_resource.timeline.id, timeline_access: "fetch")
end

if lantern_resource.display_state == "failed" && servers.any? { _1.strand.label == "wait" }
lantern_resource.update(display_state: nil)
end

nap 30
end

Expand Down
65 changes: 52 additions & 13 deletions prog/lantern/lantern_server_nexus.rb
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,10 @@ def before_run

label def wait_bootstrap_rhizome
reap
hop_setup_docker_stack if leaf?
if leaf?
register_deadline(:wait, 10 * 60)
hop_setup_docker_stack
end
donate
end

Expand All @@ -120,8 +123,6 @@ def before_run
raise "GCP_CREDS_GCR_B64 is required to setup docker stack for Lantern"
end

register_deadline(:wait, 10 * 60)

case vm.sshable.cmd("common/bin/daemonizer --check configure_lantern")
when "Succeeded"
vm.sshable.cmd("common/bin/daemonizer --clean configure_lantern")
Expand All @@ -137,8 +138,6 @@ def before_run
end

label def init_sql
register_deadline(:wait, 40 * 60)

case vm.sshable.cmd("common/bin/daemonizer --check init_sql")
when "Succeeded"
vm.sshable.cmd("common/bin/daemonizer --clean init_sql")
Expand Down Expand Up @@ -190,21 +189,42 @@ def before_run
lantern_server.timeline_access = "push"
lantern_server.save_changes

lantern_server.update_walg_creds
lantern_version = lantern_server.run_query("SELECT extversion FROM pg_extension WHERE extname='lantern'")
extras_version = lantern_server.run_query("SELECT extversion FROM pg_extension WHERE extname='lantern_extras'")

hop_wait
if lantern_version != lantern_server.lantern_version
incr_update_lantern_extension
lantern_server.update(lantern_version: lantern_version)
end

if extras_version != lantern_server.extras_version
incr_update_extras_extension
lantern_server.update(extras_version: extras_version)
end

hop_wait_timeline_available
end

nap 5
end

# Calls nap 10 unless the timeline strand's label is "wait_leader". Otherwise
# calls update_walg_creds on the server, decrements initial_provisioning and
# hops to wait_db_available.
label def wait_timeline_available
  timeline_label = lantern_server.timeline.strand.label
  nap 10 unless timeline_label == "wait_leader"

  lantern_server.update_walg_creds
  decr_initial_provisioning
  hop_wait_db_available
end

label def wait_db_available
nap 10 if !available?

when_initial_provisioning_set? do
decr_initial_provisioning

hop_init_sql if lantern_server.primary?
if lantern_server.primary?
register_deadline(:wait, 40 * 60)
hop_init_sql
end
hop_wait_catch_up if lantern_server.standby?
hop_wait_recovery_completion
end
Expand All @@ -222,6 +242,7 @@ def before_run
when "Succeeded"
vm.sshable.cmd("common/bin/daemonizer --clean update_lantern")
decr_update_lantern_extension
register_deadline(:wait, 40 * 60)
hop_init_sql
when "NotStarted"
vm.sshable.cmd("common/bin/daemonizer 'sudo lantern/bin/update_lantern' update_lantern", stdin: JSON.generate({version: lantern_server.lantern_version}))
Expand All @@ -241,6 +262,7 @@ def before_run
when "Succeeded"
vm.sshable.cmd("common/bin/daemonizer --clean update_extras")
decr_update_extras_extension
register_deadline(:wait, 40 * 60)
hop_init_sql
when "NotStarted"
vm.sshable.cmd("common/bin/daemonizer 'sudo lantern/bin/update_extras' update_extras", stdin: JSON.generate({version: lantern_server.extras_version}))
Expand Down Expand Up @@ -290,6 +312,7 @@ def before_run
end

decr_add_domain
register_deadline(:wait, 5 * 60)
hop_setup_ssl
end

Expand All @@ -299,7 +322,6 @@ def destroy_domain
end

label def setup_ssl
register_deadline(:wait, 5 * 60)
case vm.sshable.cmd("common/bin/daemonizer --check setup_ssl")
when "Succeeded"
vm.sshable.cmd("common/bin/daemonizer --clean setup_ssl")
Expand Down Expand Up @@ -351,8 +373,11 @@ def destroy_domain
reap

when_checkup_set? do
hop_unavailable if !available?
decr_checkup
if !available?
register_deadline(:wait, 5 * 60)
hop_unavailable
end
end

when_update_user_password_set? do
Expand All @@ -379,6 +404,20 @@ def destroy_domain
hop_add_domain
end

# We will always update rhizome before updating extensions
# In case something is changed in rhizome scripts
when_update_lantern_extension_set? do
hop_update_rhizome
end

when_update_extras_extension_set? do
hop_update_rhizome
end

when_update_image_set? do
hop_update_rhizome
end

when_update_rhizome_set? do
hop_update_rhizome
end
Expand All @@ -395,8 +434,6 @@ def destroy_domain
end

label def unavailable
register_deadline(:wait, 5 * 60)

# TODO
# if postgres_server.primary? && (standby = postgres_server.failover_target)
# standby.incr_take_over
Expand Down Expand Up @@ -442,7 +479,9 @@ def destroy_domain
destroy_domain
end

lantern_server.timeline.incr_destroy
if lantern_server.primary?
lantern_server.timeline.incr_destroy
end
lantern_server.destroy

vm.incr_destroy
Expand Down
Loading

0 comments on commit b312eef

Please sign in to comment.