Skip to content

Commit

Permalink
check database inconsistencies before switchover, disable bigquery lo…
Browse files Browse the repository at this point in the history
…gs for small instances
  • Loading branch information
var77 committed Nov 22, 2024
1 parent dea32fc commit 9ec830c
Show file tree
Hide file tree
Showing 8 changed files with 210 additions and 25 deletions.
45 changes: 45 additions & 0 deletions model/lantern/lantern_resource.rb
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,51 @@ module HaType
SYNC = "sync"
end

# Validates that this replica may take over from its parent and then signals
# the switchover via +incr_switchover_with_parent+.
#
# Unless forced, the representative servers of the parent and of this replica
# are compared on three points: database names, role names (as returned by
# +list_all_roles+) and the row count of pg_largeobject_metadata. Anything the
# parent has that the replica lacks is collected into one combined report and
# the switchover is refused.
#
# When forced, the comparison is skipped and "force_switchover" is recorded on
# the first frame of the strand stack.
#
# @param force [Boolean, String, Integer] skip the consistency checks. Only
#   +true+ or the case-insensitive values "true", "1", "yes", "on" force the
#   switchover; anything else (including the string "false") does not.
# @raise [RuntimeError] if there is no parent, logical replication is off, or
#   (when not forced) the replica is missing data present on the parent
def prepare_switchover(force = false)
  source = parent
  if source.nil? || !logical_replication
    fail "Database does not have parent or is not in logical replication state"
  end

  # Request parameters arrive as strings and every non-nil String is truthy in
  # Ruby, so "false" would otherwise silently bypass the safety checks below.
  forced = %w[true 1 yes on].include?(force.to_s.strip.downcase)

  if forced
    current_frame = strand.stack.first
    current_frame["force_switchover"] = true
    strand.modified!(:stack)
    strand.save_changes
  else
    replica_server = representative_server
    parent_server = source.representative_server
    problems = []

    replica_dbs = replica_server.list_all_databases
    parent_dbs = parent_server.list_all_databases
    db_diff = parent_dbs - replica_dbs
    if db_diff.any?
      problems << "The following databases were not synced to replica: #{db_diff.join(",")}\n"
    end

    replica_roles = replica_server.list_all_roles
    parent_roles = parent_server.list_all_roles
    roles_diff = parent_roles - replica_roles
    if roles_diff.any?
      problems << "The following roles were not synced to replica: #{roles_diff.join(",")}\n"
    end

    lo_count_replica = replica_server.run_query("SELECT COUNT(*) FROM pg_largeobject_metadata")
    lo_count_parent = parent_server.run_query("SELECT COUNT(*) FROM pg_largeobject_metadata")
    lo_diff = lo_count_parent.to_i - lo_count_replica.to_i
    if lo_diff > 0
      problems << "Parent database has #{lo_diff} more large objects than replica\n"
    end

    unless problems.empty?
      fail "Inconsistencies found between parent and replica databases.\nPlease synchronize databases manually or create new replica or pass force=true if you are sure you want to switchover\n#{problems.join}"
    end
  end

  incr_switchover_with_parent
end

def rollback_switchover
current_resource = LanternResource[rollback_target]
# stop current one and start old one
Expand Down
23 changes: 22 additions & 1 deletion model/lantern/lantern_server.rb
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,13 @@ def configure_hash
postgresql_recovery_target_lsn = ""
end

big_query_dataset = ""

if vm.cores > 1
# enable big query logs only if this is not the smallest instance
big_query_dataset = Config.lantern_log_dataset
end

JSON.generate({
enable_coredumps: true,
skip_deps: vm.boot_image != Config.gcp_default_image,
Expand Down Expand Up @@ -153,7 +160,7 @@ def configure_hash
gcp_creds_walg_b64: walg_config[:gcp_creds_b64],
walg_gs_prefix: walg_config[:walg_gs_prefix],
gcp_creds_big_query_b64: resource.gcp_creds_b64,
big_query_dataset: Config.lantern_log_dataset,
big_query_dataset: big_query_dataset,
pg_version: resource.pg_version
})
end
Expand Down Expand Up @@ -249,6 +256,20 @@ def list_all_databases
.map { _1.strip }
end

# Lists role names from pg_roles by running psql inside the "postgresql"
# compose service over the VM's SSH connection.
#
# @param login [Boolean] when true (the default) only roles with
#   rolcanlogin=TRUE are returned; when false no filter is applied
# @return [Array<String>] one whitespace-stripped role name per output line
def list_all_roles(login = true)
  condition = login ? "WHERE rolcanlogin=TRUE" : ""
  raw_output = vm.sshable.cmd("sudo docker compose -f #{Config.compose_file} exec postgresql psql -U postgres -t -c 'SELECT rolname FROM pg_roles #{condition}'")

  # Trim the padding around the whole output first, then around every line.
  trimmed = raw_output.chomp.strip
  trimmed.split("\n").map(&:strip)
end

def autoresize_disk
new_storage_size = (target_storage_size_gib * 1.5).clamp(..max_storage_autoresize_gib)
return if new_storage_size < target_storage_size_gib
Expand Down
7 changes: 6 additions & 1 deletion prog/lantern/lantern_resource_nexus.rb
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,11 @@ def before_run

lantern_resource.mark_switchover_finish

current_frame = strand.stack.first
current_frame.delete("force_switchover")
strand.modified!(:stack)
strand.save_changes

hop_wait
end

Expand Down Expand Up @@ -348,7 +353,7 @@ def before_run
end

# Holds the switchover until the parent's logical replication slot for this
# resource reports zero lag, then hops to delete_logical_subscription.
#
# When the current frame carries "force_switchover" the lag is not consulted
# at all. The earlier unconditional lag check has been dropped: it ran before
# the force-aware one and made a forced switchover nap on lag anyway.
label def wait_for_synchronization
  unless frame["force_switchover"]
    lag = lantern_resource.parent.get_logical_replication_lag("slot_#{lantern_resource.ubid}")
    nap 5 if lag != 0
  end

  hop_delete_logical_subscription
end

Expand Down
10 changes: 4 additions & 6 deletions routes/api/project/location/lantern.rb
Original file line number Diff line number Diff line change
Expand Up @@ -179,14 +179,12 @@ class CloverApi

# POST .../switchover
#
# Authorizes the caller for "Postgres:edit" and delegates all validation to
# pg.prepare_switchover, which checks the parent / logical-replication state,
# runs the consistency checks (unless forced) and signals the switchover.
#
# The inline pre-check and the direct pg.incr_switchover_with_parent call were
# removed: they signalled the switchover before prepare_switchover had a
# chance to run its checks, and then signalled it a second time.
#
# Only failures of prepare_switchover are reported as 422 with the error
# message; an exception raised by Authorization.authorize is left to propagate
# instead of being rewritten as a 422.
r.post "switchover" do
  Authorization.authorize(@current_user.id, "Postgres:edit", pg.id)

  begin
    pg.prepare_switchover(r.params["force"])
  rescue => e
    response.status = 422
    return {"error" => e.message}
  end

  response.status = 200
  r.halt
end

r.post "rollback-switchover" do
Expand Down
97 changes: 97 additions & 0 deletions spec/model/lantern/lantern_resource_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -406,4 +406,101 @@
expect { lantern_resource.mark_switchover_finish }.not_to raise_error
end
end

# Specs for LanternResource#prepare_switchover: the parent / logical-replication
# guard, the forced path, each consistency check, and the happy path.
describe "#prepare_switchover" do
# Guard: with no parent the method raises before anything else is consulted.
it "fails if no parent" do
expect(lantern_resource).to receive(:parent).and_return(nil)
expect { lantern_resource.prepare_switchover }.to raise_error "Database does not have parent or is not in logical replication state"
end

# Guard: a parent exists but logical replication is off.
it "fails if not in logical replication" do
parent = instance_double(described_class)
expect(lantern_resource).to receive(:parent).and_return(parent)
expect(lantern_resource).to receive(:logical_replication).and_return(false)
expect { lantern_resource.prepare_switchover }.to raise_error "Database does not have parent or is not in logical replication state"
end

# Forced path: no server is queried; the flag is written to the first stack
# frame, the strand is saved and the switchover is signalled.
it "success if force" do
parent = instance_double(described_class)
expect(lantern_resource).to receive(:parent).and_return(parent)
expect(lantern_resource).to receive(:logical_replication).and_return(true)
expect(lantern_resource).to receive(:incr_switchover_with_parent)
frame = instance_double(Hash)
strand = instance_double(Strand, stack: [frame])
expect(frame).to receive(:[]=).with("force_switchover", true)
expect(strand).to receive(:modified!)
expect(strand).to receive(:save_changes)
expect(lantern_resource).to receive(:strand).and_return(strand).at_least(:once)
expect { lantern_resource.prepare_switchover(true) }.not_to raise_error
end

# Only the database lists differ; roles and large-object counts match.
it "fails if db list differs" do
representative_server = instance_double(LanternServer)
parent_representative_server = instance_double(LanternServer)
parent = instance_double(described_class, representative_server: parent_representative_server)
parent_databases = ["db1", "db2", "db3"]
replica_databases = ["db1"]

allow(lantern_resource).to receive_messages(representative_server: representative_server, parent: parent)
expect(lantern_resource).to receive(:logical_replication).and_return(true)

allow(parent_representative_server).to receive_messages(list_all_databases: parent_databases, list_all_roles: [], run_query: "0")
allow(representative_server).to receive_messages(list_all_databases: replica_databases, list_all_roles: [], run_query: "0")

err = "The following databases were not synced to replica: db2,db3\n"
expect { lantern_resource.prepare_switchover }.to raise_error "Inconsistencies found between parent and replica databases.\nPlease synchronize databases manually or create new replica or pass force=true if you are sure you want to switchover\n#{err}"
end

# Only the role lists differ; databases and large-object counts match.
it "fails if role list differs" do
representative_server = instance_double(LanternServer)
parent_representative_server = instance_double(LanternServer)
parent = instance_double(described_class, representative_server: parent_representative_server)
parent_databases = ["db1", "db2", "db3"]
replica_databases = ["db1", "db2", "db3"]

allow(lantern_resource).to receive_messages(representative_server: representative_server, parent: parent)
expect(lantern_resource).to receive(:logical_replication).and_return(true)

allow(parent_representative_server).to receive_messages(list_all_databases: parent_databases, list_all_roles: ["postgres", "role2"], run_query: "0")
allow(representative_server).to receive_messages(list_all_databases: replica_databases, list_all_roles: ["postgres"], run_query: "0")

err = "The following roles were not synced to replica: role2\n"
expect { lantern_resource.prepare_switchover }.to raise_error "Inconsistencies found between parent and replica databases.\nPlease synchronize databases manually or create new replica or pass force=true if you are sure you want to switchover\n#{err}"
end

# All three checks fail here (databases, roles and large objects), so this
# example also pins the order in which the findings are reported.
it "fails if large object count differs" do
representative_server = instance_double(LanternServer)
parent_representative_server = instance_double(LanternServer)
parent = instance_double(described_class, representative_server: parent_representative_server)
parent_databases = ["db1", "db2", "db3"]
replica_databases = ["db1"]

allow(lantern_resource).to receive_messages(representative_server: representative_server, parent: parent)
expect(lantern_resource).to receive(:logical_replication).and_return(true)

allow(parent_representative_server).to receive_messages(list_all_databases: parent_databases, list_all_roles: ["postgres", "role2"], run_query: "5")
allow(representative_server).to receive_messages(list_all_databases: replica_databases, list_all_roles: ["postgres"], run_query: "0")

err = "The following databases were not synced to replica: db2,db3\nThe following roles were not synced to replica: role2\nParent database has 5 more large objects than replica\n"
expect { lantern_resource.prepare_switchover }.to raise_error "Inconsistencies found between parent and replica databases.\nPlease synchronize databases manually or create new replica or pass force=true if you are sure you want to switchover\n#{err}"
end

# Happy path: parent and replica agree on every check.
it "success if all conditions pass" do
representative_server = instance_double(LanternServer)
parent_representative_server = instance_double(LanternServer)
parent = instance_double(described_class, representative_server: parent_representative_server)
parent_databases = ["db1", "db2", "db3"]
replica_databases = ["db1", "db2", "db3"]

allow(lantern_resource).to receive_messages(representative_server: representative_server, parent: parent)
expect(lantern_resource).to receive(:logical_replication).and_return(true)

allow(parent_representative_server).to receive_messages(list_all_databases: parent_databases, list_all_roles: ["postgres"], run_query: "5")
allow(representative_server).to receive_messages(list_all_databases: replica_databases, list_all_roles: ["postgres"], run_query: "5")

allow(lantern_resource).to receive(:incr_switchover_with_parent)

expect { lantern_resource.prepare_switchover }.not_to raise_error
end
end
end
18 changes: 17 additions & 1 deletion spec/model/lantern/lantern_server_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@
expect(lantern_server).to receive(:extras_version).and_return("0.1.4").at_least(:once)
expect(lantern_server).to receive(:minor_version).and_return("1").at_least(:once)
expect(vm).to receive(:boot_image).and_return(Config.gcp_default_image).at_least(:once)
expect(vm).to receive(:cores).and_return(2)

walg_conf = timeline.generate_walg_config
expected_conf = JSON.generate({
Expand Down Expand Up @@ -354,6 +355,7 @@
expect(lantern_server).to receive(:extras_version).and_return("0.1.4").at_least(:once)
expect(lantern_server).to receive(:minor_version).and_return("1").at_least(:once)
expect(vm).to receive(:boot_image).and_return("custom-image").at_least(:once)
expect(vm).to receive(:cores).and_return(2)

walg_conf = timeline.generate_walg_config
expected_conf = JSON.generate({
Expand Down Expand Up @@ -421,6 +423,7 @@
expect(lantern_server).to receive(:extras_version).and_return("0.1.4").at_least(:once)
expect(lantern_server).to receive(:minor_version).and_return("1").at_least(:once)
expect(vm).to receive(:boot_image).and_return("custom-image").at_least(:once)
expect(vm).to receive(:cores).and_return(1)

walg_conf = timeline.generate_walg_config
expected_conf = JSON.generate({
Expand Down Expand Up @@ -451,7 +454,7 @@
gcp_creds_walg_b64: walg_conf[:gcp_creds_b64],
walg_gs_prefix: walg_conf[:walg_gs_prefix],
gcp_creds_big_query_b64: resource.gcp_creds_b64,
big_query_dataset: Config.lantern_log_dataset,
big_query_dataset: "",
pg_version: 17
})
expect(lantern_server.configure_hash).to eq(expected_conf)
Expand Down Expand Up @@ -489,6 +492,7 @@
expect(lantern_server).to receive(:extras_version).and_return("0.1.4").at_least(:once)
expect(lantern_server).to receive(:minor_version).and_return("1").at_least(:once)
expect(vm).to receive(:boot_image).and_return("custom-image").at_least(:once)
expect(vm).to receive(:cores).and_return(2)

walg_conf = timeline.generate_walg_config
expected_conf = JSON.generate({
Expand Down Expand Up @@ -696,6 +700,18 @@
end
end

# Specs for LanternServer#list_all_roles: the exact psql command sent over SSH
# and the parsing of its newline-separated output.
describe "#list_all_roles" do
  it "returns list of all roles which can login" do
    login_only_command = "sudo docker compose -f /var/lib/lantern/docker-compose.yaml exec postgresql psql -U postgres -t -c 'SELECT rolname FROM pg_roles WHERE rolcanlogin=TRUE'"
    psql_output = "postgres\nrole2\n"

    expect(lantern_server.vm.sshable).to receive(:cmd).with(login_only_command).and_return(psql_output)
    expect(lantern_server.list_all_roles).to eq(["postgres", "role2"])
  end

  it "returns list of all roles" do
    # With login=false the WHERE clause is empty, leaving a trailing space.
    unfiltered_command = "sudo docker compose -f /var/lib/lantern/docker-compose.yaml exec postgresql psql -U postgres -t -c 'SELECT rolname FROM pg_roles '"
    psql_output = "postgres\nrole2\nrole3\n"

    expect(lantern_server.vm.sshable).to receive(:cmd).with(unfiltered_command).and_return(psql_output)
    expect(lantern_server.list_all_roles(false)).to eq(["postgres", "role2", "role3"])
  end
end

describe "#get_vm_image" do
it "returns default image" do
allow(described_class).to receive(:get_vm_image).and_call_original
Expand Down
14 changes: 14 additions & 0 deletions spec/prog/lantern/lantern_resource_nexus_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,13 @@
expect(lantern_resource).to receive(:mark_switchover_finish)
expect(timeline).to receive(:update).with(parent_id: nil)

frame = instance_double(Hash)
strand = instance_double(Strand, stack: [frame])
expect(frame).to receive(:delete).with("force_switchover")
expect(strand).to receive(:modified!)
expect(strand).to receive(:save_changes)
expect(nx).to receive(:strand).and_return(strand).at_least(:once)

expect { nx.finish_take_over }.to hop("wait")
end
end
Expand All @@ -471,12 +478,19 @@
parent = instance_double(LanternResource)
expect(lantern_resource).to receive(:parent).and_return(parent)
expect(parent).to receive(:get_logical_replication_lag).with("slot_#{lantern_resource.ubid}").and_return(5)
expect(nx).to receive(:frame).and_return({"force_switchover" => false})

expect { nx.wait_for_synchronization }.to nap(5)
end

# With the force flag on the frame the step hops straight on; no parent or
# replication-lag expectation is set up in this example.
it "hops to delete_logical_subscription if force_switchover" do
  forced_frame = {"force_switchover" => true}
  expect(nx).to receive(:frame).and_return(forced_frame)

  expect { nx.wait_for_synchronization }.to hop("delete_logical_subscription")
end

it "hops to delete_logical_subscription" do
parent = instance_double(LanternResource)
expect(nx).to receive(:frame).and_return({"force_switchover" => false})
expect(lantern_resource).to receive(:parent).and_return(parent)
expect(parent).to receive(:get_logical_replication_lag).with("slot_#{lantern_resource.ubid}").and_return(0)

Expand Down
21 changes: 5 additions & 16 deletions spec/routes/api/project/location/lantern_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -373,31 +373,20 @@
query_res = class_double(LanternResource, first: pg)
allow(query_res).to receive(:where).and_return(query_res)
expect(project).to receive(:lantern_resources_dataset).and_return(query_res)
err = "Database does not have parent or is not in logical replication state"
expect(pg).to receive(:prepare_switchover).and_raise err

post "/api/project/#{project.ubid}/location/#{pg.location}/lantern/instance-1/switchover"
expect(last_response.status).to eq(400)
end

it "fails because not in logical replication mode" do
expect(Project).to receive(:from_ubid).and_return(project).at_least(:once)
query_res = class_double(LanternResource, first: pg)
expect(pg).to receive(:parent).and_return(instance_double(LanternResource))
expect(pg).to receive(:logical_replication).and_return(false)
allow(query_res).to receive(:where).and_return(query_res)
expect(project).to receive(:lantern_resources_dataset).and_return(query_res)

post "/api/project/#{project.ubid}/location/#{pg.location}/lantern/instance-1/switchover"
expect(last_response.status).to eq(400)
expect(last_response.status).to eq(422)
expect(JSON.parse(last_response.body)["error"]).to eq(err)
end

it "performs switchover" do
expect(Project).to receive(:from_ubid).and_return(project).at_least(:once)
query_res = class_double(LanternResource, first: pg)
expect(pg).to receive(:parent).and_return(instance_double(LanternResource))
expect(pg).to receive(:logical_replication).and_return(true)
allow(query_res).to receive(:where).and_return(query_res)
expect(project).to receive(:lantern_resources_dataset).and_return(query_res)
expect(pg).to receive(:incr_switchover_with_parent)
expect(pg).to receive(:prepare_switchover)

post "/api/project/#{project.ubid}/location/#{pg.location}/lantern/instance-1/switchover"
expect(last_response.status).to eq(200)
Expand Down

0 comments on commit 9ec830c

Please sign in to comment.