From fbe2e54ae9d1df1a2bff1f4efc45b04c7bf6f514 Mon Sep 17 00:00:00 2001
From: Varik Matevosyan
Date: Wed, 19 Jun 2024 19:11:07 +0400
Subject: [PATCH] add alert if latest wal record is missing in bucket

---
 model/lantern/lantern_timeline.rb             | 12 +++++++
 prog/lantern/lantern_timeline_nexus.rb        |  6 ++++
 spec/model/lantern/lantern_timeline_spec.rb   | 36 +++++++++++++++++++
 .../lantern/lantern_timeline_nexus_spec.rb    | 28 +++++++++++++++
 4 files changed, 82 insertions(+)

diff --git a/model/lantern/lantern_timeline.rb b/model/lantern/lantern_timeline.rb
index 7bca80c33..6f81a2005 100644
--- a/model/lantern/lantern_timeline.rb
+++ b/model/lantern/lantern_timeline.rb
@@ -82,6 +82,18 @@ def backups
       .list_objects(Config.lantern_backup_bucket, "#{ubid}/basebackups_005/*_backup_stop_sentinel.json")
   end
 
+  # Reports whether the WAL file named by the leader's last checkpoint is
+  # present in the backup bucket. Returns true when there is nothing to
+  # verify: the timeline has no leader, or the query returned no file name.
+  def last_checkpoint_file_exists?
+    return true unless leader # no server to query, so nothing can be missing
+
+    redo_wal_file = leader.run_query("SELECT redo_wal_file FROM pg_control_checkpoint() WHERE checkpoint_time < NOW() - interval '2 minute'").strip
+    return true if redo_wal_file.empty? # checkpoint is under 2 minutes old; its file may still be syncing into storage
+
+    !blob_storage_client.list_objects(Config.lantern_backup_bucket, "#{ubid}/wal_005/#{redo_wal_file}.lz4").empty?
+  end
+
   def backups_with_metadata
     storage_client = blob_storage_client
     mutex = Mutex.new
diff --git a/prog/lantern/lantern_timeline_nexus.rb b/prog/lantern/lantern_timeline_nexus.rb
index b94a1dd10..d1d529ac3 100644
--- a/prog/lantern/lantern_timeline_nexus.rb
+++ b/prog/lantern/lantern_timeline_nexus.rb
@@ -64,6 +64,12 @@ def before_run
       Page.from_tag_parts("MissingBackup", lantern_timeline.id)&.incr_resolve
     end
 
+    if !lantern_timeline.last_checkpoint_file_exists?
+      Prog::PageNexus.assemble("Missing WAL file at #{lantern_timeline}!", [lantern_timeline.ubid], "MissingWALFile", lantern_timeline.id)
+    else
+      Page.from_tag_parts("MissingWALFile", lantern_timeline.id)&.incr_resolve
+    end
+
     nap 20 * 60
   end
 
diff --git a/spec/model/lantern/lantern_timeline_spec.rb b/spec/model/lantern/lantern_timeline_spec.rb
index 37fa6ea5d..0e24865db 100644
--- a/spec/model/lantern/lantern_timeline_spec.rb
+++ b/spec/model/lantern/lantern_timeline_spec.rb
@@ -249,4 +249,40 @@
       expect(lantern_timeline.need_cleanup?).to be(true)
     end
   end
+
+  describe "#last_checkpoint_file_exists?" do
+    it "returns true when timeline has no leader" do
+      expect(lantern_timeline).to receive(:leader).and_return(nil)
+      expect(lantern_timeline.last_checkpoint_file_exists?).to be(true)
+    end
+
+    it "returns empty from query" do
+      leader = instance_double(LanternServer)
+      expect(lantern_timeline).to receive(:leader).and_return(leader).at_least(:once)
+      expect(leader).to receive(:run_query).and_return("")
+      expect(lantern_timeline.last_checkpoint_file_exists?).to be(true)
+    end
+
+    it "returns no backups" do
+      leader = instance_double(LanternServer)
+      expect(lantern_timeline).to receive(:leader).and_return(leader).at_least(:once)
+      expect(leader).to receive(:run_query).and_return("000000010000000000000072")
+      gcp_api = instance_double(Hosting::GcpApis)
+      expect(gcp_api).to receive(:list_objects).with(Config.lantern_backup_bucket, "pvr1mcnhzd8p0qwwa00tr5cvex/wal_005/000000010000000000000072.lz4").and_return([])
+      expect(lantern_timeline).to receive(:blob_storage_client).and_return(gcp_api)
+
+      expect(lantern_timeline.last_checkpoint_file_exists?).to be(false)
+    end
+
+    it "returns backup file" do
+      leader = instance_double(LanternServer)
+      expect(lantern_timeline).to receive(:leader).and_return(leader).at_least(:once)
+      expect(leader).to receive(:run_query).and_return("000000010000000000000072")
+      gcp_api = instance_double(Hosting::GcpApis)
+      expect(gcp_api).to receive(:list_objects).with(Config.lantern_backup_bucket, "pvr1mcnhzd8p0qwwa00tr5cvex/wal_005/000000010000000000000072.lz4").and_return([{}])
+      expect(lantern_timeline).to receive(:blob_storage_client).and_return(gcp_api)
+
+      expect(lantern_timeline.last_checkpoint_file_exists?).to be(true)
+    end
+  end
 end
diff --git a/spec/prog/lantern/lantern_timeline_nexus_spec.rb b/spec/prog/lantern/lantern_timeline_nexus_spec.rb
index 8990b1b94..bd2866733 100644
--- a/spec/prog/lantern/lantern_timeline_nexus_spec.rb
+++ b/spec/prog/lantern/lantern_timeline_nexus_spec.rb
@@ -90,6 +90,7 @@
       expect(lantern_server.vm.sshable).to receive(:cmd).with(a_string_matching("common/bin/daemonizer 'docker compose -f /var/lib/lantern/docker-compose.yaml exec -T -u root postgresql bash -c"))
       expect(lantern_server.timeline).to receive(:backups).and_return([{last_modified: Time.now - 1 * 24 * 60 * 60}])
       expect(lantern_server.timeline).to receive(:leader).and_return(lantern_server)
+      expect(lantern_server.timeline).to receive(:last_checkpoint_file_exists?).and_return(true)
       expect { nx.wait }.to nap(20 * 60)
     end
 
@@ -99,6 +100,7 @@
       expect(lantern_server.timeline).to receive(:backups).and_return([{last_modified: Time.now - 3 * 24 * 60 * 60}])
       expect(lantern_server.timeline).to receive(:leader).and_return(lantern_server)
       expect(lantern_server.timeline).to receive(:ubid).and_return(lantern_server.timeline.id)
+      expect(lantern_server.timeline).to receive(:last_checkpoint_file_exists?).and_return(true)
       expect { nx.wait }.to nap(20 * 60)
       expect(Page.first).not_to be_nil
     end
@@ -108,6 +110,7 @@
       expect(lantern_server.timeline).to receive(:need_cleanup?).and_return(false)
       expect(lantern_server.timeline).to receive(:backups).and_return([{last_modified: Time.now - 1 * 24 * 60 * 60}])
       expect(lantern_server.timeline).to receive(:leader).and_return(lantern_server)
+      expect(lantern_server.timeline).to receive(:last_checkpoint_file_exists?).and_return(true)
       expect { nx.wait }.to nap(20 * 60)
       expect(Page.first).to be_nil
     end
@@ -118,12 +121,37 @@
       expect(lantern_server.timeline).to receive(:backups).and_return([])
       expect(lantern_server.timeline).to receive(:leader).and_return(nil)
       expect(lantern_server.timeline).to receive(:created_at).and_return(Time.now - 1 * 24 * 60 * 60)
+      expect(lantern_server.timeline).to receive(:last_checkpoint_file_exists?).and_return(true)
       page = instance_double(Page)
       expect(Page).to receive(:from_tag_parts).and_return(page)
+      expect(Page).to receive(:from_tag_parts).and_return(nil)
       expect(page).to receive(:incr_resolve)
       expect { nx.wait }.to nap(20 * 60)
       expect(Page.first).to be_nil
     end
+
+    it "creates alert for missing wal file" do
+      expect(lantern_server.timeline).to receive(:need_backup?).and_return(false)
+      expect(lantern_server.timeline).to receive(:need_cleanup?).and_return(false)
+      expect(lantern_server.timeline).to receive(:backups).and_return([{last_modified: Time.now - 1 * 24 * 60 * 60}])
+      expect(lantern_server.timeline).to receive(:leader).and_return(lantern_server)
+      expect(lantern_server.timeline).to receive(:last_checkpoint_file_exists?).and_return(false)
+      expect { nx.wait }.to nap(20 * 60)
+      expect(Page.from_tag_parts("MissingWALFile", lantern_server.timeline.id)).not_to be_nil
+    end
+
+    it "resolves alert for missing wal file" do
+      expect(lantern_server.timeline).to receive(:need_backup?).and_return(false)
+      expect(lantern_server.timeline).to receive(:need_cleanup?).and_return(false)
+      expect(lantern_server.timeline).to receive(:backups).and_return([])
+      expect(lantern_server.timeline).to receive(:leader).and_return(nil)
+      expect(lantern_server.timeline).to receive(:created_at).and_return(Time.now - 1 * 24 * 60 * 60)
+      expect(lantern_server.timeline).to receive(:last_checkpoint_file_exists?).and_return(true)
+      page = instance_double(Page)
+      expect(Page).to receive(:from_tag_parts).and_return(nil, page)
+      expect(page).to receive(:incr_resolve)
+      expect { nx.wait }.to nap(20 * 60)
+    end
   end
 
   describe "#take_backup" do