Skip to content

Commit

Permalink
add alert if latest wal record is missing in bucket
Browse files Browse the repository at this point in the history
  • Loading branch information
var77 committed Jun 20, 2024
1 parent bf19191 commit 3f80908
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 0 deletions.
7 changes: 7 additions & 0 deletions model/lantern/lantern_timeline.rb
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,13 @@ def backups
.list_objects(Config.lantern_backup_bucket, "#{ubid}/basebackups_005/*_backup_stop_sentinel.json")
end

# Reports whether the WAL segment named by the latest checkpoint's
# redo_wal_file has been uploaded to the backup bucket.
#
# Returns true (nothing to alert on) when the check cannot be made or would be
# premature:
# * the timeline has no leader to query, or
# * the query returns no row, because its WHERE clause filters out checkpoints
#   taken within the last two minutes, whose segment may not be synced yet.
# Otherwise returns whether "<ubid>/wal_005/<segment>.lz4" is listed in the
# backup bucket.
def last_checkpoint_file_exists?
  server = leader
  # A timeline may have no leader; without one there is nothing to verify, so
  # skip the check instead of raising NoMethodError on nil.
  return true if server.nil?

  redo_wal_file = server.run_query("SELECT redo_wal_file FROM pg_control_checkpoint() WHERE checkpoint_time < NOW() - interval '2 minute'").strip
  # Empty result: the checkpoint is too recent and its file may still be syncing into storage.
  return true if redo_wal_file.empty?

  !blob_storage_client.list_objects(Config.lantern_backup_bucket, "#{ubid}/wal_005/#{redo_wal_file}.lz4").empty?
end

def backups_with_metadata
storage_client = blob_storage_client
mutex = Mutex.new
Expand Down
6 changes: 6 additions & 0 deletions prog/lantern/lantern_timeline_nexus.rb
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ def before_run
Page.from_tag_parts("MissingBackup", lantern_timeline.id)&.incr_resolve
end

if !lantern_timeline.last_checkpoint_file_exists?
Prog::PageNexus.assemble("Missing WAL file at #{lantern_timeline}!", [lantern_timeline.ubid], "MissingWALFile", lantern_timeline.id)
else
Page.from_tag_parts("MissingWALFile", lantern_timeline.id)&.incr_resolve
end

nap 20 * 60
end

Expand Down
31 changes: 31 additions & 0 deletions spec/model/lantern/lantern_timeline_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -249,4 +249,35 @@
expect(lantern_timeline.need_cleanup?).to be(true)
end
end

describe "#last_checkpoint_file_exists?" do
  # WAL segment name the stubbed checkpoint query reports, and the object
  # path the model is expected to look up for it.
  let(:checkpoint_wal) { "000000010000000000000072" }
  let(:checkpoint_wal_path) { "pvr1mcnhzd8p0qwwa00tr5cvex/wal_005/000000010000000000000072.lz4" }

  # Expects the timeline to ask its leader for the checkpoint and makes the
  # query return +result+.
  def expect_checkpoint_query(result)
    server = instance_double(LanternServer)
    expect(lantern_timeline).to receive(:leader).and_return(server).at_least(:once)
    expect(server).to receive(:run_query).and_return(result)
  end

  # Expects a bucket listing for the checkpoint WAL path that yields +objects+.
  def expect_wal_listing(objects)
    storage = instance_double(Hosting::GcpApis)
    expect(storage).to receive(:list_objects).with(Config.lantern_backup_bucket, checkpoint_wal_path).and_return(objects)
    expect(lantern_timeline).to receive(:blob_storage_client).and_return(storage)
  end

  it "returns empty from query" do
    expect_checkpoint_query("")

    expect(lantern_timeline.last_checkpoint_file_exists?).to be(true)
  end

  it "returns no backups" do
    expect_checkpoint_query(checkpoint_wal)
    expect_wal_listing([])

    expect(lantern_timeline.last_checkpoint_file_exists?).to be(false)
  end

  it "returns backup file" do
    expect_checkpoint_query(checkpoint_wal)
    expect_wal_listing([{}])

    expect(lantern_timeline.last_checkpoint_file_exists?).to be(true)
  end
end
end
28 changes: 28 additions & 0 deletions spec/prog/lantern/lantern_timeline_nexus_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@
expect(lantern_server.vm.sshable).to receive(:cmd).with(a_string_matching("common/bin/daemonizer 'docker compose -f /var/lib/lantern/docker-compose.yaml exec -T -u root postgresql bash -c"))
expect(lantern_server.timeline).to receive(:backups).and_return([{last_modified: Time.now - 1 * 24 * 60 * 60}])
expect(lantern_server.timeline).to receive(:leader).and_return(lantern_server)
expect(lantern_server.timeline).to receive(:last_checkpoint_file_exists?).and_return(true)
expect { nx.wait }.to nap(20 * 60)
end

Expand All @@ -99,6 +100,7 @@
expect(lantern_server.timeline).to receive(:backups).and_return([{last_modified: Time.now - 3 * 24 * 60 * 60}])
expect(lantern_server.timeline).to receive(:leader).and_return(lantern_server)
expect(lantern_server.timeline).to receive(:ubid).and_return(lantern_server.timeline.id)
expect(lantern_server.timeline).to receive(:last_checkpoint_file_exists?).and_return(true)
expect { nx.wait }.to nap(20 * 60)
expect(Page.first).not_to be_nil
end
Expand All @@ -108,6 +110,7 @@
expect(lantern_server.timeline).to receive(:need_cleanup?).and_return(false)
expect(lantern_server.timeline).to receive(:backups).and_return([{last_modified: Time.now - 1 * 24 * 60 * 60}])
expect(lantern_server.timeline).to receive(:leader).and_return(lantern_server)
expect(lantern_server.timeline).to receive(:last_checkpoint_file_exists?).and_return(true)
expect { nx.wait }.to nap(20 * 60)
expect(Page.first).to be_nil
end
Expand All @@ -118,12 +121,37 @@
expect(lantern_server.timeline).to receive(:backups).and_return([])
expect(lantern_server.timeline).to receive(:leader).and_return(nil)
expect(lantern_server.timeline).to receive(:created_at).and_return(Time.now - 1 * 24 * 60 * 60)
expect(lantern_server.timeline).to receive(:last_checkpoint_file_exists?).and_return(true)
page = instance_double(Page)
expect(Page).to receive(:from_tag_parts).and_return(page)
expect(Page).to receive(:from_tag_parts).and_return(nil)
expect(page).to receive(:incr_resolve)
expect { nx.wait }.to nap(20 * 60)
expect(Page.first).to be_nil
end

it "creates alert for missing wal file" do
  timeline = lantern_server.timeline
  # Every timeline message is expected once; only the WAL check reports a failure.
  expect(timeline).to receive_messages(
    need_backup?: false,
    need_cleanup?: false,
    backups: [{last_modified: Time.now - 1 * 24 * 60 * 60}],
    leader: lantern_server,
    last_checkpoint_file_exists?: false
  )

  expect { nx.wait }.to nap(20 * 60)

  expect(Page.from_tag_parts("MissingWALFile", timeline.id)).not_to be_nil
end

it "resolves alert for missing wal file" do
  timeline = lantern_server.timeline
  expect(timeline).to receive_messages(
    need_backup?: false,
    need_cleanup?: false,
    backups: [],
    leader: nil,
    created_at: Time.now - 1 * 24 * 60 * 60,
    last_checkpoint_file_exists?: true
  )

  open_page = instance_double(Page)
  # The first page lookup yields nil; the second yields the page that must be resolved.
  expect(Page).to receive(:from_tag_parts).and_return(nil, open_page)
  expect(open_page).to receive(:incr_resolve)

  expect { nx.wait }.to nap(20 * 60)
end
end

describe "#take_backup" do
Expand Down

0 comments on commit 3f80908

Please sign in to comment.