Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Storage improvements #79

Merged
merged 2 commits into from
Jul 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Procfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
web: bundle exec puma -t 5:5 -p ${PORT:-3000} -e ${RACK_ENV:-development}
respirate: bin/respirate
monitor: bin/monitor
12 changes: 11 additions & 1 deletion lib/hosting/gcp_apis.rb
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,22 @@ def create_vm(name, zone, image, ssh_key, user, machine_type, disk_size_gb, labe
boot: true,
deviceName: "#{name}-boot",
initializeParams: {
diskSizeGb: disk_size_gb,
diskSizeGb: 25,
diskType: "projects/#{@project}/zones/#{zone}/diskTypes/pd-ssd",
sourceImage: image
},
mode: "READ_WRITE",
type: "PERSISTENT"
},
{
autoDelete: true,
deviceName: "#{name}-data",
initializeParams: {
diskSizeGb: disk_size_gb,
diskType: "projects/#{@project}/zones/#{zone}/diskTypes/pd-ssd"
},
mode: "READ_WRITE",
type: "PERSISTENT"
}
],
displayDevice: {
Expand Down
10 changes: 10 additions & 0 deletions migrate/20240728_lantern_server_storage_resize.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# frozen_string_literal: true

# Adds the per-server storage autoresize ceiling column and changes the schedule
# of one lantern_doctor_query row.
Sequel.migration do
change do
alter_table(:lantern_server) do
# Integer column with a default of 0.
add_column :max_storage_autoresize_gib, Integer, default: 0
end
# Sets the schedule of the single lantern_doctor_query row with this id to '*/2 * * * *'.
# NOTE(review): raw SQL issued through `run` inside a `change` block is presumably not
# auto-reversible by Sequel, and the previous schedule value is not recorded here —
# confirm rollback expectations.
run "UPDATE lantern_doctor_query SET schedule='*/2 * * * *' WHERE id='09b1b1d1-7095-89b7-8ae4-158e15e11871'"
end
end
2 changes: 1 addition & 1 deletion model/gcp_vm.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class GcpVm < Sequel::Model
include ResourceMethods
include SemaphoreMethods
include DisplayStatusMethods
semaphore :destroy, :start_vm, :stop_vm, :update_storage, :update_size
semaphore :destroy, :start_vm, :stop_vm, :update_storage, :update_size, :resize_data_disk

include Authorization::HyperTagMethods

Expand Down
15 changes: 14 additions & 1 deletion model/lantern/lantern_doctor_page.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,26 @@ class LanternDoctorPage < Sequel::Model

def self.create_incident(query, db_name, vm_name, err: "", output: "")
pg = Prog::PageNexus.assemble_with_logs("Healthcheck: #{query.name} failed on #{query.doctor.resource.name} - #{query.doctor.resource.label} (#{db_name} - #{vm_name})", [query.ubid, query.doctor.ubid], {"stderr" => err, "stdout" => output}, query.severity, "LanternDoctorQueryFailed", query.id, db_name, vm_name)
LanternDoctorPage.create_with_id(
doctor_page = LanternDoctorPage.create_with_id(
query_id: query.id,
page_id: pg.id,
status: "new",
db: db_name,
vm_name: vm_name
)
doctor_page.post_incident_action
doctor_page
end

# Follow-up hook run after an incident page has been created.
#
# Only the "Lantern Server Disk Usage" query triggers an action: every server of the
# resource whose autoresize ceiling is above its current target size gets
# `autoresize_disk` called, provided the page's stdout log contains a
# "/dev/... usage ...%" line.
def post_incident_action
  return unless query.name == "Lantern Server Disk Usage"

  query.doctor.resource.servers.each do |server|
    # Size check first: the page logs are only read for servers that may still grow.
    next unless server.max_storage_autoresize_gib > server.target_storage_size_gib
    next unless page.details["logs"]&.[]("stdout") =~ /\/dev\/.*usage.*%/

    server.autoresize_disk
  end
end

def path
Expand Down
8 changes: 8 additions & 0 deletions model/lantern/lantern_server.rb
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,14 @@ def list_all_databases
.map { _1.strip }
end

# Grows the server's storage target by 50%, capped at max_storage_autoresize_gib,
# mirrors the new size onto the VM record and signals a storage size update.
#
# Returns without doing anything when the capped size is not strictly larger than
# the current target (already at the cap, cap below the current size, or the
# default cap of 0).
def autoresize_disk
  current_size = target_storage_size_gib

  # Sizes are whole GiB: `* 1.5` alone produces a Float (25 -> 37.5), so round down
  # to an Integer before capping instead of handing a fractional size downstream.
  new_storage_size = (current_size * 1.5).floor.clamp(..max_storage_autoresize_gib)

  # `<=` rather than `<`: an unchanged size would otherwise still trigger a storage update.
  return if new_storage_size <= current_size

  update(target_storage_size_gib: new_storage_size)
  vm.update(storage_size_gib: new_storage_size)
  incr_update_storage_size
end

# def failover_target
# nil
# end
Expand Down
40 changes: 32 additions & 8 deletions prog/gcp_vm/nexus.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class Prog::GcpVm::Nexus < Prog::Base
extend Forwardable
def_delegators :gcp_vm

semaphore :destroy, :start_vm, :stop_vm, :update_storage, :update_size
semaphore :destroy, :start_vm, :stop_vm, :update_storage, :update_size, :resize_data_disk

def self.assemble(public_key, project_id, name: nil, size: "n1-standard-2",
unix_user: "lantern", location: "us-central1", boot_image: Config.gcp_default_image,
Expand Down Expand Up @@ -163,6 +163,10 @@ def host
hop_update_storage
end

when_resize_data_disk_set? do
hop_resize_data_disk
end

nap 30
end

Expand Down Expand Up @@ -192,22 +196,42 @@ def host
hop_wait_sshable
end

# Prog label that runs `resize2fs` on the VM's /dev/sdb over SSH and then hops to wait.
label def resize_data_disk
# Presumably clears the resize_data_disk semaphore so this label is not re-entered — confirm.
decr_resize_data_disk
# NOTE(review): the device path is hard-coded; it assumes the data disk is attached as /dev/sdb.
gcp_vm.sshable.cmd("sudo resize2fs /dev/sdb")
hop_wait
end

label def update_storage
if !gcp_vm.is_stopped?
hop_stop_vm
end
decr_update_storage
gcp_client = Hosting::GcpApis.new
zone = "#{gcp_vm.location}-a"
vm = gcp_client.get_vm(gcp_vm.name, zone)
disk_source = vm["disks"][0]["source"]
gcp_client.resize_vm_disk(zone, disk_source, gcp_vm.storage_size_gib)
boot_disk = vm["disks"][0]
data_disk = vm["disks"].find { !_1["boot"] }

if data_disk.nil? && !gcp_vm.is_stopped?
hop_stop_vm
end

decr_update_storage

disk = data_disk || boot_disk
gcp_client.resize_vm_disk(zone, disk["source"], gcp_vm.storage_size_gib)

if data_disk
incr_resize_data_disk
end

when_update_size_set? do
hop_update_size
end

hop_start_vm
if gcp_vm.is_stopped?
hop_start_vm
end

gcp_vm.update(display_state: "running")
hop_wait
end

label def update_size
Expand Down
3 changes: 2 additions & 1 deletion prog/lantern/lantern_resource_nexus.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class Prog::Lantern::LanternResourceNexus < Prog::Base
def self.assemble(project_id:, location:, name:, target_vm_size:, target_storage_size_gib:, ubid: LanternResource.generate_ubid, ha_type: LanternResource::HaType::NONE, parent_id: nil, restore_target: nil, recovery_target_lsn: nil,
org_id: nil, db_name: "postgres", db_user: "postgres", db_user_password: nil, superuser_password: nil, repl_password: nil, app_env: Config.rack_env,
lantern_version: Config.lantern_default_version, extras_version: Config.lantern_extras_default_version, minor_version: Config.lantern_minor_default_version, domain: nil, enable_debug: false,
label: "", version_upgrade: false, logical_replication: false)
label: "", version_upgrade: false, logical_replication: false, max_storage_autoresize_gib: 0)
unless (project = Project[project_id])
fail "No existing project"
end
Expand Down Expand Up @@ -98,6 +98,7 @@ def self.assemble(project_id:, location:, name:, target_vm_size:, target_storage
target_storage_size_gib: target_storage_size_gib,
timeline_id: timeline_id,
timeline_access: timeline_access,
max_storage_autoresize_gib: max_storage_autoresize_gib,
representative_at: Time.now
)

Expand Down
6 changes: 4 additions & 2 deletions prog/lantern/lantern_server_nexus.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ class Prog::Lantern::LanternServerNexus < Prog::Base

def self.assemble(
resource_id: nil, lantern_version: "0.2.2", extras_version: "0.1.4", minor_version: "1", domain: nil,
timeline_access: "push", representative_at: nil, target_vm_size: nil, target_storage_size_gib: 50, timeline_id: nil
timeline_access: "push", representative_at: nil, target_vm_size: nil, target_storage_size_gib: 50, timeline_id: nil,
max_storage_autoresize_gib: 0
)

DB.transaction do
Expand Down Expand Up @@ -48,7 +49,8 @@ def self.assemble(
timeline_access: timeline_access,
timeline_id: timeline_id,
representative_at: representative_at,
synchronization_status: representative_at ? "ready" : "catching_up"
synchronization_status: representative_at ? "ready" : "catching_up",
max_storage_autoresize_gib: max_storage_autoresize_gib
)

stack_frame = domain.nil? ? {} : {domain: domain}
Expand Down
15 changes: 14 additions & 1 deletion rhizome/lantern/bin/configure
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,33 @@ end
# Appends the memory-related kernel settings (overcommit ratio, overcommit mode,
# swappiness) to /etc/sysctl.d/99-lantern.conf, one `echo >>` per setting, and then
# loads that file with `sysctl -p`.
def setup_memory_limits
  sysctl_conf = "/etc/sysctl.d/99-lantern.conf"
  settings = %w[vm.overcommit_ratio=100 vm.overcommit_memory=2 vm.swappiness=10]

  settings.each do |setting|
    r "sudo sh -c 'echo #{setting} >> #{sysctl_conf}'"
  end

  r "sudo sysctl -p #{sysctl_conf}"
end

# Prepares the filesystem layout: creates the data and work directories, optionally
# sets the kernel core pattern, adds a 1G swap file, and formats and mounts /dev/sdb
# at the data directory.
def setup_fs
# Setup FS
r "sudo mkdir -p #{$datadir}"
r "sudo chown -R 1001:1001 #{$datadir}"
# Directory for configuration
r "sudo mkdir -p #{$workdir}"

# Setup core pattern
if $configure_hash["enable_coredumps"]
r "sudo su -c 'echo \"/bitnami/postgresql/data/core.%e.%p.%t\" > /proc/sys/kernel/core_pattern'"
end

# Add swap
# NOTE(review): no swap entry is written to /etc/fstab here, unlike the data volume
# below — confirm whether swap is expected to persist across reboots.
r "sudo fallocate -l 1G /swapfile"
r "sudo chmod 600 /swapfile"
r "sudo mkswap /swapfile"
r "sudo swapon /swapfile"

# Mount data volume
# NOTE(review): mkfs.ext4 runs unconditionally; presumably this script only runs once
# on a freshly created VM — confirm, since re-running would reformat /dev/sdb.
r "sudo mkfs.ext4 /dev/sdb"
r "sudo mount /dev/sdb #{$datadir}"
# Appended to /etc/fstab on every invocation.
r "sudo su -c \"echo '/dev/sdb #{$datadir} ext4 defaults 0 0' >> /etc/fstab\""

# Ownership is applied again now that the volume is mounted at the data directory.
r "sudo chown -R 1001:1001 #{$datadir}"
end

def setup_env
Expand Down
40 changes: 34 additions & 6 deletions rhizome/lantern/bin/doctor/run_query
Original file line number Diff line number Diff line change
Expand Up @@ -94,13 +94,41 @@ SQL
failed ? "t" : "f"
end

def self.check_disk_space_usage(_db, _query_user)
# Computes the usage percentage of a device from the `df` row whose first column
# equals device_name. Returns 0 when `df` reports no such device.
#
# A reserve of 5% of the total (capped at 5 * 1024 * 1024 df units) is excluded from
# the usable space. When subtract_tmp is truthy, the size `du -s` reports for
# $datadir/data/base/pgsql_tmp/ is not counted as used space.
def self.usage_percent(device_name, subtract_tmp)
  size = r("df | awk '$1 == \"#{device_name}\" {print $2}'").chomp.strip.to_i
  return 0 if size.zero? # the device does not exist

  reserved = (size * 0.05).clamp(..5 * 1024 * 1024)
  usable = size - reserved
  occupied = r("df | awk '$1 == \"#{device_name}\" {print $3}'").chomp.strip.to_i

  tmp_size = 0
  if subtract_tmp
    tmp_size = r("(du -s #{$datadir}/data/base/pgsql_tmp/ 2>/dev/null || echo \"0\" )| awk '{print $1}'").chomp.strip.to_i
  end

  free = usable - (occupied - tmp_size)
  100 - (free.to_f / usable * 100).to_i
end

# Reports every monitored device ("/dev/root" and "/dev/sdb") whose usage, as
# computed by usage_percent, is above 90%.
#
# Returns one "<server_type> server (<device>) - usage <n>%" line per device over the
# threshold, joined by newlines, or an empty string when none is. When /dev/root is
# over the threshold, remove_dangling_images(db, query_user) is also invoked.
#
# Each device is checked exactly once; a separate legacy `df` check of /dev/root
# would report a full root disk twice.
def self.check_disk_space_usage(db, query_user)
  server_type = $configure_hash["server_type"]
  lines = []

  ["/dev/root", "/dev/sdb"].each do |device|
    # Only the data volume has the pgsql_tmp size subtracted from its used space.
    percent = usage_percent(device, device == "/dev/sdb")
    next unless percent > 90

    lines << "#{server_type} server (#{device}) - usage #{percent}%"
    remove_dangling_images(db, query_user) if device == "/dev/root"
  end

  lines.join("\n")
end

Expand Down Expand Up @@ -148,7 +176,7 @@ SQL
end

# Deletes regular files named 'ldb-index*' under /var/lib/lantern-data/data/ that were
# last modified more than 240 minutes ago.
#
# Returns an empty string on success; when the find command fails, returns the contents
# of /tmp/ldb-index-cleanup-logs (both with surrounding whitespace stripped).
#
# The cleanup is issued exactly once, and the shell's empty echo argument uses single
# quotes so it does not terminate the surrounding Ruby string literal.
def self.remove_dangling_index_files(_db, _query_user)
  r("sudo find /var/lib/lantern-data/data/ -name 'ldb-index*' -type f -mmin +240 -delete >/tmp/ldb-index-cleanup-logs 2>&1 && echo '' || cat /tmp/ldb-index-cleanup-logs").chomp.strip
end
end

Expand Down
1 change: 1 addition & 0 deletions routes/api/project/lantern.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class CloverApi
org_id: r.params["org_id"].to_i,
target_vm_size: parsed_size.vm_size,
target_storage_size_gib: r.params["storage_size_gib"] || parsed_size.storage_size_gib,
max_storage_autoresize_gib: r.params["max_storage_autoresize_gib"].to_i,
lantern_version: r.params["lantern_version"],
extras_version: r.params["extras_version"],
minor_version: r.params["minor_version"],
Expand Down
1 change: 1 addition & 0 deletions routes/web/project/lantern.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class CloverWeb
org_id: r.params["org_id"],
target_vm_size: parsed_size.vm_size,
target_storage_size_gib: parsed_size.storage_size_gib,
max_storage_autoresize_gib: r.params["max_storage_autoresize_gib"].to_i,
lantern_version: r.params["lantern_version"],
extras_version: r.params["extras_version"],
minor_version: r.params["minor_version"],
Expand Down
1 change: 1 addition & 0 deletions serializers/api/lantern.rb
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def self.base(pg)
minor_version: _1.minor_version,
vm_size: _1.target_vm_size,
storage_size_gib: _1.target_storage_size_gib,
max_storage_autoresize_gib: _1.max_storage_autoresize_gib,
connection_string: _1.connection_string
}
}
Expand Down
1 change: 1 addition & 0 deletions serializers/web/lantern.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def self.base(pg)
minor_version: _1.minor_version,
vm_size: _1.target_vm_size,
storage_size_gib: _1.target_storage_size_gib,
max_storage_autoresize_gib: _1.max_storage_autoresize_gib,
connection_string: _1.connection_string
}
}
Expand Down
55 changes: 55 additions & 0 deletions spec/model/lantern/lantern_doctor_page_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,65 @@
expect(Prog::PageNexus).to receive(:assemble_with_logs).with("Healthcheck: #{query.name} failed on #{query.doctor.resource.name} - #{query.doctor.resource.label} (#{db_name} - #{vm_name})", [query.ubid, query.doctor.ubid], {"stderr" => err, "stdout" => output}, query.severity, "LanternDoctorQueryFailed", query.id, db_name, vm_name).and_return(instance_double(Page, id: "test-pg-id"))
doctor_page = instance_double(described_class)
expect(described_class).to receive(:create_with_id).with(query_id: query.id, page_id: "test-pg-id", status: "new", db: db_name, vm_name: vm_name).and_return(doctor_page)
expect(doctor_page).to receive(:post_incident_action)
expect(described_class.create_incident(query, db_name, vm_name, err: err, output: output)).to be(doctor_page)
end
end

# Covers the disk autoresize trigger in #post_incident_action: it fires only for the
# "Lantern Server Disk Usage" query, when the server's autoresize ceiling is above its
# current target size and the page's stdout log contains a "/dev/... usage ...%" line.
describe "#post_incident_action" do
# Ceiling (100) above target (50) and matching stdout: the server is resized.
it "auto resizes disk" do
server = instance_double(LanternServer, max_storage_autoresize_gib: 100, target_storage_size_gib: 50)
expect(server).to receive(:autoresize_disk)
resource = instance_double(LanternResource, servers: [server])
doctor = instance_double(LanternDoctor, resource: resource)
page = instance_double(Page, details: {"logs" => {"stdout" => "/dev/sdb disk usage is 95%"}})
query = instance_double(LanternDoctorQuery, ubid: "test", id: "test-id", severity: "error", name: "Lantern Server Disk Usage", doctor: doctor)
expect(lantern_doctor_page).to receive(:query).and_return(query).at_least(:once)
expect(lantern_doctor_page).to receive(:page).and_return(page)
expect { lantern_doctor_page.post_incident_action }.not_to raise_error
end

# Page details carry no "logs" key at all: nothing to match, so no resize.
it "does not resizes disk if no logs" do
server = instance_double(LanternServer, max_storage_autoresize_gib: 100, target_storage_size_gib: 50)
resource = instance_double(LanternResource, servers: [server])
doctor = instance_double(LanternDoctor, resource: resource)
page = instance_double(Page, details: {})
query = instance_double(LanternDoctorQuery, ubid: "test", id: "test-id", severity: "error", name: "Lantern Server Disk Usage", doctor: doctor)
expect(lantern_doctor_page).to receive(:query).and_return(query).at_least(:once)
expect(lantern_doctor_page).to receive(:page).and_return(page)
expect(server).not_to receive(:autoresize_disk)
expect { lantern_doctor_page.post_incident_action }.not_to raise_error
end

# stdout is present but does not contain a device usage line: no resize.
it "does not resizes disk if failed" do
server = instance_double(LanternServer, max_storage_autoresize_gib: 100, target_storage_size_gib: 50)
resource = instance_double(LanternResource, servers: [server])
doctor = instance_double(LanternDoctor, resource: resource)
page = instance_double(Page, details: {"logs" => {"stdout" => "error", "stderr" => "err"}})
query = instance_double(LanternDoctorQuery, ubid: "test", id: "test-id", severity: "error", name: "Lantern Server Disk Usage", doctor: doctor)
expect(lantern_doctor_page).to receive(:query).and_return(query).at_least(:once)
expect(lantern_doctor_page).to receive(:page).and_return(page)
expect(server).not_to receive(:autoresize_disk)
expect { lantern_doctor_page.post_incident_action }.not_to raise_error
end

# Target (500) already exceeds the ceiling (100): the size check fails, so no resize;
# no expectation is set on `page` in this example.
it "does not resize disk" do
server = instance_double(LanternServer, max_storage_autoresize_gib: 100, target_storage_size_gib: 500)
resource = instance_double(LanternResource, servers: [server])
doctor = instance_double(LanternDoctor, resource: resource)
query = instance_double(LanternDoctorQuery, ubid: "test", id: "test-id", severity: "error", name: "Lantern Server Disk Usage", doctor: doctor)
expect(server).not_to receive(:autoresize_disk)
expect(lantern_doctor_page).to receive(:query).and_return(query).at_least(:once)
expect { lantern_doctor_page.post_incident_action }.not_to raise_error
end

# Any other query name has no post-incident action.
it "does nothing" do
query = instance_double(LanternDoctorQuery, ubid: "test", id: "test-id", severity: "error", name: "test")
expect(lantern_doctor_page).to receive(:query).and_return(query).at_least(:once)
expect { lantern_doctor_page.post_incident_action }.not_to raise_error
end
end

describe "#properties (logs)" do
it "returns sterr and stdout from logs" do
expect(lantern_doctor_page).to receive(:page).and_return(pg).at_least(:once)
Expand Down
Loading