Skip to content

Commit

Permalink
feat: implement load balancing for api and be
Browse files Browse the repository at this point in the history
  • Loading branch information
ehh-why-its-so-hard committed Oct 29, 2024
1 parent 71aa3be commit 134eecd
Show file tree
Hide file tree
Showing 5 changed files with 169 additions and 4 deletions.
12 changes: 10 additions & 2 deletions roles/vega_caddy_server/defaults/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
vega_caddy_server_caddyfile_file_name: ""
vega_caddy_server_node_home: "/home/vega"

vega_caddy_server_xcaddy_version: 0.3.2
vega_caddy_server_caddy_version: v2.7.5
vega_caddy_server_xcaddy_version: 0.4.2
vega_caddy_server_caddy_version: v2.8.4

vega_caddy_server_node_id: unknown

Expand All @@ -20,3 +20,11 @@ vega_caddy_server_block_explorer_ui_domains: []
vega_caddy_server_governance_ui_domains: []
vega_caddy_server_trading_ui_domains: []
vega_caddy_server_stats_domains: []

# API load balancer: when enabled, caddy serves the listed domains on :443
# and reverse-proxies to the listed upstream host:port pairs
# (rendered by templates/etc/caddy/sites/api-lb.caddy.j2).
vega_caddy_server_with_api_lb: false
vega_caddy_server_api_lb_domains: []
vega_caddy_server_api_lb_upstreams: []

# Block-explorer load balancer: same shape as the API LB settings above
# (rendered by templates/etc/caddy/sites/block-explorer-lb.caddy.j2).
vega_caddy_server_with_block_explorer_lb: false
vega_caddy_server_block_explorer_lb_domains: []
vega_caddy_server_block_explorer_lb_upstreams: []
13 changes: 11 additions & 2 deletions roles/vega_caddy_server/tasks/install-caddy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,23 @@
# Install the xcaddy build tool from the upstream GitHub .deb release.
# xcaddy is required to compile caddy with third-party plugins (see the
# "Compile caddy from sources" task below).
- name: CaddyserverV2 | Install xcaddy
  ansible.builtin.apt:
    deb: "https://github.com/caddyserver/xcaddy/releases/download/v{{ vega_caddy_server_xcaddy_version }}/xcaddy_{{ vega_caddy_server_xcaddy_version }}_linux_amd64.deb" # noqa: yaml[line-length]
    update_cache: true
    state: present

# Probe the installed caddy binary so the compile task below can decide
# whether a (re)build is needed.
- name: Check if caddy is installed
  ansible.builtin.command: "caddy version"
  register: caddy_version
  changed_when: false
  # rc 127 (command not found) and rc 2 (usage error) both mean "caddy is
  # absent or unusable" and are handled by the compile task, so they are not
  # failures here. Bug fix: the original tested `go_installed_version`, a
  # variable never registered in this file, so any non-zero rc made the
  # failed_when expression itself error out on an undefined variable.
  failed_when: caddy_version.rc not in [0, 2, 127]
  check_mode: false

# (Re)build caddy from source with the replace-response plugin compiled in.
# Runs when caddy is missing or its reported version differs from the pinned
# vega_caddy_server_caddy_version.
- name: CaddyserverV2 | Compile caddy from sources
  ansible.builtin.command: |
    xcaddy build {{ vega_caddy_server_caddy_version }} \
      --output /usr/bin/caddy \
      --with github.com/caddyserver/replace-response
  # Bug fixes: added the missing `\` continuation after --output, and
  # dropped `args: creates: /usr/bin/caddy` — `creates` skipped the build
  # whenever any caddy binary existed, so the version-mismatch condition
  # below could never trigger a rebuild after a version bump.
  when: (caddy_version.rc != 0) or (vega_caddy_server_caddy_version not in caddy_version.stdout)
  changed_when: (caddy_version.rc != 0) or (vega_caddy_server_caddy_version not in caddy_version.stdout)
  notify: "Restart caddy"

- name: Config | Install caddy systemd files
Expand Down
30 changes: 30 additions & 0 deletions roles/vega_caddy_server/tasks/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,33 @@
mode: "0644"
when: vega_caddy_server_with_stats and vega_caddy_server_stats_domains | length > 0
notify: "Restart caddy"

# NOTE(review): this task looks like an exact duplicate of the stats task
# directly above it (same template, destination, ownership, condition and
# handler) — confirm against the full file and remove one of the two.
- name: Enable stats config
  ansible.builtin.template:
    src: "etc/caddy/sites/stats.caddy.j2"
    dest: "/etc/caddy/sites/stats.caddy"
    owner: "caddy"
    group: "caddy"
    mode: "0644"
  when: vega_caddy_server_with_stats and vega_caddy_server_stats_domains | length > 0
  notify: "Restart caddy"

# Render the API load-balancer vhost into caddy's sites directory.
# Skipped unless the LB is enabled AND at least one public domain is
# configured (an empty domain list would render an empty site address).
- name: Enable api LB config
  ansible.builtin.template:
    src: "etc/caddy/sites/api-lb.caddy.j2"
    dest: "/etc/caddy/sites/api-lb.caddy"
    owner: "caddy"
    group: "caddy"
    mode: "0644"
  when: vega_caddy_server_with_api_lb and vega_caddy_server_api_lb_domains | length > 0
  notify: "Restart caddy"

# Render the block-explorer load-balancer vhost into caddy's sites
# directory. Skipped unless the LB is enabled AND at least one public
# domain is configured.
- name: Enable block-explorer LB config
  ansible.builtin.template:
    src: "etc/caddy/sites/block-explorer-lb.caddy.j2"
    dest: "/etc/caddy/sites/block-explorer-lb.caddy"
    owner: "caddy"
    group: "caddy"
    mode: "0644"
  when: vega_caddy_server_with_block_explorer_lb and vega_caddy_server_block_explorer_lb_domains | length > 0
  notify: "Restart caddy"
59 changes: 59 additions & 0 deletions roles/vega_caddy_server/templates/etc/caddy/sites/api-lb.caddy.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{#- Caddy site: load balancer for the Vega API domains.
    Serves every entry of vega_caddy_server_api_lb_domains on :443 and
    proxies to vega_caddy_server_api_lb_upstreams with cookie-based
    session stickiness plus active health checks. The leading `{{-`
    keeps this comment from introducing whitespace into the output. -#}
{{- vega_caddy_server_api_lb_domains | map('ansible.builtin.regex_replace', '$', ':443') | join(', ') }} {
reverse_proxy {
{% for upstream in vega_caddy_server_api_lb_upstreams %}
to {{ upstream }}
{% endfor %}

# send the upstream's own host:port as the Host header, and report which
# upstream served the request back to the client for debugging.
header_up Host {upstream_hostport}
header_down X-LB-Upstream {upstream_hostport}

# sticky sessions: returning clients are pinned to one upstream via this
# cookie; clients without the cookie are assigned via random_choose 2.
lb_policy cookie 26403ec6d537fa31f63e294b44831734 {
fallback random_choose 2
}

# how many times to retry selecting available backends for each request if the next available
# host is down. By default, retries are disabled (zero).
lb_retries 0

# a duration value that defines how long to wait between selecting the next host from the pool.
# Default is 250ms. Only relevant when a request to an upstream host fails. Be aware that setting
# this to 0 with a non-zero lb_try_duration can cause the CPU to spin if all backends are down
# and latency is very low.
lb_try_interval 100ms

# the port to use for active health checks, if different from the upstream's port.
# NOTE(review): assumes every upstream answers /health-check on 443 — confirm.
health_port 443

# the URI path (and optional query) for active health checks.
health_uri /health-check

# a duration value that defines how often to perform active health checks. Default: 30s.
health_interval 3s

# the number of consecutive health checks required before marking the backend as healthy again.
# Default: 1.
health_passes 1

# the number of consecutive health checks required before marking the backend as unhealthy.
# Default: 1.
health_fails 2

# will cause the health check to follow redirects provided by upstream. By default, a redirect response
# would cause the health check to count as a fail.
health_follow_redirects

# a duration value that defines how long to wait for a reply before marking the backend as down.
# Default: 5s.
health_timeout 1s

# the HTTP status code to expect from a healthy backend. Can be a 3-digit status code, or a
# status code class ending in xx. For example: 200 (which is the default), or 2xx.
health_status 2xx

# duration value that defines how long to remember a failed request. A duration > 0 enables
# passive health checking; the default is 0 (off). A reasonable starting point might be 30s
# to balance error rates with responsiveness when bringing an unhealthy upstream back online;
# but feel free to experiment to find the right balance for your usecase.
fail_duration 30s # probably it needs to be higher because 30s may be not enough for vega to startup and there is no point to check it again
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{#- Caddy site: load balancer for the Vega block-explorer domains.
    Serves every entry of vega_caddy_server_block_explorer_lb_domains on
    :443 and proxies to vega_caddy_server_block_explorer_lb_upstreams with
    cookie-based session stickiness plus active health checks. The leading
    `{{-` keeps this comment from introducing whitespace into the output. -#}
{{- vega_caddy_server_block_explorer_lb_domains | map('ansible.builtin.regex_replace', '$', ':443') | join(', ') }} {
reverse_proxy {
{% for upstream in vega_caddy_server_block_explorer_lb_upstreams %}
to {{ upstream }}
{% endfor %}

# send the upstream's own host:port as the Host header, and report which
# upstream served the request back to the client for debugging.
header_up Host {upstream_hostport}
header_down X-LB-Upstream {upstream_hostport}

# sticky sessions: returning clients are pinned to one upstream via this
# cookie; clients without the cookie are assigned via random_choose 2.
# NOTE(review): the cookie name is identical to the one in api-lb.caddy.j2
# — harmless on distinct domains, but confirm that is the intent.
lb_policy cookie 26403ec6d537fa31f63e294b44831734 {
fallback random_choose 2
}

# how many times to retry selecting available backends for each request if the next available
# host is down. By default, retries are disabled (zero).
lb_retries 0

# a duration value that defines how long to wait between selecting the next host from the pool.
# Default is 250ms. Only relevant when a request to an upstream host fails. Be aware that setting
# this to 0 with a non-zero lb_try_duration can cause the CPU to spin if all backends are down
# and latency is very low.
lb_try_interval 100ms

# the port to use for active health checks, if different from the upstream's port.
# NOTE(review): assumes every upstream answers /health-check on 443 — confirm.
health_port 443

# the URI path (and optional query) for active health checks.
health_uri /health-check

# a duration value that defines how often to perform active health checks. Default: 30s.
health_interval 3s

# the number of consecutive health checks required before marking the backend as healthy again.
# Default: 1.
health_passes 1

# the number of consecutive health checks required before marking the backend as unhealthy.
# Default: 1.
health_fails 2

# will cause the health check to follow redirects provided by upstream. By default, a redirect response
# would cause the health check to count as a fail.
health_follow_redirects

# a duration value that defines how long to wait for a reply before marking the backend as down.
# Default: 5s.
health_timeout 1s

# the HTTP status code to expect from a healthy backend. Can be a 3-digit status code, or a
# status code class ending in xx. For example: 200 (which is the default), or 2xx.
health_status 2xx

# duration value that defines how long to remember a failed request. A duration > 0 enables
# passive health checking; the default is 0 (off). A reasonable starting point might be 30s
# to balance error rates with responsiveness when bringing an unhealthy upstream back online;
# but feel free to experiment to find the right balance for your usecase.
fail_duration 30s # probably it needs to be higher because 30s may be not enough for vega to startup and there is no point to check it again
}
}

0 comments on commit 134eecd

Please sign in to comment.