Install Descheduler, fix startup readywait
Descheduler will be used for eve-app rebalancing during
cluster node reboots/upgrades in an upcoming PR.
Wait for the Longhorn daemonsets to be ready before an upcoming PR
snapshots the single-node /var/lib kube db (readiness check sketched below).
Resolve intermittent failures importing external-boot-image:
    Wait for containerd before importing.
    Tighter error checking on the import.

Signed-off-by: Andrew Durbin <andrewd@zededa.com>
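
The Longhorn helpers (longhorn_install, longhorn_is_ready) used below in cluster-init.sh come from the new longhorn-utils.sh, which is collapsed in this diff view. As a rough illustration of the daemonset readiness wait described above, here is a minimal sketch, assuming only standard DaemonSet status fields; the helper name and its internals are an assumption, not the hidden file's contents:

    # Hypothetical sketch only: the real check lives in the new longhorn-utils.sh,
    # which is not expanded in this view.
    longhorn_is_ready() {
        ds_json=$(kubectl -n longhorn-system get daemonset -o json) || return 1
        total=$(echo "$ds_json" | jq '.items | length')
        not_ready=$(echo "$ds_json" | jq '[.items[] | select(.status.numberReady != .status.desiredNumberScheduled)] | length')
        # Ready only if Longhorn daemonsets exist and every one reports full readiness
        [ "$total" -gt 0 ] && [ "$not_ready" -eq 0 ]
    }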
andrewd-zededa committed Oct 21, 2024
1 parent cc44ccf commit 53d0c59
Showing 9 changed files with 4,866 additions and 58 deletions.
4 changes: 4 additions & 0 deletions .spdxignore
@@ -10,3 +10,7 @@ pkg/rngd/cmd/rngd/vendor/
pkg/wwan/mmagent/vendor/
tools/get-deps/vendor/
pkg/installer/vendor/
pkg/kube/descheduler-job.yaml
pkg/kube/descheduler-policy-configmap.yaml
pkg/kube/descheduler_rbac.yaml
pkg/kube/lh-cfg-v1.6.2.yaml
8 changes: 8 additions & 0 deletions pkg/kube/Dockerfile
@@ -39,10 +39,18 @@ COPY kubevirt-features.yaml /etc
COPY external-boot-image.tar /etc/

# Longhorn config
COPY longhorn-utils.sh /usr/bin/
COPY lh-cfg-v1.6.2.yaml /etc/
COPY iscsid.conf /etc/iscsi/
COPY longhorn-generate-support-bundle.sh /usr/bin/
COPY nsmounter /usr/bin/

# descheduler
COPY descheduler-utils.sh /usr/bin/
COPY descheduler_rbac.yaml /etc/
COPY descheduler-job.yaml /etc/
COPY descheduler-policy-configmap.yaml /etc/

# Containerd config
RUN mkdir -p /etc/containerd
COPY config-k3s.toml /etc/containerd/
104 changes: 46 additions & 58 deletions pkg/kube/cluster-init.sh
@@ -5,7 +5,6 @@

K3S_VERSION=v1.28.5+k3s1
KUBEVIRT_VERSION=v1.1.0
LONGHORN_VERSION=v1.6.2
CDI_VERSION=v1.54.0
NODE_IP=""
MAX_K3S_RESTARTS=10
@@ -18,6 +17,11 @@ HOSTNAME=""
VMICONFIG_FILENAME="/run/zedkube/vmiVNC.run"
VNC_RUNNING=false

# shellcheck source=pkg/kube/descheduler-utils.sh
. /usr/bin/descheduler-utils.sh
# shellcheck source=pkg/kube/longhorn-utils.sh
. /usr/bin/longhorn-utils.sh

logmsg() {
local MSG
local TIME
@@ -220,40 +224,6 @@ config_cluster_roles() {
touch /var/lib/debuguser-initialized
}

apply_longhorn_disk_config() {
node=$1
kubectl label node "$node" node.longhorn.io/create-default-disk='config'
kubectl annotate node "$node" node.longhorn.io/default-disks-config='[ { "path":"/persist/vault/volumes", "allowScheduling":true }]'
}

check_overwrite_nsmounter() {
### REMOVE ME+
# When https://github.com/longhorn/longhorn/issues/6857 is resolved, remove this 'REMOVE ME' section
# In addition to pkg/kube/nsmounter and the copy of it in pkg/kube/Dockerfile
longhornCsiPluginPods=$(kubectl -n longhorn-system get pod -o json | jq -r '.items[] | select(.metadata.labels.app=="longhorn-csi-plugin" and .status.phase=="Running") | .metadata.name')
for csiPod in $longhornCsiPluginPods; do
if ! kubectl -n longhorn-system exec "pod/${csiPod}" --container=longhorn-csi-plugin -- ls /usr/local/sbin/nsmounter.updated > /dev/null 2>@1; then
if kubectl -n longhorn-system exec -i "pod/${csiPod}" --container=longhorn-csi-plugin -- tee /usr/local/sbin/nsmounter < /usr/bin/nsmounter; then
logmsg "Updated nsmounter in longhorn pod ${csiPod}"
kubectl -n longhorn-system exec "pod/${csiPod}" --container=longhorn-csi-plugin -- touch /usr/local/sbin/nsmounter.updated
fi
fi
done
### REMOVE ME-
}

# A spot to do persistent configuration of longhorn
# These are applied once per cluster
longhorn_post_install_config() {
# Wait for longhorn objects to be available before patching them
lhSettingsAvailable=$(kubectl -n longhorn-system get settings -o json | jq '.items | length>0')
if [ "$lhSettingsAvailable" != "true" ]; then
return
fi
kubectl -n longhorn-system patch settings.longhorn.io/upgrade-checker -p '[{"op":"replace","path":"/value","value":"false"}]' --type json
touch /var/lib/longhorn_configured
}

check_start_k3s() {
pgrep -f "k3s server" > /dev/null 2>&1
if [ $? -eq 1 ]; then
@@ -300,20 +270,30 @@ check_start_containerd() {
logmsg "Started k3s-containerd at pid:$containerd_pid"
fi
if [ -f /etc/external-boot-image.tar ]; then
# Is containerd up?
if ! /var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock info > /dev/null 2>&1; then
logmsg "k3s-containerd not yet running for image import"
return
fi

# NOTE: https://kubevirt.io/user-guide/virtual_machines/boot_from_external_source/
# Install external-boot-image image to our eve user containerd registry.
# This image contains just kernel and initrd to bootstrap a container image as a VM.
# This is very similar to what we do on kvm based eve to start container as a VM.
logmsg "Trying to install new external-boot-image"
# This import happens once per reboot
if ctr -a /run/containerd-user/containerd.sock image import /etc/external-boot-image.tar; then
eve_external_boot_img_tag=$(cat /run/eve-release)
eve_external_boot_img=docker.io/lfedge/eve-external-boot-image:"$eve_external_boot_img_tag"
import_tag=$(tar -xOf /etc/external-boot-image.tar manifest.json | jq -r '.[0].RepoTags[0]')
ctr -a /run/containerd-user/containerd.sock image tag "$import_tag" "$eve_external_boot_img"

logmsg "Successfully installed external-boot-image $import_tag as $eve_external_boot_img"
rm -f /etc/external-boot-image.tar
import_name_tag=$(tar -xOf /etc/external-boot-image.tar manifest.json | jq -r '.[0].RepoTags[0]')
import_name=$(echo "$import_name_tag" | cut -d ':' -f 1)
eve_external_boot_img_name="docker.io/lfedge/eve-external-boot-image"
if [ "$import_name" = "$eve_external_boot_img_name" ]; then
if /var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock image import /etc/external-boot-image.tar; then
eve_external_boot_img_tag=$(cat /run/eve-release)
eve_external_boot_img="${eve_external_boot_img_name}:${eve_external_boot_img_tag}"
if /var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock image tag "$import_name_tag" "$eve_external_boot_img"; then
logmsg "Successfully installed external-boot-image $import_name_tag as $eve_external_boot_img"
rm -f /etc/external-boot-image.tar
fi
fi
fi
fi
}
@@ -497,22 +477,30 @@ if [ ! -f /var/lib/all_components_initialized ]; then
continue
fi

if [ ! -f /var/lib/longhorn_initialized ]; then
wait_for_item "longhorn"
logmsg "Installing longhorn version ${LONGHORN_VERSION}"
apply_longhorn_disk_config "$HOSTNAME"
lhCfgPath=/var/lib/lh-cfg-${LONGHORN_VERSION}.yaml
if [ ! -e $lhCfgPath ]; then
curl -k https://raw.githubusercontent.com/longhorn/longhorn/${LONGHORN_VERSION}/deploy/longhorn.yaml > "$lhCfgPath"
fi
if ! grep -q 'create-default-disk-labeled-nodes: true' "$lhCfgPath"; then
sed -i '/ default-setting.yaml: |-/a\ create-default-disk-labeled-nodes: true' "$lhCfgPath"
fi
kubectl apply -f "$lhCfgPath"
touch /var/lib/longhorn_initialized
#
# Longhorn
#
wait_for_item "longhorn"
if ! longhorn_install "$HOSTNAME"; then
continue
fi
if ! longhorn_is_ready; then
# It can take a moment for the new pods to get to ContainerCreating
# Just back off until they are caught by the earlier are_all_pods_ready
sleep 30
continue
fi
logmsg "longhorn ready"

#
# Descheduler
#
wait_for_item "descheduler"
if ! descheduler_install; then
continue
fi

if [ -f /var/lib/k3s_initialized ] && [ -f /var/lib/kubevirt_initialized ] && [ -f /var/lib/longhorn_initialized ]; then
if [ -f /var/lib/k3s_initialized ] && [ -f /var/lib/kubevirt_initialized ]; then
logmsg "All components initialized"
touch /var/lib/all_components_initialized
fi
@@ -541,7 +529,7 @@ else
cp /var/lib/rancher/k3s/user.yaml /run/.kube/k3s/user.yaml
fi
else
if [ -e /var/lib/longhorn_initialized ]; then
if longhorn_is_ready; then
check_overwrite_nsmounter
fi
if [ ! -e /var/lib/longhorn_configured ]; then
54 changes: 54 additions & 0 deletions pkg/kube/descheduler-job.yaml
@@ -0,0 +1,54 @@
---
# from: https://raw.githubusercontent.com/kubernetes-sigs/descheduler/${DESCHEDULER_VERSION}/kubernetes/job/job.yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: descheduler-job
  namespace: kube-system
spec:
  parallelism: 1
  completions: 1
  template:
    metadata:
      name: descheduler-pod
    spec:
      priorityClassName: system-cluster-critical
      containers:
        - name: descheduler
          image: registry.k8s.io/descheduler/descheduler:v0.29.0
          volumeMounts:
            - mountPath: /policy-dir
              name: policy-volume
          command:
            - "/bin/descheduler"
          args:
            - "--policy-config-file"
            - "/policy-dir/policy.yaml"
            - "--v"
            - "3"
          resources:
            requests:
              cpu: "500m"
              memory: "256Mi"
          livenessProbe:
            failureThreshold: 3
            httpGet:
              path: /healthz
              port: 10258
              scheme: HTTPS
            initialDelaySeconds: 3
            periodSeconds: 10
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            privileged: false
            readOnlyRootFilesystem: true
            runAsNonRoot: true
      restartPolicy: "Never"
      serviceAccountName: descheduler-sa
      volumes:
        - name: policy-volume
          configMap:
            name: descheduler-policy-configmap
24 changes: 24 additions & 0 deletions pkg/kube/descheduler-policy-configmap.yaml
@@ -0,0 +1,24 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: descheduler-policy-configmap
  namespace: kube-system
data:
  policy.yaml: |
    apiVersion: "descheduler/v1alpha2"
    kind: "DeschedulerPolicy"
    profiles:
      - name: ProfileName
        pluginConfig:
          - name: "RemovePodsViolatingNodeAffinity"
            args:
              namespaces:
                include:
                  - "eve-kube-app"
              nodeAffinityType:
                - "preferredDuringSchedulingIgnoredDuringExecution"
        plugins:
          deschedule:
            enabled:
              - "RemovePodsViolatingNodeAffinity"
20 changes: 20 additions & 0 deletions pkg/kube/descheduler-utils.sh
@@ -0,0 +1,20 @@
#!/bin/sh
#
# Copyright (c) 2024 Zededa, Inc.
# SPDX-License-Identifier: Apache-2.0

DESCHEDULER_VERSION="v0.29.0"

descheduler_install()
{
    logmsg "Applying Descheduler ${DESCHEDULER_VERSION}"
    if ! kubectl apply -f /etc/descheduler_rbac.yaml; then
        logmsg "descheduler rbac not yet applied"
        return 1
    fi
    if ! kubectl apply -f /etc/descheduler-policy-configmap.yaml; then
        logmsg "descheduler configmap not yet applied"
        return 1
    fi
    return 0
}
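
Note that descheduler_install applies only the RBAC objects and the policy ConfigMap; descheduler-job.yaml is copied into the image (see the Dockerfile above) but is not applied here, presumably so the Job can be launched on demand by the rebalancing flow in the upcoming PR. As an illustration only (not part of this change), a one-shot run of that Job could look like:

    # Illustrative only: launch a single descheduler pass and watch it complete.
    kubectl apply -f /etc/descheduler-job.yaml
    kubectl -n kube-system wait --for=condition=complete job/descheduler-job --timeout=300s
    kubectl -n kube-system logs job/descheduler-job
    kubectl -n kube-system delete job descheduler-job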
50 changes: 50 additions & 0 deletions pkg/kube/descheduler_rbac.yaml
@@ -0,0 +1,50 @@
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: descheduler-cluster-role
rules:
  - apiGroups: ["events.k8s.io"]
    resources: ["events"]
    verbs: ["create", "update"]
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["get", "watch", "list"]
  - apiGroups: [""]
    resources: ["namespaces"]
    verbs: ["get", "watch", "list"]
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "watch", "list", "delete"]
  - apiGroups: [""]
    resources: ["pods/eviction"]
    verbs: ["create"]
  - apiGroups: ["scheduling.k8s.io"]
    resources: ["priorityclasses"]
    verbs: ["get", "watch", "list"]
  - apiGroups: ["coordination.k8s.io"]
    resources: ["leases"]
    verbs: ["create"]
  - apiGroups: ["coordination.k8s.io"]
    resources: ["leases"]
    resourceNames: ["descheduler"]
    verbs: ["get", "patch", "delete"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: descheduler-sa
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: descheduler-cluster-role-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: descheduler-cluster-role
subjects:
  - name: descheduler-sa
    kind: ServiceAccount
    namespace: kube-system
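
A quick sanity check of the RBAC wiring above, assuming kubectl admin access on a node (illustrative only, not part of the commit):

    # Each should print "yes" once the ClusterRoleBinding has taken effect.
    kubectl auth can-i create pods/eviction --as=system:serviceaccount:kube-system:descheduler-sa
    kubectl auth can-i list nodes --as=system:serviceaccount:kube-system:descheduler-sa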