Install Descheduler, fix startup readywait
Descheduler will be used for eve-app rebalancing during
cluster node reboots/upgrades in an upcoming PR.
Wait for longhorn daemonsets to be ready before an upcoming PR
snapshots the single-node /var/lib kube db.
Resolve intermittent failures to import external-boot-image:
	Wait for containerd before importing.
	Tighter error checking on import.

Signed-off-by: Andrew Durbin <andrewd@zededa.com>
andrewd-zededa committed Oct 22, 2024
1 parent cc44ccf commit d5098b7
Showing 11 changed files with 4,901 additions and 68 deletions.
4 changes: 4 additions & 0 deletions .spdxignore
@@ -10,3 +10,7 @@ pkg/rngd/cmd/rngd/vendor/
pkg/wwan/mmagent/vendor/
tools/get-deps/vendor/
pkg/installer/vendor/
pkg/kube/descheduler-job.yaml
pkg/kube/descheduler-policy-configmap.yaml
pkg/kube/descheduler_rbac.yaml
pkg/kube/lh-cfg-v1.6.2.yaml
6 changes: 5 additions & 1 deletion .yamllint
@@ -4,4 +4,8 @@ extends: default
rules:
line-length:
max: 300
level: warning
level: warning

ignore:
- pkg/kube/lh-cfg-v1.6.2.yaml
- pkg/kube/descheduler_rbac.yaml
2 changes: 2 additions & 0 deletions .yetus-excludes
@@ -17,3 +17,5 @@
^pkg/apparmor/etc/
^eve-tools/bpftrace-compiler/examples/.+\.bt
^pkg/installer/vendor/
^pkg/kube/lh-cfg-v1.6.2.yaml
^pkg/kube/descheduler_rbac.yaml
8 changes: 8 additions & 0 deletions pkg/kube/Dockerfile
@@ -39,10 +39,18 @@ COPY kubevirt-features.yaml /etc
COPY external-boot-image.tar /etc/

# Longhorn config
COPY longhorn-utils.sh /usr/bin/
COPY lh-cfg-v1.6.2.yaml /etc/
COPY iscsid.conf /etc/iscsi/
COPY longhorn-generate-support-bundle.sh /usr/bin/
COPY nsmounter /usr/bin/

# descheduler
COPY descheduler-utils.sh /usr/bin/
COPY descheduler_rbac.yaml /etc/
COPY descheduler-job.yaml /etc/
COPY descheduler-policy-configmap.yaml /etc/

# Containerd config
RUN mkdir -p /etc/containerd
COPY config-k3s.toml /etc/containerd/
141 changes: 74 additions & 67 deletions pkg/kube/cluster-init.sh
@@ -5,7 +5,6 @@

K3S_VERSION=v1.28.5+k3s1
KUBEVIRT_VERSION=v1.1.0
LONGHORN_VERSION=v1.6.2
CDI_VERSION=v1.54.0
NODE_IP=""
MAX_K3S_RESTARTS=10
@@ -18,6 +17,11 @@ HOSTNAME=""
VMICONFIG_FILENAME="/run/zedkube/vmiVNC.run"
VNC_RUNNING=false

# shellcheck source=pkg/kube/descheduler-utils.sh
. /usr/bin/descheduler-utils.sh
# shellcheck source=pkg/kube/longhorn-utils.sh
. /usr/bin/longhorn-utils.sh

logmsg() {
local MSG
local TIME
@@ -220,40 +224,6 @@ config_cluster_roles() {
touch /var/lib/debuguser-initialized
}

apply_longhorn_disk_config() {
node=$1
kubectl label node "$node" node.longhorn.io/create-default-disk='config'
kubectl annotate node "$node" node.longhorn.io/default-disks-config='[ { "path":"/persist/vault/volumes", "allowScheduling":true }]'
}

check_overwrite_nsmounter() {
### REMOVE ME+
# When https://github.com/longhorn/longhorn/issues/6857 is resolved, remove this 'REMOVE ME' section
# In addition to pkg/kube/nsmounter and the copy of it in pkg/kube/Dockerfile
longhornCsiPluginPods=$(kubectl -n longhorn-system get pod -o json | jq -r '.items[] | select(.metadata.labels.app=="longhorn-csi-plugin" and .status.phase=="Running") | .metadata.name')
for csiPod in $longhornCsiPluginPods; do
if ! kubectl -n longhorn-system exec "pod/${csiPod}" --container=longhorn-csi-plugin -- ls /usr/local/sbin/nsmounter.updated > /dev/null 2>&1; then
if kubectl -n longhorn-system exec -i "pod/${csiPod}" --container=longhorn-csi-plugin -- tee /usr/local/sbin/nsmounter < /usr/bin/nsmounter; then
logmsg "Updated nsmounter in longhorn pod ${csiPod}"
kubectl -n longhorn-system exec "pod/${csiPod}" --container=longhorn-csi-plugin -- touch /usr/local/sbin/nsmounter.updated
fi
fi
done
### REMOVE ME-
}

# A spot to do persistent configuration of longhorn
# These are applied once per cluster
longhorn_post_install_config() {
# Wait for longhorn objects to be available before patching them
lhSettingsAvailable=$(kubectl -n longhorn-system get settings -o json | jq '.items | length>0')
if [ "$lhSettingsAvailable" != "true" ]; then
return
fi
kubectl -n longhorn-system patch settings.longhorn.io/upgrade-checker -p '[{"op":"replace","path":"/value","value":"false"}]' --type json
touch /var/lib/longhorn_configured
}

check_start_k3s() {
pgrep -f "k3s server" > /dev/null 2>&1
if [ $? -eq 1 ]; then
@@ -283,6 +253,51 @@ check_start_k3s() {
return 0
}

external_boot_image_import() {
# NOTE: https://kubevirt.io/user-guide/virtual_machines/boot_from_external_source/
# Install external-boot-image image to our eve user containerd registry.
# This image contains just kernel and initrd to bootstrap a container image as a VM.
# This is very similar to what we do on kvm based eve to start container as a VM.

boot_img_path="/etc/external-boot-image.tar"
if [ ! -f "$boot_img_path" ]; then
return 1
fi

# Is containerd up?
if ! /var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock info > /dev/null 2>&1; then
logmsg "k3s-containerd not yet running for image import"
return 1
fi

eve_external_boot_img_name="docker.io/lfedge/eve-external-boot-image"
eve_external_boot_img_tag=$(cat /run/eve-release)
eve_external_boot_img="${eve_external_boot_img_name}:${eve_external_boot_img_tag}"
if /var/lib/k3s/bin/k3s crictl --runtime-endpoint=unix:///run/containerd-user/containerd.sock inspecti "$eve_external_boot_img"; then
# Already imported
return 0
fi

import_name_tag=$(tar -xOf "$boot_img_path" manifest.json | jq -r '.[0].RepoTags[0]')
import_name=$(echo "$import_name_tag" | cut -d ':' -f 1)
if [ "$import_name" != "$eve_external_boot_img_name" ]; then
logmsg "external-boot-image.tar is corrupt"
return 1
fi

if ! /var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock image import "$boot_img_path"; then
logmsg "import $boot_img_path failed"
return 1
fi

if ! /var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock image tag "$import_name_tag" "$eve_external_boot_img"; then
logmsg "re-tag external-boot-image failed"
return 1
fi
logmsg "Successfully installed external-boot-image $import_name_tag as $eve_external_boot_img"
return 0
}

check_start_containerd() {
# Needed to get the pods to start
if [ ! -L /usr/bin/runc ]; then
@@ -299,23 +314,6 @@ check_start_containerd() {
containerd_pid=$!
logmsg "Started k3s-containerd at pid:$containerd_pid"
fi
if [ -f /etc/external-boot-image.tar ]; then
# NOTE: https://kubevirt.io/user-guide/virtual_machines/boot_from_external_source/
# Install external-boot-image image to our eve user containerd registry.
# This image contains just kernel and initrd to bootstrap a container image as a VM.
# This is very similar to what we do on kvm based eve to start container as a VM.
logmsg "Trying to install new external-boot-image"
# This import happens once per reboot
if ctr -a /run/containerd-user/containerd.sock image import /etc/external-boot-image.tar; then
eve_external_boot_img_tag=$(cat /run/eve-release)
eve_external_boot_img=docker.io/lfedge/eve-external-boot-image:"$eve_external_boot_img_tag"
import_tag=$(tar -xOf /etc/external-boot-image.tar manifest.json | jq -r '.[0].RepoTags[0]')
ctr -a /run/containerd-user/containerd.sock image tag "$import_tag" "$eve_external_boot_img"

logmsg "Successfully installed external-boot-image $import_tag as $eve_external_boot_img"
rm -f /etc/external-boot-image.tar
fi
fi
}
trigger_k3s_selfextraction() {
# Analysis of the k3s source shows nearly any cli command will first self-extract a series of binaries.
@@ -440,6 +438,7 @@ if [ ! -f /var/lib/all_components_initialized ]; then
sleep 1

check_start_containerd
external_boot_image_import
if ! check_start_k3s; then
continue
fi
@@ -497,22 +496,30 @@ if [ ! -f /var/lib/all_components_initialized ]; then
continue
fi

if [ ! -f /var/lib/longhorn_initialized ]; then
wait_for_item "longhorn"
logmsg "Installing longhorn version ${LONGHORN_VERSION}"
apply_longhorn_disk_config "$HOSTNAME"
lhCfgPath=/var/lib/lh-cfg-${LONGHORN_VERSION}.yaml
if [ ! -e $lhCfgPath ]; then
curl -k https://raw.githubusercontent.com/longhorn/longhorn/${LONGHORN_VERSION}/deploy/longhorn.yaml > "$lhCfgPath"
fi
if ! grep -q 'create-default-disk-labeled-nodes: true' "$lhCfgPath"; then
sed -i '/ default-setting.yaml: |-/a\ create-default-disk-labeled-nodes: true' "$lhCfgPath"
fi
kubectl apply -f "$lhCfgPath"
touch /var/lib/longhorn_initialized
#
# Longhorn
#
wait_for_item "longhorn"
if ! longhorn_install "$HOSTNAME"; then
continue
fi
if ! longhorn_is_ready; then
# It can take a moment for the new pods to get to ContainerCreating
# Just back off until they are caught by the earlier are_all_pods_ready
sleep 30
continue
fi
logmsg "longhorn ready"

#
# Descheduler
#
wait_for_item "descheduler"
if ! descheduler_install; then
continue
fi

if [ -f /var/lib/k3s_initialized ] && [ -f /var/lib/kubevirt_initialized ] && [ -f /var/lib/longhorn_initialized ]; then
if [ -f /var/lib/k3s_initialized ] && [ -f /var/lib/kubevirt_initialized ]; then
logmsg "All components initialized"
touch /var/lib/all_components_initialized
fi
@@ -541,7 +548,7 @@ else
cp /var/lib/rancher/k3s/user.yaml /run/.kube/k3s/user.yaml
fi
else
if [ -e /var/lib/longhorn_initialized ]; then
if longhorn_is_ready; then
check_overwrite_nsmounter
fi
if [ ! -e /var/lib/longhorn_configured ]; then
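Note: pkg/kube/longhorn-utils.sh and pkg/kube/lh-cfg-v1.6.2.yaml are not expanded in this view. Per the commit message, longhorn_is_ready waits for the longhorn daemonsets to be ready; below is a minimal sketch of such a readywait, where the daemonset selection and jq fields are illustrative assumptions rather than the actual implementation:

# Hypothetical sketch of a longhorn daemonset readywait; the real
# longhorn_is_ready lives in pkg/kube/longhorn-utils.sh (not shown here).
longhorn_is_ready() {
    ds_json=$(kubectl -n longhorn-system get daemonset -o json) || return 1
    # Require at least one daemonset to exist before declaring readiness...
    count=$(echo "$ds_json" | jq '.items | length')
    [ "$count" -ge 1 ] || return 1
    # ...and every daemonset to report all desired pods ready.
    not_ready=$(echo "$ds_json" | jq '[.items[] | select(.status.numberReady != .status.desiredNumberScheduled)] | length')
    [ "$not_ready" = "0" ]
}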
54 changes: 54 additions & 0 deletions pkg/kube/descheduler-job.yaml
@@ -0,0 +1,54 @@
---
# from: https://raw.githubusercontent.com/kubernetes-sigs/descheduler/${DESCHEDULER_VERSION}/kubernetes/job/job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: descheduler-job
namespace: kube-system
spec:
parallelism: 1
completions: 1
template:
metadata:
name: descheduler-pod
spec:
priorityClassName: system-cluster-critical
containers:
- name: descheduler
image: registry.k8s.io/descheduler/descheduler:v0.29.0
volumeMounts:
- mountPath: /policy-dir
name: policy-volume
command:
- "/bin/descheduler"
args:
- "--policy-config-file"
- "/policy-dir/policy.yaml"
- "--v"
- "3"
resources:
requests:
cpu: "500m"
memory: "256Mi"
livenessProbe:
failureThreshold: 3
httpGet:
path: /healthz
port: 10258
scheme: HTTPS
initialDelaySeconds: 3
periodSeconds: 10
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
privileged: false
readOnlyRootFilesystem: true
runAsNonRoot: true
restartPolicy: "Never"
serviceAccountName: descheduler-sa
volumes:
- name: policy-volume
configMap:
name: descheduler-policy-configmap
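The descheduler runs here as a one-shot Job rather than a long-lived deployment, so each descheduling pass is an explicit apply. A hedged usage sketch (the actual rebalancing trigger is expected in the upcoming PR, not in this commit):

# Run one descheduling pass by hand and inspect the result.
kubectl apply -f /etc/descheduler-job.yaml
kubectl -n kube-system wait --for=condition=complete --timeout=120s job/descheduler-job
kubectl -n kube-system logs job/descheduler-job    # shows which pods were evicted and why
# Completed Jobs are immutable; delete before re-applying for another pass.
kubectl -n kube-system delete job descheduler-job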
24 changes: 24 additions & 0 deletions pkg/kube/descheduler-policy-configmap.yaml
@@ -0,0 +1,24 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: descheduler-policy-configmap
namespace: kube-system
data:
policy.yaml: |
apiVersion: "descheduler/v1alpha2"
kind: "DeschedulerPolicy"
profiles:
- name: ProfileName
pluginConfig:
- name: "RemovePodsViolatingNodeAffinity"
args:
namespaces:
include:
- "eve-kube-app"
nodeAffinityType:
- "preferredDuringSchedulingIgnoredDuringExecution"
plugins:
deschedule:
enabled:
- "RemovePodsViolatingNodeAffinity"
20 changes: 20 additions & 0 deletions pkg/kube/descheduler-utils.sh
@@ -0,0 +1,20 @@
#!/bin/sh
#
# Copyright (c) 2024 Zededa, Inc.
# SPDX-License-Identifier: Apache-2.0

DESCHEDULER_VERSION="v0.29.0"

descheduler_install()
{
logmsg "Applying Descheduler ${DESCHEDULER_VERSION}"
if ! kubectl apply -f /etc/descheduler_rbac.yaml; then
logmsg "descheduler rbac not yet applied"
return 1
fi
if ! kubectl apply -f /etc/descheduler-policy-configmap.yaml; then
logmsg "descheduler configmap not yet applied"
return 1
fi
return 0
}
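Note that descheduler_install applies only the RBAC and policy ConfigMap; nothing in this commit creates the Job itself, consistent with rebalancing landing in an upcoming PR. A hypothetical companion helper (name and flow assumed, not part of this change) might look like:

# Hypothetical: trigger one descheduling pass (expected in the upcoming PR).
descheduler_run()
{
    # A completed Job cannot be re-run in place; remove any previous run first.
    kubectl -n kube-system delete job descheduler-job --ignore-not-found=true
    if ! kubectl apply -f /etc/descheduler-job.yaml; then
        logmsg "descheduler job not yet applied"
        return 1
    fi
    return 0
}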
50 changes: 50 additions & 0 deletions pkg/kube/descheduler_rbac.yaml
@@ -0,0 +1,50 @@
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: descheduler-cluster-role
rules:
- apiGroups: ["events.k8s.io"]
resources: ["events"]
verbs: ["create", "update"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "watch", "list"]
- apiGroups: [""]
resources: ["namespaces"]
verbs: ["get", "watch", "list"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "watch", "list", "delete"]
- apiGroups: [""]
resources: ["pods/eviction"]
verbs: ["create"]
- apiGroups: ["scheduling.k8s.io"]
resources: ["priorityclasses"]
verbs: ["get", "watch", "list"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["create"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
resourceNames: ["descheduler"]
verbs: ["get", "patch", "delete"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: descheduler-sa
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: descheduler-cluster-role-binding
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: descheduler-cluster-role
subjects:
- name: descheduler-sa
kind: ServiceAccount
namespace: kube-system
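A quick way to confirm the ClusterRoleBinding grants the service account what the Job needs, using kubectl impersonation (a sketch; both commands should print "yes" once the RBAC objects are applied):

kubectl auth can-i create pods/eviction \
    --as=system:serviceaccount:kube-system:descheduler-sa
kubectl auth can-i delete pods \
    --as=system:serviceaccount:kube-system:descheduler-sa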