diff --git a/Makefile b/Makefile index 15442735f..477976712 100644 --- a/Makefile +++ b/Makefile @@ -186,7 +186,7 @@ wait-and-get-secret: $(KUBECTL) get secrets $(CLUSTER_NAME)-kubeconfig -o json | jq -r .data.value | base64 --decode > $(WORKER_CLUSTER_KUBECONFIG) ${TIMEOUT} 15m bash -c "while ! $(KUBECTL) --kubeconfig=$(WORKER_CLUSTER_KUBECONFIG) get nodes | grep control-plane; do sleep 1; done" -install-cilium-in-wl-cluster: +install-cilium-in-wl-cluster: $(HELM) # Deploy cilium $(HELM) repo add cilium https://helm.cilium.io/ $(HELM) repo update cilium diff --git a/api/v1beta1/conditions_const.go b/api/v1beta1/conditions_const.go index b3fb48a86..c942e0f29 100644 --- a/api/v1beta1/conditions_const.go +++ b/api/v1beta1/conditions_const.go @@ -173,6 +173,13 @@ const ( ServerNotFoundReason = "ServerNotFound" ) +const ( + // SSHAfterInstallImageSucceededCondition indicates that the host is reachable via ssh after installImage. + SSHAfterInstallImageSucceededCondition clusterv1.ConditionType = "SSHAfterInstallImageSucceeded" + + // SSHAfterInstallImageFailedReason indicates that the host was not reachable via ssh. + SSHAfterInstallImageFailedReason = "SSHAfterInstallImageFailed" +) const ( // HostAssociateSucceededCondition indicates that a host has been associated. HostAssociateSucceededCondition clusterv1.ConditionType = "HostAssociateSucceeded" diff --git a/api/v1beta1/hetznerbaremetalmachine_types.go b/api/v1beta1/hetznerbaremetalmachine_types.go index fa31eb3ce..055c0fe3a 100644 --- a/api/v1beta1/hetznerbaremetalmachine_types.go +++ b/api/v1beta1/hetznerbaremetalmachine_types.go @@ -316,6 +316,9 @@ func (bmMachine *HetznerBareMetalMachine) SetFailure(reason capierrors.MachineSt // GetImageSuffix tests whether the suffix is known and outputs it if yes. Otherwise it returns an error. 
func GetImageSuffix(url string) (string, error) { + if strings.HasPrefix(url, "oci://") { + return "tar.gz", nil + } for _, suffix := range []ImageType{ ImageTypeTar, ImageTypeTarGz, diff --git a/docs/reference/hetzner-bare-metal-machine-template.md b/docs/reference/hetzner-bare-metal-machine-template.md index 94b152e57..cf131ce75 100644 --- a/docs/reference/hetzner-bare-metal-machine-template.md +++ b/docs/reference/hetzner-bare-metal-machine-template.md @@ -18,7 +18,7 @@ Updating a `HetznerBareMetalMachineTemplate` is not possible. Instead, a new tem ## cloud-init and installimage -Both in installimage and cloud-init the ports used for SSH can be changed, e.g. with the following code snippet: +Both in [installimage](https://docs.hetzner.com/robot/dedicated-server/operating-systems/installimage/) and cloud-init the ports used for SSH can be changed, e.g. with the following code snippet: ``` sed -i -e '/^\(#\|\)Port/s/^.*$/Port 2223/' /etc/ssh/sshd_config @@ -39,7 +39,7 @@ Via MatchLabels you can specify a certain label (key and value) that identifies | -------------------------------------------------------------- | ------------------- | ----------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | | template.spec.providerID | string | | no | Provider ID set by controller | | template.spec.installImage | object | | yes | Configuration used in autosetup | -| template.spec.installImage.image | object | | yes | Defines image for bm machine. Must specify either name and url, or a (local) path | +| template.spec.installImage.image | object | | yes | Defines image for bm machine. See below for details. | | template.spec.installImage.image.url | string | | no | Remote URL of image. 
Can be tar, tar.gz, tar.bz, tar.bz2, tar.xz, tgz, tbz, txz | | template.spec.installImage.image.name | string | | no | Name of the image | | template.spec.installImage.image.path | string | | no | Local path of a pre-installed image | @@ -75,3 +75,68 @@ Via MatchLabels you can specify a certain label (key and value) that identifies | template.spec.sshSpec.secretRef.key.privateKey | string | | yes | PrivateKey is the key in the secret's data where the SSH key's private key is stored | | template.spec.sshSpec.portAfterInstallImage | int | 22 | no | PortAfterInstallImage specifies the port that can be used to reach the server via SSH after install image completed successfully | | template.spec.sshSpec.portAfterCloudInit | int | 22 (install image port) | no | PortAfterCloudInit specifies the port that can be used to reach the server via SSH after cloud init completed successfully | + +### installImage.image + +You must specify either name and url, or a local path. + +Example of an image provided by Hetzner via NFS: + +``` +image: + path: /root/.oldroot/nfs//images/Ubuntu-2204-jammy-amd64-base.tar.gz +``` + +Example of an image provided by you via https. The script installimage of Hetzner parses the name to detect the version. It is +recommended to follow their naming pattern. + +``` +image: + name: Ubuntu-2204-jammy-amd64-custom + url: https://user:pwd@example.com/images/Ubuntu-2204-jammy-amd64-custom.tar.gz + +``` + +Example of pulling an image from an oci-registry: + +``` +image: + name: Ubuntu-2204-jammy-amd64-custom + url: oci://ghcr.io/myorg/images/Ubuntu-2204-jammy-amd64-custom:1.0.0-beta.2 +``` + +If you need credentials to pull the image, then provide the environment variable `OCI_REGISTRY_AUTH_TOKEN` to the controller. + +You can provide the variable via a secret of the deployment `caph-controller-manager`: + +``` +apiVersion: apps/v1 +kind: Deployment +metadata: + # ... +spec: + # ... 
+ template: + spec: + containers: + - command: + - /manager + image: ghcr.io/syself/caph:vXXX + env: + - name: OCI_REGISTRY_AUTH_TOKEN + valueFrom: + secretKeyRef: + name: my-oci-registry-secret # The name of the secret + key: OCI_REGISTRY_AUTH_TOKEN # The key in the secret to use + # ... other container specs +``` + +You can push an image to an oci-registry with a tool like [oras](https://oras.land): + +``` +oras push ghcr.io/myorg/images/Ubuntu-2204-jammy-amd64-custom:1.0.0-beta.2 \ + --artifact-type application/vnd.myorg.machine-image.v1 Ubuntu-2204-jammy-amd64-custom.tar.gz +``` + + + diff --git a/hack/filter-caph-controller-manager-logs.py b/hack/filter-caph-controller-manager-logs.py index 23aa1e789..2fcca4c8c 100755 --- a/hack/filter-caph-controller-manager-logs.py +++ b/hack/filter-caph-controller-manager-logs.py @@ -27,6 +27,8 @@ rows_to_skip = [ 'controller-runtime.webhook', 'certwatcher/certwatcher', 'Registering a validating webhook', 'Registering a mutating webhook', 'Starting EventSource', + 'Starting Controller', + '"Starting workers" controller/controller', '"Reconciling finished"', '"Creating cluster scope"', '"Starting reconciling cluster"', diff --git a/hack/output-for-watch.sh b/hack/output-for-watch.sh index df6976e87..ddc21a868 100755 --- a/hack/output-for-watch.sh +++ b/hack/output-for-watch.sh @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License.
-function print_heading(){ +function print_heading() { blue='\033[0;34m' nc='\033[0m' # No Color echo -e "${blue}${1}${nc}" @@ -36,6 +36,10 @@ print_heading hetznerbaremetalmachine: kubectl get hetznerbaremetalmachine -A +print_heading hetznerbaremetalhost: + +kubectl get hetznerbaremetalhost -A + print_heading events: kubectl get events -A --sort-by=lastTimestamp | grep -vP 'LeaderElection' | tail -8 @@ -51,9 +55,9 @@ if [ $(kubectl get machine -l cluster.x-k8s.io/control-plane 2>/dev/null | wc -l exit 1 fi -ip=$(kubectl get machine -l cluster.x-k8s.io/control-plane -o jsonpath='{.items[0].status.addresses[?(@.type=="ExternalIP")].address}' | grep -oP '[0-9.]{8,}') +ip=$(kubectl get machine -l cluster.x-k8s.io/control-plane -o jsonpath='{.items[0].status.addresses[?(@.type=="ExternalIP")].address}' | grep -oP '[0-9.]{8,}') if [ -z "$ip" ]; then - ip=$(kubectl get machine -l cluster.x-k8s.io/control-plane -o jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}' | grep -oP '[0-9.]{8,}') + ip=$(kubectl get machine -l cluster.x-k8s.io/control-plane -o jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}' | grep -oP '[0-9.]{8,}') if [ -z "$ip" ]; then echo "❌ Could not get IP of control-plane" fi @@ -75,7 +79,6 @@ echo kubeconfig_wl=".workload-cluster-kubeconfig.yaml" - echo "KUBECONFIG=$kubeconfig_wl kubectl cluster-info" if KUBECONFIG=$kubeconfig_wl kubectl cluster-info >/dev/null 2>&1; then echo "👌 cluster is reachable" @@ -90,7 +93,7 @@ deployment=$(KUBECONFIG=$kubeconfig_wl kubectl get -n kube-system deployment | g if [ -z "$deployment" ]; then echo "❌ ccm not installed?" 
else - echo "👌 ccm installed:" + echo "👌 ccm installed:" KUBECONFIG=$kubeconfig_wl kubectl get -n kube-system deployment "$deployment" yaml=$(KUBECONFIG=$kubeconfig_wl kubectl get -n kube-system deployment "$deployment" -o yaml) if [[ $yaml =~ "unavailableReplicas:" ]]; then diff --git a/pkg/services/baremetal/client/ssh/download-from-oci.sh b/pkg/services/baremetal/client/ssh/download-from-oci.sh new file mode 100755 index 000000000..72f7fc452 --- /dev/null +++ b/pkg/services/baremetal/client/ssh/download-from-oci.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +# Copyright 2023 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This scripts gets copied from the controller into the rescue system +# of the bare-metal machine. + +set -euo pipefail + +image="${1:-}" +outfile="${2:-}" + +function usage { + echo "$0 image outfile." + echo " Download a machine image from a container registry" + echo " image: for example ghcr.io/foo/bar/my-machine-image:v9" + echo " outfile: Created file. Usualy with file extensions '.tgz'" + echo " If the oci registry needs a token, then the script uses OCI_REGISTRY_AUTH_TOKEN (if set)" + echo " Example of OCI_REGISTRY_AUTH_TOKEN: github:ghp_SN51...." 
+ echo +} +if [ -z "$outfile" ]; then + usage + exit 1 +fi +OCI_REGISTRY_AUTH_TOKEN="${OCI_REGISTRY_AUTH_TOKEN:-}" # github:$GITHUB_TOKEN + +# Extract registry +registry="${image%%/*}" + +# Extract scope and tag +remainder="${image#*/}" +scope="${remainder%:*}" +tag="${remainder##*:}" + +if [[ -z "$registry" || -z "$scope" || -z "$tag" ]]; then + echo "failed to parse registry, scope and tag from image" + echo "image=$image" + echo "registry=$registry" + echo "scope=$scope" + echo "tag=$tag" + exit 1 +fi + +function download_with_token { + echo "download with token (OCI_REGISTRY_AUTH_TOKEN set)" + if [[ "$OCI_REGISTRY_AUTH_TOKEN" != *:* ]]; then + echo "OCI_REGISTRY_AUTH_TOKEN needs to contain a ':' (user:token)" + exit 1 + fi + + token=$(curl -fsSL -u "$OCI_REGISTRY_AUTH_TOKEN" "https://${registry}/token?scope=repository:$scope:pull" | jq -r '.token') + if [ -z "$token" ]; then + echo "Failed to get token for container registry" + exit 1 + fi + + echo "Login to $registry was successful" + + digest=$(curl -sSL -H "Authorization: Bearer $token" -H "Accept: application/vnd.oci.image.manifest.v1+json" \ + "https://${registry}/v2/${scope}/manifests/${tag}" | jq -r '.layers[0].digest') + + if [ -z "$digest" ]; then + echo "Failed to get digest from container registry" + exit 1 + fi + + echo "Start download of $image" + curl -fsSL -H "Authorization: Bearer $token" \ + "https://${registry}/v2/${scope}/blobs/$digest" >"$outfile" +} + +function download_without_token { + echo "download without token (OCI_REGISTRY_AUTH_TOKEN empty)" + digest=$(curl -sSL -H "Accept: application/vnd.oci.image.manifest.v1+json" \ + "https://${registry}/v2/${scope}/manifests/${tag}" | jq -r '.layers[0].digest') + + if [ -z "$digest" ]; then + echo "Failed to get digest from container registry" + exit 1 + fi + + echo "Start download of $image" + curl -fsSL "https://${registry}/v2/${scope}/blobs/$digest" >"$outfile" +} + +if [ -z "$OCI_REGISTRY_AUTH_TOKEN" ]; then + download_without_token +else + 
download_with_token +fi diff --git a/pkg/services/baremetal/client/ssh/ssh_client.go b/pkg/services/baremetal/client/ssh/ssh_client.go index 72f63961a..84b8d6ca6 100644 --- a/pkg/services/baremetal/client/ssh/ssh_client.go +++ b/pkg/services/baremetal/client/ssh/ssh_client.go @@ -35,6 +35,111 @@ const ( sshTimeOut time.Duration = 5 * time.Second ) +var downloadFromOciShellScript = `#!/bin/bash + +# Copyright 2023 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This scripts gets copied from the controller into the rescue system +# of the bare-metal machine. + +set -euo pipefail + +image="${1:-}" +outfile="${2:-}" + +function usage { + echo "$0 image outfile." + echo " Download a machine image from a container registry" + echo " image: for example ghcr.io/foo/bar/my-machine-image:v9" + echo " outfile: Created file. Usually with file extensions '.tgz'" + echo " If the oci registry needs a token, then the script uses OCI_REGISTRY_AUTH_TOKEN (if set)" + echo " Example of OCI_REGISTRY_AUTH_TOKEN: github:ghp_SN51...." 
+ echo +} +if [ -z "$outfile" ]; then + usage + exit 1 +fi +OCI_REGISTRY_AUTH_TOKEN="${OCI_REGISTRY_AUTH_TOKEN:-}" # github:$GITHUB_TOKEN + +# Extract registry +registry="${image%%/*}" + +# Extract scope and tag +remainder="${image#*/}" +scope="${remainder%:*}" +tag="${remainder##*:}" + +if [[ -z "$registry" || -z "$scope" || -z "$tag" ]]; then + echo "failed to parse registry, scope and tag from image" + echo "image=$image" + echo "registry=$registry" + echo "scope=$scope" + echo "tag=$tag" + exit 1 +fi + +function download_with_token { + echo "download with token (OCI_REGISTRY_AUTH_TOKEN set)" + if [[ "$OCI_REGISTRY_AUTH_TOKEN" != *:* ]]; then + echo "OCI_REGISTRY_AUTH_TOKEN needs to contain a ':' (user:token)" + exit 1 + fi + + token=$(curl -fsSL -u "$OCI_REGISTRY_AUTH_TOKEN" "https://${registry}/token?scope=repository:$scope:pull" | jq -r '.token') + if [ -z "$token" ]; then + echo "Failed to get token for container registry" + exit 1 + fi + + echo "Login to $registry was successful" + + digest=$(curl -sSL -H "Authorization: Bearer $token" -H "Accept: application/vnd.oci.image.manifest.v1+json" \ + "https://${registry}/v2/${scope}/manifests/${tag}" | jq -r '.layers[0].digest') + + if [ -z "$digest" ]; then + echo "Failed to get digest from container registry" + exit 1 + fi + + echo "Start download of $image" + curl -fsSL -H "Authorization: Bearer $token" \ + "https://${registry}/v2/${scope}/blobs/$digest" >"$outfile" +} + +function download_without_token { + echo "download without token (OCI_REGISTRY_AUTH_TOKEN empty)" + digest=$(curl -sSL -H "Accept: application/vnd.oci.image.manifest.v1+json" \ + "https://${registry}/v2/${scope}/manifests/${tag}" | jq -r '.layers[0].digest') + + if [ -z "$digest" ]; then + echo "Failed to get digest from container registry" + exit 1 + fi + + echo "Start download of $image" + curl -fsSL "https://${registry}/v2/${scope}/blobs/$digest" >"$outfile" +} + +if [ -z "$OCI_REGISTRY_AUTH_TOKEN" ]; then + download_without_token +else + 
download_with_token +fi +` + var ( // ErrCommandExitedWithoutExitSignal means the ssh command exited unplanned. ErrCommandExitedWithoutExitSignal = errors.New("wait: remote command exited without exit status or exit signal") @@ -221,7 +326,16 @@ EOF`, data)) // DownloadImage implements the DownloadImage method of the SSHClient interface. func (c *sshClient) DownloadImage(path, url string) Output { - return c.runSSH(fmt.Sprintf(`curl -sLo "%q" "%q"`, path, url)) + if !strings.HasPrefix(url, "oci://") { + return c.runSSH(fmt.Sprintf(`curl -sLo "%q" "%q"`, path, url)) + } + return c.runSSH(fmt.Sprintf(`cat << 'ENDOFSCRIPT' > /root/download-from-oci.sh +%s +ENDOFSCRIPT +chmod a+rx /root/download-from-oci.sh +OCI_REGISTRY_AUTH_TOKEN=%s /root/download-from-oci.sh %s %s`, downloadFromOciShellScript, + os.Getenv("OCI_REGISTRY_AUTH_TOKEN"), + strings.TrimPrefix(url, "oci://"), path)) } // CreatePostInstallScript implements the CreatePostInstallScript method of the SSHClient interface. diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 330936a64..4d875fc3b 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -104,7 +104,7 @@ func (s *Service) Reconcile(ctx context.Context) (result reconcile.Result, err e conditions.SetSummary(s.scope.HetznerBareMetalHost) // save host if it changed during reconciliation - if !reflect.DeepEqual(oldHost, *s.scope.HetznerBareMetalHost) { + if !reflect.DeepEqual(oldHost, s.scope.HetznerBareMetalHost) { saveResult, saveErr := SaveHostAndReturn(ctx, s.scope.Client, s.scope.HetznerBareMetalHost) emptyResult := reconcile.Result{} if result == emptyResult && err == nil { @@ -978,17 +978,22 @@ func (s *Service) actionImageInstalling() actionResult { record.Warnf(s.scope.HetznerBareMetalHost, "ExecuteInstallImageFailed", out.StdOut) return actionError{err: fmt.Errorf("failed to execute installimage: %w", out.Err)} } + record.Eventf(s.scope.HetznerBareMetalHost, 
"ExecuteInstallImageSucceeded", out.StdOut) + s.scope.Logger.Info("ExecuteInstallImageSucceeded", "stdout", out.StdOut, "stderr", out.StdErr) // Update name in robot API if _, err := s.scope.RobotClient.SetBMServerName(s.scope.HetznerBareMetalHost.Spec.ServerID, autoSetupInput.hostName); err != nil { + record.Warn(s.scope.HetznerBareMetalHost, "SetBMServerNameFailed", err.Error()) s.handleRobotRateLimitExceeded(err, "SetBMServerName") return actionError{err: fmt.Errorf("failed to update name of host in robot API: %w", err)} } if err := handleSSHError(sshClient.Reboot()); err != nil { + record.Warn(s.scope.HetznerBareMetalHost, "RebootFailed", err.Error()) return actionError{err: fmt.Errorf("failed to reboot server: %w", err)} } + s.scope.Logger.Info("RebootAfterInstallimageSucceeded", "stdout", out.StdOut, "stderr", out.StdErr) // clear potential errors - all done s.scope.HetznerBareMetalHost.ClearError() @@ -1011,7 +1016,7 @@ func (s *Service) createAutoSetupInput(sshClient sshclient.Client) (autoSetupInp if needsDownload { out := sshClient.DownloadImage(imagePath, image.URL) if err := handleSSHError(out); err != nil { - return autoSetupInput{}, actionError{err: fmt.Errorf("failed to download image: %w", err)} + return autoSetupInput{}, actionError{err: fmt.Errorf("failed to download image: %s %s %w", out.StdOut, out.StdErr, err)} } } @@ -1068,9 +1073,30 @@ func (s *Service) actionProvisioning() actionResult { wantHostName := infrav1.BareMetalHostNamePrefix + host.Spec.ConsumerRef.Name out := sshClient.GetHostName() + + if out.Err != nil { + msg := fmt.Sprintf("ssh to port %d failed: %s (%s): %v", portAfterInstallImage, out.StdOut, out.StdErr, out.Err) + conditions.MarkFalse(host, + infrav1.SSHAfterInstallImageSucceededCondition, + infrav1.SSHAfterInstallImageFailedReason, + clusterv1.ConditionSeverityWarning, msg) + + if time.Now().After(conditions.GetLastTransitionTime( + s.scope.HetznerBareMetalHost, + infrav1.SSHAfterInstallImageSucceededCondition).Add(7 * 
time.Minute)) { + // We waited some minutes. Still no connection. There seems to be something wrong. + record.Warn(host, infrav1.SSHAfterInstallImageFailedReason, msg) + } + } else { + conditions.MarkTrue(s.scope.HetznerBareMetalHost, + infrav1.SSHAfterInstallImageSucceededCondition) + } + if trimLineBreak(out.StdOut) != wantHostName { - // give the reboot some time until it takes effect + // Give the reboot some time until it takes effect. Otherwise the ssh connection gets done too fast, + // and it will connect to the machine before it gets rebooted. if s.hasJustRebooted() { + s.scope.Logger.Info("hasJustRebooted is true", "LastUpdated", host.Spec.Status.LastUpdated) return actionContinue{delay: 2 * time.Second} } @@ -1558,6 +1584,10 @@ func (s *Service) handleRobotRateLimitExceeded(err error, functionName string) { } } +// hasJustRebooted returns true if a reboot was done during the last seconds. +// The method gets used to let the controller wait until the reboot was actually done. +// Imagine the controller triggers a reboot, and reconciles immediately. This would +// mean the controller would do the same reboot immediately again. func (s *Service) hasJustRebooted() bool { return (s.scope.HetznerBareMetalHost.Spec.Status.ErrorType == infrav1.ErrorTypeSSHRebootTriggered || s.scope.HetznerBareMetalHost.Spec.Status.ErrorType == infrav1.ErrorTypeSoftwareRebootTriggered ||