Skip to content

Commit

Permalink
✨ download BM images via ocr-registry.
Browse files Browse the repository at this point in the history
  • Loading branch information
guettli committed Dec 5, 2023
1 parent a12b364 commit 4ecdc8a
Show file tree
Hide file tree
Showing 8 changed files with 159 additions and 4 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ wait-and-get-secret:
$(KUBECTL) get secrets $(CLUSTER_NAME)-kubeconfig -o json | jq -r .data.value | base64 --decode > $(WORKER_CLUSTER_KUBECONFIG)
${TIMEOUT} 15m bash -c "while ! $(KUBECTL) --kubeconfig=$(WORKER_CLUSTER_KUBECONFIG) get nodes | grep control-plane; do sleep 1; done"

install-cilium-in-wl-cluster:
install-cilium-in-wl-cluster: $(HELM)
# Deploy cilium
$(HELM) repo add cilium https://helm.cilium.io/
$(HELM) repo update cilium
Expand Down
7 changes: 7 additions & 0 deletions api/v1beta1/conditions_const.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,13 @@ const (
ServerNotFoundReason = "ServerNotFound"
)

const (
// SSHAfterInstallImageSucceededCondition indicates that the host is reachable via ssh after installImage.
// It is marked false with SSHAfterInstallImageFailedReason while the ssh connection fails,
// and marked true once the connection works.
SSHAfterInstallImageSucceededCondition clusterv1.ConditionType = "SSHAfterInstallImageSucceeded"

// SSHAfterInstallImageFailedReason is the reason set on SSHAfterInstallImageSucceededCondition
// when the host was not reachable via ssh after installImage.
SSHAfterInstallImageFailedReason = "SSHAfterInstallImageFailed"
)
const (
// HostAssociateSucceededCondition indicates that a host has been associated.
HostAssociateSucceededCondition clusterv1.ConditionType = "HostAssociateSucceeded"
Expand Down
3 changes: 3 additions & 0 deletions api/v1beta1/hetznerbaremetalmachine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,9 @@ func (bmMachine *HetznerBareMetalMachine) SetFailure(reason capierrors.MachineSt

// GetImageSuffix tests whether the suffix is known and outputs it if yes. Otherwise it returns an error.
func GetImageSuffix(url string) (string, error) {
if strings.HasPrefix(url, "oci://") {
return "tar.gz", nil
}
for _, suffix := range []ImageType{
ImageTypeTar,
ImageTypeTarGz,
Expand Down
5 changes: 4 additions & 1 deletion docs/reference/hetzner-bare-metal-machine-template.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Updating a `HetznerBareMetalMachineTemplate` is not possible. Instead, a new tem

## cloud-init and installimage

Both in installimage and cloud-init the ports used for SSH can be changed, e.g. with the following code snippet:
Both in [installimage](https://docs.hetzner.com/robot/dedicated-server/operating-systems/installimage/) and cloud-init the ports used for SSH can be changed, e.g. with the following code snippet:

```
sed -i -e '/^\(#\|\)Port/s/^.*$/Port 2223/' /etc/ssh/sshd_config
Expand Down Expand Up @@ -75,3 +75,6 @@ Via MatchLabels you can specify a certain label (key and value) that identifies
| template.spec.sshSpec.secretRef.key.privateKey | string | | yes | PrivateKey is the key in the secret's data where the SSH key's private key is stored |
| template.spec.sshSpec.portAfterInstallImage | int | 22 | no | PortAfterInstallImage specifies the port that can be used to reach the server via SSH after install image completed successfully |
| template.spec.sshSpec.portAfterCloudInit | int | 22 (install image port) | no | PortAfterCloudInit specifies the port that can be used to reach the server via SSH after cloud init completed successfully |

### installImage.image

2 changes: 2 additions & 0 deletions hack/filter-caph-controller-manager-logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
rows_to_skip = [
'controller-runtime.webhook', 'certwatcher/certwatcher', 'Registering a validating webhook',
'Registering a mutating webhook', 'Starting EventSource',
'Starting Controller',
'"Starting workers" controller/controller',
'"Reconciling finished"',
'"Creating cluster scope"',
'"Starting reconciling cluster"',
Expand Down
103 changes: 103 additions & 0 deletions pkg/services/baremetal/client/ssh/download-from-oci.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/bin/bash

# Copyright 2023 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script gets copied from the controller into the rescue system
# of the bare-metal machine.

# Abort on errors, on unset variables and on failures inside pipelines.
set -euo pipefail

# Positional arguments. They default to the empty string so that "set -u"
# does not abort the script before the usage text can be printed.
image="${1:-}"
outfile="${2:-}"

# usage prints the command line syntax and explains the optional
# OCI_REGISTRY_AUTH_TOKEN environment variable.
function usage {
    echo "$0 image outfile."
    echo " Download a machine image from a container registry"
    echo " image: for example ghcr.io/foo/bar/my-machine-image:v9"
    echo " outfile: Created file. Usually with file extension '.tgz'"
    echo " If the oci registry needs a token, then the script uses OCI_REGISTRY_AUTH_TOKEN (if set)"
    echo " Example of OCI_REGISTRY_AUTH_TOKEN: github:ghp_SN51...."
    echo
}

# Both arguments are mandatory. An explicitly empty image ("" outfile) is
# rejected here as well instead of failing later while parsing the reference.
if [ -z "$image" ] || [ -z "$outfile" ]; then
    usage
    exit 1
fi

# Optional credentials in the form user:token, for example github:$GITHUB_TOKEN.
OCI_REGISTRY_AUTH_TOKEN="${OCI_REGISTRY_AUTH_TOKEN:-}"

# Split an image reference of the form "registry/scope:tag" into its parts.
# Example: ghcr.io/foo/bar/my-machine-image:v9
#   registry = ghcr.io
#   scope    = foo/bar/my-machine-image
#   tag      = v9

# Extract registry: everything up to the first "/".
registry="${image%%/*}"

# Extract scope and tag: the tag follows the last ":" of the remainder.
remainder="${image#*/}"
scope="${remainder%:*}"
tag="${remainder##*:}"

# If the pattern does not match, the expansions above return their input
# unchanged. A reference without "/" or without ":tag" would therefore yield
# non-empty but wrong values, so the separators are checked explicitly.
if [[ "$image" != */* || "$remainder" != *:* || -z "$registry" || -z "$scope" || -z "$tag" ]]; then
    echo "failed to parse registry, scope and tag from image"
    echo "image=$image"
    echo "registry=$registry"
    echo "scope=$scope"
    echo "tag=$tag"
    exit 1
fi

# download_with_token downloads the first layer of $image to $outfile.
# It uses OCI_REGISTRY_AUTH_TOKEN (user:token) to request a bearer token from
# the registry and sends that bearer token with the manifest and blob requests.
# Reads the globals registry, scope, tag, image and outfile.
function download_with_token {
    local token manifest digest

    echo "download with token (OCI_REGISTRY_AUTH_TOKEN set)"
    if [[ "$OCI_REGISTRY_AUTH_TOKEN" != *:* ]]; then
        echo "OCI_REGISTRY_AUTH_TOKEN needs to contain a ':' (user:token)"
        exit 1
    fi

    # "// empty" makes jq print nothing instead of the literal string "null"
    # when the key is missing, so that the -z check below works.
    token=$(curl -fsSL -u "$OCI_REGISTRY_AUTH_TOKEN" "https://${registry}/token?scope=repository:$scope:pull" | jq -r '.token // empty')
    if [ -z "$token" ]; then
        echo "Failed to get token for container registry"
        exit 1
    fi

    echo "Login to $registry was successful"

    # -f lets curl fail on HTTP errors instead of handing an error document to jq.
    if ! manifest=$(curl -fsSL -H "Authorization: Bearer $token" -H "Accept: application/vnd.oci.image.manifest.v1+json" \
        "https://${registry}/v2/${scope}/manifests/${tag}"); then
        echo "Failed to get manifest from container registry"
        exit 1
    fi

    # The machine image is expected to be the first layer of the manifest.
    digest=$(jq -r '.layers[0].digest // empty' <<<"$manifest" || true)
    if [ -z "$digest" ]; then
        echo "Failed to get digest from container registry"
        exit 1
    fi

    echo "Start download of $image"
    curl -fsSL -H "Authorization: Bearer $token" \
        "https://${registry}/v2/${scope}/blobs/$digest" >"$outfile"
}

# download_without_token downloads the first layer of $image to $outfile
# without sending any credentials to the registry.
# Reads the globals registry, scope, tag, image and outfile.
function download_without_token {
    local manifest digest

    echo "download without token (OCI_REGISTRY_AUTH_TOKEN empty)"

    # -f lets curl fail on HTTP errors (for example if the registry requires
    # authentication) instead of handing an error document to jq.
    if ! manifest=$(curl -fsSL -H "Accept: application/vnd.oci.image.manifest.v1+json" \
        "https://${registry}/v2/${scope}/manifests/${tag}"); then
        echo "Failed to get manifest from container registry"
        exit 1
    fi

    # "// empty" makes jq print nothing instead of the literal string "null"
    # when the manifest has no layers, so that the -z check below works.
    digest=$(jq -r '.layers[0].digest // empty' <<<"$manifest" || true)
    if [ -z "$digest" ]; then
        echo "Failed to get digest from container registry"
        exit 1
    fi

    echo "Start download of $image"
    curl -fsSL "https://${registry}/v2/${scope}/blobs/$digest" >"$outfile"
}

# Take the authenticated path only when credentials were provided.
if [[ -n "$OCI_REGISTRY_AUTH_TOKEN" ]]; then
    download_with_token
else
    download_without_token
fi
16 changes: 15 additions & 1 deletion pkg/services/baremetal/client/ssh/ssh_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ limitations under the License.
package sshclient

import (
_ "embed"

Check failure on line 21 in pkg/services/baremetal/client/ssh/ssh_client.go

View workflow job for this annotation

GitHub Actions / Lint Pull Request

File is not `gci`-ed with --skip-generated -s standard -s default -s prefix(github.com/syself/cluster-api-provider-hetzner) (gci)

"bufio"
"bytes"

Check failure on line 24 in pkg/services/baremetal/client/ssh/ssh_client.go

View workflow job for this annotation

GitHub Actions / Lint Pull Request

File is not `gci`-ed with --skip-generated -s standard -s default -s prefix(github.com/syself/cluster-api-provider-hetzner) (gci)
"encoding/base64"
Expand All @@ -35,6 +37,9 @@ const (
sshTimeOut time.Duration = 5 * time.Second
)

//go:embed download-from-oci.sh
var downloadFromOciShellScript string

var (
// ErrCommandExitedWithoutExitSignal means the ssh command exited unplanned.
ErrCommandExitedWithoutExitSignal = errors.New("wait: remote command exited without exit status or exit signal")
Expand Down Expand Up @@ -221,7 +226,16 @@ EOF`, data))

// DownloadImage implements the DownloadImage method of the SSHClient interface.
//
// URLs without the "oci://" prefix are downloaded to path with curl. For "oci://" URLs the
// embedded download-from-oci.sh script is written to /root/download-from-oci.sh on the remote
// side and executed with the image reference (the URL without the prefix) and path as arguments.
// The value of OCI_REGISTRY_AUTH_TOKEN is read from the environment of the calling process and
// passed to the script.
func (c *sshClient) DownloadImage(path, url string) Output {
	// shellQuote wraps s in single quotes so that the remote shell passes it on verbatim.
	// Embedded single quotes are closed, escaped and reopened ('\'').
	shellQuote := func(s string) string {
		return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
	}

	if !strings.HasPrefix(url, "oci://") {
		// Note: %q inside double quotes would produce ""value"", which leaves the value
		// unquoted for the shell (for example "&" in a URL would background the command).
		return c.runSSH(fmt.Sprintf("curl -sLo %s %s", shellQuote(path), shellQuote(url)))
	}

	// The quoted heredoc delimiter ('ENDOFSCRIPT') prevents the remote shell from expanding
	// anything inside the script while it is written to disk.
	return c.runSSH(fmt.Sprintf(`cat << 'ENDOFSCRIPT' > /root/download-from-oci.sh
%s
ENDOFSCRIPT
chmod a+rx /root/download-from-oci.sh
OCI_REGISTRY_AUTH_TOKEN=%s /root/download-from-oci.sh %s %s`, downloadFromOciShellScript,
		shellQuote(os.Getenv("OCI_REGISTRY_AUTH_TOKEN")),
		shellQuote(strings.TrimPrefix(url, "oci://")), shellQuote(path)))
}

// CreatePostInstallScript implements the CreatePostInstallScript method of the SSHClient interface.
Expand Down
25 changes: 24 additions & 1 deletion pkg/services/baremetal/host/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -1011,7 +1011,15 @@ func (s *Service) createAutoSetupInput(sshClient sshclient.Client) (autoSetupInp
if needsDownload {
out := sshClient.DownloadImage(imagePath, image.URL)
if err := handleSSHError(out); err != nil {
return autoSetupInput{}, actionError{err: fmt.Errorf("failed to download image: %w", err)}
// TODO: this could fail like this, if the registry requires auth, but no token is given.
// TODO: This should be visible in the Conditions somehow.
// 15:40:37 ERROR "Reconciler error" controller/controller.go:324 {'HetznerBareMetalHost': ..
// 'error': 'failed to reconcile HetznerBareMetalHost default/test-bm-gpu: action "image-installing"
// failed: failed to download image: \ndownload without token (OCI_REGISTRY_AUTH_TOKEN)\n
// Start download of
// ghcr.io/syself/autopilot/node-images/staging/hetzner-apalla-1-27-workeramd64baremetal:v9-beta-1\n
// curl: (22) The requested URL returned error: 404\n failed to perform ssh command: Process exited with status 22'}
return autoSetupInput{}, actionError{err: fmt.Errorf("failed to download image: %s %s %w", out.StdOut, out.StdErr, err)}
}
}

Expand Down Expand Up @@ -1073,6 +1081,18 @@ func (s *Service) actionProvisioning() actionResult {
if s.hasJustRebooted() {
return actionContinue{delay: 2 * time.Second}
}
msg := fmt.Sprintf("ssh to port %d failed: %s (%s): %v", portAfterInstallImage, out.StdOut, out.StdErr, out.Err)
conditions.MarkFalse(host,
infrav1.SSHAfterInstallImageSucceededCondition,
infrav1.SSHAfterInstallImageFailedReason,
clusterv1.ConditionSeverityWarning, msg)

if time.Now().After(conditions.GetLastTransitionTime(
s.scope.HetznerBareMetalHost,
infrav1.SSHAfterInstallImageSucceededCondition).Add(4 * time.Minute)) {
// We waited some minutes. Still no connection. There seems to be something wrong.
record.Warn(host, infrav1.SSHAfterInstallImageFailedReason, msg)
}

privateKeyRescue := sshclient.CredentialsFromSecret(s.scope.RescueSSHSecret, s.scope.HetznerCluster.Spec.SSHKeys.RobotRescueSecretRef).PrivateKey
rescueSSHClient := s.scope.SSHClientFactory.NewClient(sshclient.Input{
Expand All @@ -1099,6 +1119,9 @@ func (s *Service) actionProvisioning() actionResult {
return actionContinue{delay: 10 * time.Second}
}

conditions.MarkTrue(s.scope.HetznerBareMetalHost,
infrav1.SSHAfterInstallImageSucceededCondition)

// we are in correct boot and can start provisioning
if failedAction := s.provision(sshClient, host.Spec.ConsumerRef.Name); failedAction != nil {
return failedAction
Expand Down

0 comments on commit 4ecdc8a

Please sign in to comment.