diff --git a/.github/workflows/build-and-push.yml b/.github/workflows/build-and-push.yml index b6c37a71..bd781fc9 100644 --- a/.github/workflows/build-and-push.yml +++ b/.github/workflows/build-and-push.yml @@ -44,6 +44,9 @@ jobs: docker buildx build \ --provenance=false \ --platform linux/amd64,linux/arm64 \ + --label "org.opencontainers.image.source=https://github.com/cloudbase/garm/tree/${{ github.event.inputs.ref }}" \ + --label "org.opencontainers.image.description=GARM ${{ github.event.inputs.ref }}" \ + --label "org.opencontainers.image.licenses=Apache-2.0" \ --build-arg="GARM_REF=${{ github.event.inputs.ref }}" \ -t ${{ github.event.inputs.push_to_project }}/garm:"${VERSION}" \ --push . \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 40c2cc8e..b333c502 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,40 +8,42 @@ RUN git config --global --add safe.directory /build ADD . /build/garm RUN cd /build/garm && git checkout ${GARM_REF} -RUN git clone https://github.com/cloudbase/garm-provider-azure /build/garm-provider-azure -RUN git clone https://github.com/cloudbase/garm-provider-openstack /build/garm-provider-openstack -RUN git clone https://github.com/cloudbase/garm-provider-lxd /build/garm-provider-lxd -RUN git clone https://github.com/cloudbase/garm-provider-incus /build/garm-provider-incus -RUN git clone https://github.com/mercedes-benz/garm-provider-k8s /build/garm-provider-k8s -RUN git clone https://github.com/cloudbase/garm-provider-aws /build/garm-provider-aws -RUN git clone https://github.com/cloudbase/garm-provider-gcp /build/garm-provider-gcp -RUN git clone https://github.com/cloudbase/garm-provider-equinix /build/garm-provider-equinix +RUN git clone --depth 1 --branch v0.1.0 https://github.com/cloudbase/garm-provider-azure /build/garm-provider-azure +RUN git clone --depth 1 --branch v0.1.0 https://github.com/cloudbase/garm-provider-openstack /build/garm-provider-openstack +RUN git clone --depth 1 --branch v0.1.0
https://github.com/cloudbase/garm-provider-lxd /build/garm-provider-lxd +RUN git clone --depth 1 --branch v0.1.0 https://github.com/cloudbase/garm-provider-incus /build/garm-provider-incus +RUN git clone --depth 1 --branch v0.1.0 https://github.com/cloudbase/garm-provider-aws /build/garm-provider-aws +RUN git clone --depth 1 --branch v0.1.0 https://github.com/cloudbase/garm-provider-gcp /build/garm-provider-gcp +RUN git clone --depth 1 --branch v0.1.0 https://github.com/cloudbase/garm-provider-equinix /build/garm-provider-equinix + +RUN git clone --depth 1 --branch v0.3.1 https://github.com/mercedes-benz/garm-provider-k8s /build/garm-provider-k8s RUN cd /build/garm && go build -o /bin/garm \ -tags osusergo,netgo,sqlite_omit_load_extension \ -ldflags "-linkmode external -extldflags '-static' -s -w -X github.com/cloudbase/garm/util/appdefaults.Version=$(git describe --tags --match='v[0-9]*' --dirty --always)" \ /build/garm/cmd/garm && upx /bin/garm RUN mkdir -p /opt/garm/providers.d -RUN cd /build/garm-provider-azure && go build -ldflags="-linkmode external -extldflags '-static' -s -w" -o /opt/garm/providers.d/garm-provider-azure . && upx /opt/garm/providers.d/garm-provider-azure -RUN cd /build/garm-provider-openstack && go build -ldflags="-linkmode external -extldflags '-static' -s -w" -o /opt/garm/providers.d/garm-provider-openstack . && upx /opt/garm/providers.d/garm-provider-openstack -RUN cd /build/garm-provider-lxd && go build -ldflags="-linkmode external -extldflags '-static' -s -w" -o /opt/garm/providers.d/garm-provider-lxd . && upx /opt/garm/providers.d/garm-provider-lxd -RUN cd /build/garm-provider-incus && go build -ldflags="-linkmode external -extldflags '-static' -s -w" -o /opt/garm/providers.d/garm-provider-incus . && upx /opt/garm/providers.d/garm-provider-incus +RUN cd /build/garm-provider-azure && go build -ldflags="-linkmode external -extldflags '-static' -s -w -X main.Version=v0.1.0" -o /opt/garm/providers.d/garm-provider-azure . 
&& upx /opt/garm/providers.d/garm-provider-azure +RUN cd /build/garm-provider-openstack && go build -ldflags="-linkmode external -extldflags '-static' -s -w -X main.Version=v0.1.0" -o /opt/garm/providers.d/garm-provider-openstack . && upx /opt/garm/providers.d/garm-provider-openstack +RUN cd /build/garm-provider-lxd && go build -ldflags="-linkmode external -extldflags '-static' -s -w -X main.Version=v0.1.0" -o /opt/garm/providers.d/garm-provider-lxd . && upx /opt/garm/providers.d/garm-provider-lxd +RUN cd /build/garm-provider-incus && go build -ldflags="-linkmode external -extldflags '-static' -s -w -X main.Version=v0.1.0" -o /opt/garm/providers.d/garm-provider-incus . && upx /opt/garm/providers.d/garm-provider-incus +RUN cd /build/garm-provider-aws && go build -ldflags="-linkmode external -extldflags '-static' -s -w -X main.Version=v0.1.0" -o /opt/garm/providers.d/garm-provider-aws . && upx /opt/garm/providers.d/garm-provider-aws +RUN cd /build/garm-provider-gcp && go build -ldflags="-linkmode external -extldflags '-static' -s -w -X main.Version=v0.1.0" -o /opt/garm/providers.d/garm-provider-gcp . && upx /opt/garm/providers.d/garm-provider-gcp +RUN cd /build/garm-provider-equinix && go build -ldflags="-linkmode external -extldflags '-static' -s -w -X main.Version=v0.1.0" -o /opt/garm/providers.d/garm-provider-equinix . && upx /opt/garm/providers.d/garm-provider-equinix + RUN cd /build/garm-provider-k8s/cmd/garm-provider-k8s && go build -ldflags="-linkmode external -extldflags '-static' -s -w" -o /opt/garm/providers.d/garm-provider-k8s . && upx /opt/garm/providers.d/garm-provider-k8s -RUN cd /build/garm-provider-aws && go build -ldflags="-linkmode external -extldflags '-static' -s -w" -o /opt/garm/providers.d/garm-provider-aws . && upx /opt/garm/providers.d/garm-provider-aws -RUN cd /build/garm-provider-gcp && go build -ldflags="-linkmode external -extldflags '-static' -s -w" -o /opt/garm/providers.d/garm-provider-gcp . 
&& upx /opt/garm/providers.d/garm-provider-gcp -RUN cd /build/garm-provider-equinix && go build -ldflags="-linkmode external -extldflags '-static' -s -w" -o /opt/garm/providers.d/garm-provider-equinix . && upx /opt/garm/providers.d/garm-provider-equinix -FROM scratch +FROM busybox COPY --from=builder /bin/garm /bin/garm COPY --from=builder /opt/garm/providers.d/garm-provider-openstack /opt/garm/providers.d/garm-provider-openstack COPY --from=builder /opt/garm/providers.d/garm-provider-lxd /opt/garm/providers.d/garm-provider-lxd COPY --from=builder /opt/garm/providers.d/garm-provider-incus /opt/garm/providers.d/garm-provider-incus -COPY --from=builder /opt/garm/providers.d/garm-provider-k8s /opt/garm/providers.d/garm-provider-k8s COPY --from=builder /opt/garm/providers.d/garm-provider-azure /opt/garm/providers.d/garm-provider-azure COPY --from=builder /opt/garm/providers.d/garm-provider-aws /opt/garm/providers.d/garm-provider-aws COPY --from=builder /opt/garm/providers.d/garm-provider-gcp /opt/garm/providers.d/garm-provider-gcp COPY --from=builder /opt/garm/providers.d/garm-provider-equinix /opt/garm/providers.d/garm-provider-equinix COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ +COPY --from=builder /opt/garm/providers.d/garm-provider-k8s /opt/garm/providers.d/garm-provider-k8s ENTRYPOINT ["/bin/garm", "-config", "/etc/garm/config.toml"] diff --git a/README.md b/README.md index bd08f034..4411834c 100644 --- a/README.md +++ b/README.md @@ -2,17 +2,39 @@ [![Go Tests](https://github.com/cloudbase/garm/actions/workflows/go-tests.yml/badge.svg)](https://github.com/cloudbase/garm/actions/workflows/go-tests.yml) + + +- [About GARM](#about-garm) +- [Join us on slack](#join-us-on-slack) +- [Installing](#installing) + - [Quickstart](#quickstart) + - [Installing on Kubernetes](#installing-on-kubernetes) +- [Using GARM](#using-garm) +- [Supported providers](#supported-providers) + - [Installing external providers](#installing-external-providers) +-
[Optimizing your runners](#optimizing-your-runners) +- [Write your own provider](#write-your-own-provider) + + + +## About GARM + Welcome to GARM! GARM enables you to create and automatically maintain pools of [self-hosted GitHub runners](https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners), with auto-scaling that can be used inside your github workflow runs. -The goal of ```GARM``` is to be simple to set up, simple to configure and simple to use. It is a single binary that can run on any GNU/Linux machine without any other requirements other than the providers it creates the runners in. It is intended to be easy to deploy in any environment and can create runners in any system you can write a provider for. There is no complicated setup process and no extremely complex concepts to understand. Once set up, it's meant to stay out of your way. +The goal of ```GARM``` is to be simple to set up, simple to configure and simple to use. The server itself is a single binary that can run on any GNU/Linux machine without any other requirements other than the providers you want to enable in your setup. It is intended to be easy to deploy in any environment and can create runners in virtually any system you can write a provider for. There is no complicated setup process and no extremely complex concepts to understand. Once set up, it's meant to stay out of your way. + +GARM supports creating pools in either GitHub itself or in your own deployment of [GitHub Enterprise Server](https://docs.github.com/en/enterprise-server@3.10/admin/overview/about-github-enterprise-server). For instructions on how to use ```GARM``` with GHE, see the [credentials](/doc/github_credentials.md) section of the documentation. -GARM supports creating pools on either GitHub itself or on your own deployment of [GitHub Enterprise Server](https://docs.github.com/en/enterprise-server@3.5/admin/overview/about-github-enterprise-server). 
For instructions on how to use ```GARM``` with GHE, see the [credentials](/doc/github_credentials.md) section of the documentation. +Through the use of providers, `GARM` can create runners in a variety of environments using the same `GARM` instance. Whether you want to create pools of runners in your OpenStack cloud, your Azure cloud or your Kubernetes cluster, that is easily achieved by just installing the appropriate providers, configuring them in `GARM` and creating pools that use them. You can create zero-runner pools for instances with high costs (large VMs, GPU enabled instances, etc) and have them spin up on demand, or you can create large pools of eagerly created k8s backed runners that can be used for your CI/CD pipelines at a moment's notice. You can mix them up and create pools in any combination of providers or resource allocations you want. -Through the use of providers, `GARM` can create runners in a variety of environments using the same `GARM` instance. Whether you want to create pools of runners in your OpenStack cloud, your Azure cloud and your Kubernetes cluster, that is easily achieved by just installing the appropriate providers, configuring them in `GARM` and creating pools that use them. You can create zero-runner pools for instances with high costs (large VMs, GPU enabled instances, etc) and have them spin up on demand, or you can create large pools of k8s backed runners that can be used for your CI/CD pipelines at a moment's notice. You can mix them up and create pools in any combination of providers or resource allocations you want. +Here is a brief architectural diagram of how GARM reacts to workflows triggered in GitHub (click the image to see a larger version): -:warning: **Important note**: The README and documentation in the `main` branch are relevant to the not yet released code that is present in `main`. Following the documentation from the `main` branch for a stable release of GARM, may lead to errors. 
To view the documentation for the latest stable release, please switch to the appropriate tag. For information about setting up `v0.1.4`, please refer to the [v0.1.4 tag](https://github.com/cloudbase/garm/tree/v0.1.4). +![GARM architecture diagram](/doc/images/garm-light.drawio.svg?raw=true#gh-light-mode-only) +![GARM architecture diagram](/doc/images/garm-dark.drawio.svg?raw=true#gh-dark-mode-only) + +:warning: **Important note**: The README and documentation in the `main` branch are relevant to the not-yet-released code that is present in `main`. Following the documentation from the `main` branch for a stable release of GARM may lead to errors. To view the documentation for the latest stable release, please switch to the appropriate tag. For information about setting up `v0.1.5`, please refer to the [v0.1.5 tag](https://github.com/cloudbase/garm/tree/v0.1.5). ## Join us on slack @@ -22,29 +44,17 @@ Whether you're running into issues or just want to drop by and say "hi", feel fr ## Installing -### On virtual or physical machines +### Quickstart Check out the [quickstart](/doc/quickstart.md) document for instructions on how to install ```GARM```. If you'd like to build from source, check out the [building from source](/doc/building_from_source.md) document. -### On Kubernetes - -Thanks to the efforts of the amazing folks at @mercedes-benz, GARM can now be integrated into k8s via their operator. Check out the [GARM operator](https://github.com/mercedes-benz/garm-operator/) for more details. - -## Configuration - -The ```GARM``` configuration is a simple ```toml```. The sample config file in [the testdata folder](/testdata/config.toml) is fairly well commented and should be enough to get you started. The configuration file is split into several sections, each of which is documented in its own page.
The sections are: +### Installing on Kubernetes -* [The default section](/doc/config_default.md) -* [Logging](/doc/config_logging.md) -* [Database](/doc/database.md) -* [Providers](/doc/providers.md) -* [Metrics](/doc/config_metrics.md) -* [JWT authentication](/doc/config_jwt_auth.md) -* [API server](/doc/config_api_server.md) +Thanks to the efforts of the amazing folks at [@mercedes-benz](https://github.com/mercedes-benz/), GARM can now be integrated into k8s via their operator. Check out the [GARM operator](https://github.com/mercedes-benz/garm-operator/) for more details. ## Using GARM -GARM is designed with simplicity in mind. At least we try to keep it as simple as possible. We're aware that adding a new tool in your workflow can be painful, especially when you already have to deal with so many. The cognitive load for OPS has reached a level where it feels overwhelming at times to even wrap your head around a new tool. As such, we believe that tools should be simple, should take no more than a few hours to understand and set up and if you absolutely need to interact with the tool, it should be as intuitive as possible. +GARM is designed with simplicity in mind. At least we try to keep it as simple as possible. We're aware that adding a new tool in your workflow can be painful, especially when you already have to deal with so many. The cognitive load for OPS has reached a level where it feels overwhelming at times to even wrap your head around a new tool. As such, we believe that tools should be simple, should take no more than a few hours to understand and set up and if you absolutely need to interact with the tool, it should be as intuitive as possible. Although we try our best to make this happen, we're aware that GARM has some rough edges, especially for new users. If you encounter issues or feel like the setup process was too complicated, please let us know. We're always looking to improve the user experience. 
We've written a short introduction into some of the commands that GARM has and some of the concepts involved in setting up GARM, managing runners and how GitHub does some of the things it does. diff --git a/apiserver/events/params.go b/apiserver/events/params.go index a5760d7f..274d3f1e 100644 --- a/apiserver/events/params.go +++ b/apiserver/events/params.go @@ -5,8 +5,8 @@ import ( ) type Filter struct { - Operations []common.OperationType `json:"operations"` - EntityType common.DatabaseEntityType `json:"entity-type"` + Operations []common.OperationType `json:"operations,omitempty" jsonschema:"title=operations,description=A list of operations to filter on,enum=create,enum=update,enum=delete"` + EntityType common.DatabaseEntityType `json:"entity-type,omitempty" jsonschema:"title=entity type,description=The type of entity to filter on,enum=repository,enum=organization,enum=enterprise,enum=pool,enum=user,enum=instance,enum=job,enum=controller,enum=github_credentials,enum=github_endpoint"` } func (f Filter) Validate() error { @@ -30,8 +30,8 @@ func (f Filter) Validate() error { } type Options struct { - SendEverything bool `json:"send-everything"` - Filters []Filter `json:"filters"` + SendEverything bool `json:"send-everything,omitempty" jsonschema:"title=send everything,description=send all events,default=false"` + Filters []Filter `json:"filters,omitempty" jsonschema:"title=filters,description=A list of filters to apply to the events.
This is ignored when send-everything is true"` } func (o Options) Validate() error { diff --git a/contrib/providers.d/azure/cloudconfig/install_runner.tpl b/contrib/providers.d/azure/cloudconfig/install_runner.tpl deleted file mode 100644 index 910d8eac..00000000 --- a/contrib/providers.d/azure/cloudconfig/install_runner.tpl +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - -set -e -set -o pipefail - -METADATA_URL="GARM_METADATA_URL" -CALLBACK_URL="GARM_CALLBACK_URL" -BEARER_TOKEN="GARM_CALLBACK_TOKEN" -DOWNLOAD_URL="GH_DOWNLOAD_URL" -DOWNLOAD_TOKEN="GH_TEMP_DOWNLOAD_TOKEN" -FILENAME="GH_FILENAME" -TARGET_URL="GH_TARGET_URL" -RUNNER_NAME="GH_RUNNER_NAME" -RUNNER_LABELS="GH_RUNNER_LABELS" -TEMP_TOKEN="" - - -if [ -z "$METADATA_URL" ];then - echo "no token is available and METADATA_URL is not set" - exit 1 -fi - -function call() { - PAYLOAD="$1" - curl --fail -s -X POST -d "${PAYLOAD}" -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${CALLBACK_URL}" || echo "failed to call home: exit code ($?)" -} - -function sendStatus() { - MSG="$1" - call "{\"status\": \"installing\", \"message\": \"$MSG\"}" -} - -function success() { - MSG="$1" - ID=$2 - call "{\"status\": \"idle\", \"message\": \"$MSG\", \"agent_id\": $ID}" -} - -function fail() { - MSG="$1" - call "{\"status\": \"failed\", \"message\": \"$MSG\"}" - exit 1 -} - -if [ ! 
-z "$DOWNLOAD_TOKEN" ]; then - TEMP_TOKEN="Authorization: Bearer $DOWNLOAD_TOKEN" -fi - -sendStatus "downloading tools from ${DOWNLOAD_URL}" -curl --fail -L -H "${TEMP_TOKEN}" -o "/home/runner/${FILENAME}" "${DOWNLOAD_URL}" || fail "failed to download tools" - -mkdir -p /home/runner/actions-runner || fail "failed to create actions-runner folder" - -sendStatus "extracting runner" -tar xf "/home/runner/${FILENAME}" -C /home/runner/actions-runner/ || fail "failed to extract runner" -chown runner:runner -R /home/runner/actions-runner/ || fail "failed to change owner" - -sendStatus "installing dependencies" -cd /home/runner/actions-runner -sudo ./bin/installdependencies.sh || fail "failed to install dependencies" - -sendStatus "fetching runner registration token" -GITHUB_TOKEN=$(curl --fail -s -X GET -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${METADATA_URL}" || fail "failed to get runner registration token") - -sendStatus "configuring runner" -sudo -u runner -- ./config.sh --unattended --url "${TARGET_URL}" --token "${GITHUB_TOKEN}" --name "${RUNNER_NAME}" --labels "${RUNNER_LABELS}" --ephemeral || fail "failed to configure runner" - -sendStatus "installing runner service" -./svc.sh install runner || fail "failed to install service" - -sendStatus "starting service" -./svc.sh start || fail "failed to start service" - -set +e -AGENT_ID=$(grep "agentId" /home/runner/actions-runner/.runner | tr -d -c 0-9) -if [ $? 
-ne 0 ];then - fail "failed to get agent ID" -fi -set -e - -success "runner successfully installed" $AGENT_ID \ No newline at end of file diff --git a/contrib/providers.d/azure/cloudconfig/userdata.tpl b/contrib/providers.d/azure/cloudconfig/userdata.tpl deleted file mode 100644 index 10ef2b51..00000000 --- a/contrib/providers.d/azure/cloudconfig/userdata.tpl +++ /dev/null @@ -1,31 +0,0 @@ -#cloud-config -package_upgrade: true -packages: - - curl - - tar -system_info: - default_user: - name: runner - home: /home/runner - shell: /bin/bash - groups: - - sudo - - adm - - cdrom - - dialout - - dip - - video - - plugdev - - netdev - - docker - - lxd - sudo: ALL=(ALL) NOPASSWD:ALL -runcmd: - - /install_runner.sh - - rm -f /install_runner.sh -write_files: - - encoding: b64 - content: RUNNER_INSTALL_B64 - owner: root:root - path: /install_runner.sh - permissions: "755" diff --git a/contrib/providers.d/azure/config.sh b/contrib/providers.d/azure/config.sh deleted file mode 100644 index f99f42ac..00000000 --- a/contrib/providers.d/azure/config.sh +++ /dev/null @@ -1,8 +0,0 @@ -# Azure service principal credentials -export AZURE_SUBSCRIPTION_ID="" -export AZURE_TENANT_ID="" -export AZURE_CLIENT_ID="" -export AZURE_CLIENT_SECRET="" - -# GARM config -export LOCATION="westeurope" diff --git a/contrib/providers.d/azure/garm-external-provider b/contrib/providers.d/azure/garm-external-provider deleted file mode 100755 index 7974f40f..00000000 --- a/contrib/providers.d/azure/garm-external-provider +++ /dev/null @@ -1,370 +0,0 @@ -#!/bin/bash - -set -e -set -o pipefail - -if [ ! 
-t 0 ] -then - INPUT=$(cat -) -fi -MYPATH=$(realpath ${BASH_SOURCE[0]}) -MYDIR=$(dirname "${MYPATH}") -TEMPLATES="$MYDIR/cloudconfig" - -# Defaults -LOCATION=${LOCATION:"westeurope"} - -# END Defaults - -if [ -z "$GARM_PROVIDER_CONFIG_FILE" ] -then - echo "no config file specified in env" - exit 1 -fi - -source "$GARM_PROVIDER_CONFIG_FILE" - -declare -A GARM_TO_GH_ARCH_MAP -GARM_TO_GH_ARCH_MAP["amd64"]="x64" -GARM_TO_GH_ARCH_MAP["arm"]="arm" -GARM_TO_GH_ARCH_MAP["arm64"]="arm64" - -declare -A AZURE_OS_TO_GH_OS_MAP -AZURE_OS_TO_GH_OS_MAP["Linux"]="linux" -AZURE_OS_TO_GH_OS_MAP["Windows"]="win" - -# https://docs.microsoft.com/en-us/azure/virtual-machines/states-billing#power-states-and-billing -declare -A AZURE_POWER_STATE_MAP -AZURE_POWER_STATE_MAP["VM starting"]="pending_create" -AZURE_POWER_STATE_MAP["VM running"]="running" -AZURE_POWER_STATE_MAP["VM stopping"]="stopped" -AZURE_POWER_STATE_MAP["VM stopped"]="stopped" -AZURE_POWER_STATE_MAP["VM deallocating"]="stopped" -AZURE_POWER_STATE_MAP["VM deallocated"]="stopped" - -# https://docs.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states -declare -A AZURE_PROVISION_STATE_MAP -AZURE_PROVISION_STATE_MAP["Creating"]="pending_create" -AZURE_PROVISION_STATE_MAP["Updating"]="pending_create" -AZURE_PROVISION_STATE_MAP["Migrating"]="pending_create" -AZURE_PROVISION_STATE_MAP["Failed"]="error" -AZURE_PROVISION_STATE_MAP["Succeeded"]="running" -AZURE_PROVISION_STATE_MAP["Deleting"]="pending_delete" - -function checkValNotNull() { - if [ -z "$1" -o "$1" == "null" ]; then - echo "failed to fetch value $2" - return 1 - fi - return 0 -} - -function requestedArch() { - ARCH=$(echo "$INPUT" | jq -c -r '.arch') - checkValNotNull "${ARCH}" "arch" || return $? 
- echo "${ARCH}" -} - -function downloadURL() { - [ -z "$1" -o -z "$2" ] && return 1 - GH_OS="${AZURE_OS_TO_GH_OS_MAP[$1]}" - GH_ARCH="${GARM_TO_GH_ARCH_MAP[$2]}" - URL=$(echo "$INPUT" | jq -c -r --arg OS "$GH_OS" --arg ARCH "$GH_ARCH" '(.tools[] | select( .os == $OS and .architecture == $ARCH)).download_url') - checkValNotNull "${URL}" "download URL" || return $? - echo "${URL}" -} - -function tempDownloadToken() { - [ -z "$1" -o -z "$2" ] && return 1 - GH_ARCH="${GARM_TO_GH_ARCH_MAP[$2]}" - TOKEN=$(echo "$INPUT" | jq -c -r --arg OS "$1" --arg ARCH "$GH_ARCH" '(.tools[] | select( .os == $OS and .architecture == $ARCH)).temp_download_token') - echo "${TOKEN}" -} - -function runnerTokenURL() { - METADATA_URL=$(echo "$INPUT" | jq -c -r '."metadata-url"') - checkValNotNull "${METADATA_URL}" "metadata-url" || return $? - echo "${METADATA_URL}/runner-registration-token/" -} - -function downloadFilename() { - [ -z "$1" -o -z "$2" ] && return 1 - GH_OS="${AZURE_OS_TO_GH_OS_MAP[$1]}" - GH_ARCH="${GARM_TO_GH_ARCH_MAP[$2]}" - FN=$(echo "$INPUT" | jq -c -r --arg OS "$GH_OS" --arg ARCH "$GH_ARCH" '(.tools[] | select( .os == $OS and .architecture == $ARCH)).filename') - checkValNotNull "${FN}" "download filename" || return $? - echo "${FN}" -} - -function poolID() { - POOL_ID=$(echo "$INPUT" | jq -c -r '.pool_id') - checkValNotNull "${POOL_ID}" "pool_id" || return $? - echo "${POOL_ID}" -} - -function vmSize() { - VM_SIZE=$(echo "$INPUT" | jq -c -r '.flavor') - checkValNotNull "${VM_SIZE}" "flavor" || return $? - echo "${VM_SIZE}" -} - -function imageUrn() { - IMG=$(echo "$INPUT" | jq -c -r '.image') - checkValNotNull "${IMG}" "image" || return $? - echo "${IMG}" -} - -function getOSImageDetails() { - IMAGE=$(echo "$INPUT" | jq -r -c '.image') - IMAGE_DETAILS=$(az vm image show --urn "$IMAGE" -o json --only-show-errors) - echo "$IMAGE_DETAILS" -} - -function repoURL() { - REPO=$(echo "$INPUT" | jq -c -r '.repo_url') - checkValNotNull "${REPO}" "repo_url" || return $? 
- echo "${REPO}" -} - -function callbackURL() { - CB_URL=$(echo "$INPUT" | jq -c -r '."callback-url"') - checkValNotNull "${CB_URL}" "callback-url" || return $? - echo "${CB_URL}" -} - -function callbackToken() { - CB_TK=$(echo "$INPUT" | jq -c -r '."instance-token"') - checkValNotNull "${CB_TK}" "instance-token" || return $? - echo "${CB_TK}" -} - -function instanceName() { - NAME=$(echo "$INPUT" | jq -c -r '.name') - checkValNotNull "${NAME}" "name" || return $? - echo "${NAME}" -} - -function labels() { - LBL=$(echo "$INPUT" | jq -c -r '.labels | join(",")') - checkValNotNull "${LBL}" "labels" || return $? - echo "${LBL}" -} - -function vmStatus() { - [ -z "$1" -o -z "$2" ] && return 1 - - RG_DETAILS=$(az group show -n "$1" -o json --only-show-errors) - RG_STATE=$(echo "$RG_DETAILS" | jq -r '.properties.provisioningState') - STATUS="${AZURE_PROVISION_STATE_MAP[$RG_STATE]}" - if [[ "$STATUS" != "running" ]]; then - echo "$STATUS" - return 0 - fi - VM_DETAILS=$(az vm show -g "$1" -n "$2" --show-details -o json --only-show-errors) - VM_STATE=$(echo "$VM_DETAILS" | jq -r '.provisioningState') - STATUS="${AZURE_PROVISION_STATE_MAP[$VM_STATE]}" - if [[ "$STATUS" != "running" ]]; then - echo "$STATUS" - return 0 - fi - VM_POWER_STATE=$(echo "$VM_DETAILS" | jq -r '.powerState') - VM_STATUS="${AZURE_POWER_STATE_MAP[$VM_POWER_STATE]}" - if [[ -z "${VM_STATUS}" ]]; then - echo "unknown" - return 0 - fi - echo "${VM_STATUS}" -} - -function getCloudConfig() { - IMAGE_DETAILS=$(getOSImageDetails) - - OS_TYPE=$(echo "${IMAGE_DETAILS}" | jq -c -r '.osDiskImage.operatingSystem') - checkValNotNull "${OS_TYPE}" "operatingSystem" || return $? 
- - ARCH=$(requestedArch) - DW_URL=$(downloadURL "${OS_TYPE}" "${ARCH}") - DW_TOKEN=$(tempDownloadToken "${OS_TYPE}" "${ARCH}") - DW_FILENAME=$(downloadFilename "${OS_TYPE}" "${ARCH}") - LABELS=$(labels) - - TMP_SCRIPT=$(mktemp) - TMP_CC=$(mktemp) - - INSTALL_TPL=$(cat "${TEMPLATES}/install_runner.tpl") - CC_TPL=$(cat "${TEMPLATES}/userdata.tpl") - echo "$INSTALL_TPL" | sed -e "s|GARM_CALLBACK_URL|$(callbackURL)|g" \ - -e "s|GARM_CALLBACK_TOKEN|$(callbackToken)|g" \ - -e "s|GH_DOWNLOAD_URL|${DW_URL}|g" \ - -e "s|GH_FILENAME|${DW_FILENAME}|g" \ - -e "s|GH_TARGET_URL|$(repoURL)|g" \ - -e "s|GARM_METADATA_URL|$(runnerTokenURL)|g" \ - -e "s|GH_RUNNER_NAME|$(instanceName)|g" \ - -e "s|GH_TEMP_DOWNLOAD_TOKEN|${DW_TOKEN}|g" \ - -e "s|GH_RUNNER_LABELS|${LABELS}|g" > ${TMP_SCRIPT} - - AS_B64=$(base64 -w0 ${TMP_SCRIPT}) - echo "${CC_TPL}" | sed "s|RUNNER_INSTALL_B64|${AS_B64}|g" > ${TMP_CC} - echo "${TMP_CC}" -} - -function CreateInstance() { - if [ -z "$INPUT" ]; then - echo "expected build params in stdin" - exit 1 - fi - - CC_FILE=$(getCloudConfig) - VM_SIZE=$(vmSize) - INSTANCE_NAME=$(instanceName) - IMAGE_URN=$(imageUrn) - IMAGE_DETAILS=$(getOSImageDetails) - - OS_TYPE=$(echo "${IMAGE_DETAILS}" | jq -c -r '.osDiskImage.operatingSystem' | tr '[:upper:]' '[:lower:]') - checkValNotNull "${OS_TYPE}" "os_type" || return $? - OS_NAME=$(echo "${IMAGE_URN}" | cut -d ':' -f2) - OS_VERSION=$(echo "${IMAGE_URN}" | cut -d ':' -f3) - ARCH="amd64" - - TAGS="garm_controller_id=${GARM_CONTROLLER_ID} garm_pool_id=${GARM_POOL_ID} os_type=${OS_TYPE} os_name=${OS_NAME} os_version=${OS_VERSION} os_arch=${ARCH}" - - set +e - - az group create -n $INSTANCE_NAME -l $LOCATION --tags $TAGS --only-show-errors -o none - az vm create -g $INSTANCE_NAME -n $INSTANCE_NAME -l $LOCATION --size $VM_SIZE --image $IMAGE_URN --tags $TAGS --nsg-rule none --public-ip-address "" --user-data "${CC_FILE}" -o none --only-show-errors - if [[ $? 
-ne 0 ]]; then - az group delete -n $INSTANCE_NAME --no-wait --y -o none --only-show-errors - echo "Failed to create Azure VM" - exit 1 - fi - rm -f "${CC_FILE}" - - set -e - - STATUS=$(vmStatus $INSTANCE_NAME $INSTANCE_NAME) - FAULT_VAL="" - - jq -rnc \ - --arg PROVIDER_ID "${INSTANCE_NAME}" \ - --arg NAME "${INSTANCE_NAME}" \ - --arg OS_TYPE "${OS_TYPE}" \ - --arg OS_NAME "${OS_NAME}" \ - --arg OS_VERSION "${OS_VERSION}" \ - --arg ARCH "${ARCH}" \ - --arg STATUS "${STATUS}" \ - --arg POOL_ID "${GARM_POOL_ID}" \ - --arg FAULT "${FAULT_VAL}" \ - '{"provider_id": $PROVIDER_ID, "name": $NAME, "os_type": $OS_TYPE, "os_name": $OS_NAME, "os_version": $OS_VERSION, "os_arch": $ARCH, "status": $STATUS, "pool_id": $POOL_ID, "provider_fault": $FAULT}' -} - -function DeleteInstance() { - local instance_id="${GARM_INSTANCE_ID}" - if [ -z "${instance_id}" ]; then - echo "missing instance ID in env" - return 1 - fi - - set +e - rg_info=$(az group show -n "${instance_id}" -o json --only-show-errors 2>&1) - if [ $? -ne 0 ]; then - CODE=$? 
- set -e - if echo "${rg_info}" | grep -q "ResourceGroupNotFound"; then - return 0 - fi - return $CODE - fi - set -e - az group delete -n "${instance_id}" --no-wait --y --only-show-errors -} - -function StartInstance() { - local instance_id="${GARM_INSTANCE_ID}" - if [ -z "${instance_id}" ]; then - echo "missing instance ID in env" - return 1 - fi - - az vm start -g "${instance_id}" -n "${instance_id}" -o none --only-show-errors -} - -function StopServer() { - local instance_id="${GARM_INSTANCE_ID}" - if [ -z "${instance_id}" ]; then - echo "missing instance ID in env" - return 1 - fi - - az vm deallocate -g "${instance_id}" -n "${instance_id}" -o none --only-show-errors -} - -function GetInstance() { - local instance_id="${GARM_INSTANCE_ID}" - info=$(az vm show -d -n $instance_id -g $instance_id -o json --only-show-errors 2>&1) - echo $info | jq -r ' - { - provider_id: .name, - name: .name, - os_type: .tags.os_type, - os_name: .tags.os_name, - os_version: .tags.os_version, - os_arch: .tags.os_arch, - pool_id: .tags.garm_pool_id, - status: {"VM starting": "pending_create", "VM running": "running", "VM stopping": "stopped", "VM stopped": "stopped", "VM deallocating": "stopped", "VM deallocated": "stopped"}[.powerState] - }' -} - -function ListInstances() { - INSTANCES=$(az vm list --query "[?tags.garm_pool_id == '${GARM_POOL_ID}']" -o json --only-show-errors 2>&1) - echo $info | jq -r '[ - .[] | { - provider_id: .name, - name: .name, - os_type: .tags.os_type, - os_name: .tags.os_name, - os_version: .tags.os_version, - os_arch: .tags.os_arch, - pool_id: .tags.garm_pool_id, - status: {"Creating": "pending_create", "Migrating": "pending_create", "Failed": "error", "Succeeded": "running", "Deleting": "pending_delete"}[.provisioningState] - }]' -} - -# Login to Azure -checkValNotNull "${AZURE_SUBSCRIPTION_ID}" "AZURE_SUBSCRIPTION_ID" -checkValNotNull "${AZURE_TENANT_ID}" "AZURE_TENANT_ID" -checkValNotNull "${AZURE_CLIENT_ID}" "AZURE_CLIENT_ID" -checkValNotNull 
"${AZURE_CLIENT_SECRET}" "AZURE_CLIENT_SECRET" - -export AZURE_CONFIG_DIR="${MYDIR}/.azure" - -az login --service-principal -u $AZURE_CLIENT_ID -p $AZURE_CLIENT_SECRET --tenant $AZURE_TENANT_ID -o none --only-show-errors -az account set -s $AZURE_SUBSCRIPTION_ID -o none --only-show-errors - -case "$GARM_COMMAND" in - "CreateInstance") - CreateInstance - ;; - "DeleteInstance") - DeleteInstance - ;; - "GetInstance") - GetInstance - ;; - "ListInstances") - ListInstances - ;; - "StartInstance") - StartInstance - ;; - "StopInstance") - StopServer - ;; - "RemoveAllInstances") - echo "RemoveAllInstances not implemented" - exit 1 - ;; - *) - echo "Invalid GARM provider command: \"$GARM_COMMAND\"" - exit 1 - ;; -esac diff --git a/contrib/providers.d/openstack/README.md b/contrib/providers.d/openstack/README.md deleted file mode 100644 index 4995e543..00000000 --- a/contrib/providers.d/openstack/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# OpenStack external provider for garm - -This is an example external provider, written for OpenStack. It is a simple bash script that implements the external provider interface, in order to supply ```garm``` with compute instances. This is just an example, complete with a sample config file. - -Not all functions are implemented, just the bare minimum to get it to work with the current feature set of ```garm```. It is not meant for production, as it needs a lot more error checking, retries, and potentially more flexibility to be of any use in a real environment. 
- -Images that are used with garm require the following properties set on the image: - - * os_type (one of: windows, linux) - * os_distro - * os_version - * architecture (one of: x86_64, armv7l, mips64, mips64el, mips, mipsel) diff --git a/contrib/providers.d/openstack/cloudconfig/install_runner.tpl b/contrib/providers.d/openstack/cloudconfig/install_runner.tpl deleted file mode 100644 index 910d8eac..00000000 --- a/contrib/providers.d/openstack/cloudconfig/install_runner.tpl +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - -set -e -set -o pipefail - -METADATA_URL="GARM_METADATA_URL" -CALLBACK_URL="GARM_CALLBACK_URL" -BEARER_TOKEN="GARM_CALLBACK_TOKEN" -DOWNLOAD_URL="GH_DOWNLOAD_URL" -DOWNLOAD_TOKEN="GH_TEMP_DOWNLOAD_TOKEN" -FILENAME="GH_FILENAME" -TARGET_URL="GH_TARGET_URL" -RUNNER_NAME="GH_RUNNER_NAME" -RUNNER_LABELS="GH_RUNNER_LABELS" -TEMP_TOKEN="" - - -if [ -z "$METADATA_URL" ];then - echo "no token is available and METADATA_URL is not set" - exit 1 -fi - -function call() { - PAYLOAD="$1" - curl --fail -s -X POST -d "${PAYLOAD}" -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${CALLBACK_URL}" || echo "failed to call home: exit code ($?)" -} - -function sendStatus() { - MSG="$1" - call "{\"status\": \"installing\", \"message\": \"$MSG\"}" -} - -function success() { - MSG="$1" - ID=$2 - call "{\"status\": \"idle\", \"message\": \"$MSG\", \"agent_id\": $ID}" -} - -function fail() { - MSG="$1" - call "{\"status\": \"failed\", \"message\": \"$MSG\"}" - exit 1 -} - -if [ ! 
-z "$DOWNLOAD_TOKEN" ]; then - TEMP_TOKEN="Authorization: Bearer $DOWNLOAD_TOKEN" -fi - -sendStatus "downloading tools from ${DOWNLOAD_URL}" -curl --fail -L -H "${TEMP_TOKEN}" -o "/home/runner/${FILENAME}" "${DOWNLOAD_URL}" || fail "failed to download tools" - -mkdir -p /home/runner/actions-runner || fail "failed to create actions-runner folder" - -sendStatus "extracting runner" -tar xf "/home/runner/${FILENAME}" -C /home/runner/actions-runner/ || fail "failed to extract runner" -chown runner:runner -R /home/runner/actions-runner/ || fail "failed to change owner" - -sendStatus "installing dependencies" -cd /home/runner/actions-runner -sudo ./bin/installdependencies.sh || fail "failed to install dependencies" - -sendStatus "fetching runner registration token" -GITHUB_TOKEN=$(curl --fail -s -X GET -H 'Accept: application/json' -H "Authorization: Bearer ${BEARER_TOKEN}" "${METADATA_URL}" || fail "failed to get runner registration token") - -sendStatus "configuring runner" -sudo -u runner -- ./config.sh --unattended --url "${TARGET_URL}" --token "${GITHUB_TOKEN}" --name "${RUNNER_NAME}" --labels "${RUNNER_LABELS}" --ephemeral || fail "failed to configure runner" - -sendStatus "installing runner service" -./svc.sh install runner || fail "failed to install service" - -sendStatus "starting service" -./svc.sh start || fail "failed to start service" - -set +e -AGENT_ID=$(grep "agentId" /home/runner/actions-runner/.runner | tr -d -c 0-9) -if [ $? 
-ne 0 ];then - fail "failed to get agent ID" -fi -set -e - -success "runner successfully installed" $AGENT_ID \ No newline at end of file diff --git a/contrib/providers.d/openstack/cloudconfig/userdata.tpl b/contrib/providers.d/openstack/cloudconfig/userdata.tpl deleted file mode 100644 index 10ef2b51..00000000 --- a/contrib/providers.d/openstack/cloudconfig/userdata.tpl +++ /dev/null @@ -1,31 +0,0 @@ -#cloud-config -package_upgrade: true -packages: - - curl - - tar -system_info: - default_user: - name: runner - home: /home/runner - shell: /bin/bash - groups: - - sudo - - adm - - cdrom - - dialout - - dip - - video - - plugdev - - netdev - - docker - - lxd - sudo: ALL=(ALL) NOPASSWD:ALL -runcmd: - - /install_runner.sh - - rm -f /install_runner.sh -write_files: - - encoding: b64 - content: RUNNER_INSTALL_B64 - owner: root:root - path: /install_runner.sh - permissions: "755" diff --git a/contrib/providers.d/openstack/garm-external-provider b/contrib/providers.d/openstack/garm-external-provider deleted file mode 100755 index f2602f57..00000000 --- a/contrib/providers.d/openstack/garm-external-provider +++ /dev/null @@ -1,445 +0,0 @@ -#!/bin/bash - -set -e -set -o pipefail - -if [ ! -t 0 ] -then - INPUT=$(cat -) -fi -MYPATH=$(realpath ${BASH_SOURCE[0]}) -MYDIR=$(dirname "${MYPATH}") -TEMPLATES="$MYDIR/cloudconfig" - -# Defaults -# set this variable to 0 in the provider config to disable. 
-BOOT_FROM_VOLUME=${BOOT_FROM_VOLUME:-1} - -# END Defaults - -if [ -z "$GARM_PROVIDER_CONFIG_FILE" ] -then - echo "no config file specified in env" - exit 1 -fi - -source "$GARM_PROVIDER_CONFIG_FILE" - -declare -A OS_TO_GH_ARCH_MAP -OS_TO_GH_ARCH_MAP["x86_64"]="x64" -OS_TO_GH_ARCH_MAP["armv7l"]="arm64" -OS_TO_GH_ARCH_MAP["mips64"]="arm64" -OS_TO_GH_ARCH_MAP["mips64el"]="arm64" -OS_TO_GH_ARCH_MAP["mips"]="arm" -OS_TO_GH_ARCH_MAP["mipsel"]="arm" - -declare -A OS_TO_GARM_ARCH_MAP -OS_TO_GARM_ARCH_MAP["x86_64"]="amd64" -OS_TO_GARM_ARCH_MAP["armv7l"]="arm64" -OS_TO_GARM_ARCH_MAP["mips64"]="arm64" -OS_TO_GARM_ARCH_MAP["mips64el"]="arm64" -OS_TO_GARM_ARCH_MAP["mips"]="arm" -OS_TO_GARM_ARCH_MAP["mipsel"]="arm" - -declare -A GARM_TO_GH_ARCH_MAP -GARM_TO_GH_ARCH_MAP["amd64"]="x64" -GARM_TO_GH_ARCH_MAP["arm"]="arm" -GARM_TO_GH_ARCH_MAP["arm64"]="arm64" - -declare -A STATUS_MAP -STATUS_MAP["ACTIVE"]="running" -STATUS_MAP["SHUTOFF"]="stopped" -STATUS_MAP["BUILD"]="pending_create" -STATUS_MAP["ERROR"]="error" -STATUS_MAP["DELETING"]="pending_delete" - -function checkValNotNull() { - if [ -z "$1" -o "$1" == "null" ];then - echo "failed to fetch value $2" - return 1 - fi - return 0 -} - -function getOSImageDetails() { - IMAGE_ID=$(echo "$INPUT" | jq -r -c '.image') - OS_IMAGE=$(openstack image show "$IMAGE_ID" -f json) - echo "$OS_IMAGE" -} - -function getOpenStackNetworkID() { - if [ -z "$OPENSTACK_PRIVATE_NETWORK" ] - then - echo "no network specified in config" - return 1 - fi - - NET_ID=$(openstack network show ${OPENSTACK_PRIVATE_NETWORK} -f value -c id) - if [ -z "$NET_ID" ];then - echo "failed to find network $OPENSTACK_PRIVATE_NETWORK" - fi - echo ${NET_ID} -} - -function getVolumeSizeFromFlavor() { - local flavor="$1" - - FLAVOR_DETAILS=$(openstack flavor show "${flavor}" -f json) - DISK_SIZE=$(echo "$FLAVOR_DETAILS" | jq -c -r '.disk') - if [ -z "$DISK_SIZE" ];then - echo "failed to get disk size from flavor" - return 1 - fi - - echo ${DISK_SIZE} -} - -function 
waitForVolume() { - local volumeName=$1 - set +e - status=$(openstack volume show "${volumeName}" -f json | jq -r -c '.status') - if [ $? -ne 0 ];then - CODE=$? - set -e - return $CODE - fi - set -e - while [ "${status}" != "available" -a "${status}" != "error" ];do - status=$(openstack volume show "${volumeName}" -f json | jq -r -c '.status') - done -} - -function createVolumeFromImage() { - local image="$1" - local disk_size="$2" - local instance_name="$3" - if [ -z ${image} -o -z ${disk_size} -o -z "${instance_name}" ];then - echo "missing image, disk size or instance name in function call" - return 1 - fi - # Instance names contain a UUID. It should be safe to create a volume with the same name and - # expect it to be unique. - set +e - VOLUME_INFO=$(openstack volume create -f json --image "${image}" --size "${disk_size}" "${instance_name}") - if [ $? -ne 0 ]; then - CODE=$? - openstack volume delete "${instance_name}" || true - set -e - return $CODE - fi - waitForVolume "${instance_name}" - echo "${VOLUME_INFO}" -} - -function requestedArch() { - ARCH=$(echo "$INPUT" | jq -c -r '.arch') - checkValNotNull "${ARCH}" "arch" || return $? - echo "${ARCH}" -} - -function downloadURL() { - [ -z "$1" -o -z "$2" ] && return 1 - GH_ARCH="${GARM_TO_GH_ARCH_MAP[$2]}" - URL=$(echo "$INPUT" | jq -c -r --arg OS "$1" --arg ARCH "$GH_ARCH" '(.tools[] | select( .os == $OS and .architecture == $ARCH)).download_url') - checkValNotNull "${URL}" "download URL" || return $? - echo "${URL}" -} - -function tempDownloadToken() { - [ -z "$1" -o -z "$2" ] && return 1 - GH_ARCH="${GARM_TO_GH_ARCH_MAP[$2]}" - TOKEN=$(echo "$INPUT" | jq -c -r --arg OS "$1" --arg ARCH "$GH_ARCH" '(.tools[] | select( .os == $OS and .architecture == $ARCH)).temp_download_token') - echo "${TOKEN}" -} - -function runnerTokenURL() { - METADATA_URL=$(echo "$INPUT" | jq -c -r '."metadata-url"') - checkValNotNull "${METADATA_URL}" "metadata-url" || return $? 
- echo "${METADATA_URL}/runner-registration-token/" -} - -function downloadFilename() { - [ -z "$1" -o -z "$2" ] && return 1 - GH_ARCH="${GARM_TO_GH_ARCH_MAP[$2]}" - FN=$(echo "$INPUT" | jq -c -r --arg OS "$1" --arg ARCH "$GH_ARCH" '(.tools[] | select( .os == $OS and .architecture == $ARCH)).filename') - checkValNotNull "${FN}" "download filename" || return $? - echo "${FN}" -} - -function poolID() { - POOL_ID=$(echo "$INPUT" | jq -c -r '.pool_id') - checkValNotNull "${POOL_ID}" "pool_id" || return $? - echo "${POOL_ID}" -} - -function flavor() { - FLAVOR=$(echo "$INPUT" | jq -c -r '.flavor') - checkValNotNull "${FLAVOR}" "flavor" || return $? - echo "${FLAVOR}" -} - -function image() { - IMG=$(echo "$INPUT" | jq -c -r '.image') - checkValNotNull "${IMG}" "image" || return $? - echo "${IMG}" -} - -function repoURL() { - REPO=$(echo "$INPUT" | jq -c -r '.repo_url') - checkValNotNull "${REPO}" "repo_url" || return $? - echo "${REPO}" -} - -function callbackURL() { - CB_URL=$(echo "$INPUT" | jq -c -r '."callback-url"') - checkValNotNull "${CB_URL}" "callback-url" || return $? - echo "${CB_URL}" -} - -function callbackToken() { - CB_TK=$(echo "$INPUT" | jq -c -r '."instance-token"') - checkValNotNull "${CB_TK}" "instance-token" || return $? - echo "${CB_TK}" -} - -function instanceName() { - NAME=$(echo "$INPUT" | jq -c -r '.name') - checkValNotNull "${NAME}" "name" || return $? - echo "${NAME}" -} - -function labels() { - LBL=$(echo "$INPUT" | jq -c -r '.labels | join(",")') - checkValNotNull "${LBL}" "labels" || return $? - echo "${LBL}" -} - -function getCloudConfig() { - IMAGE_DETAILS=$(getOSImageDetails) - - OS_TYPE=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_type') - checkValNotNull "${OS_TYPE}" "os_type" || return $? 
- - ARCH=$(requestedArch) - DW_URL=$(downloadURL "${OS_TYPE}" "${ARCH}") - DW_TOKEN=$(tempDownloadToken "${OS_TYPE}" "${ARCH}") - DW_FILENAME=$(downloadFilename "${OS_TYPE}" "${ARCH}") - LABELS=$(labels) - - TMP_SCRIPT=$(mktemp) - TMP_CC=$(mktemp) - - INSTALL_TPL=$(cat "${TEMPLATES}/install_runner.tpl") - CC_TPL=$(cat "${TEMPLATES}/userdata.tpl") - echo "$INSTALL_TPL" | sed -e "s|GARM_CALLBACK_URL|$(callbackURL)|g" \ - -e "s|GARM_CALLBACK_TOKEN|$(callbackToken)|g" \ - -e "s|GH_DOWNLOAD_URL|${DW_URL}|g" \ - -e "s|GH_FILENAME|${DW_FILENAME}|g" \ - -e "s|GH_TARGET_URL|$(repoURL)|g" \ - -e "s|GARM_METADATA_URL|$(runnerTokenURL)|g" \ - -e "s|GH_RUNNER_NAME|$(instanceName)|g" \ - -e "s|GH_TEMP_DOWNLOAD_TOKEN|${DW_TOKEN}|g" \ - -e "s|GH_RUNNER_LABELS|${LABELS}|g" > ${TMP_SCRIPT} - - AS_B64=$(base64 -w0 ${TMP_SCRIPT}) - echo "${CC_TPL}" | sed "s|RUNNER_INSTALL_B64|${AS_B64}|g" > ${TMP_CC} - echo "${TMP_CC}" -} - -function waitForServer() { - local srv_id="$1" - - srv_info=$(openstack server show -f json "${srv_id}") - [ $? -ne 0 ] && return $? - - status=$(echo "${srv_info}" | jq -r -c '.status') - - while [ "${status}" != "ERROR" -a "${status}" != "ACTIVE" ];do - sleep 0.5 - srv_info=$(openstack server show -f json "${srv_id}") - [ $? -ne 0 ] && return $? - status=$(echo "${srv_info}" | jq -r -c '.status') - done - echo "${srv_info}" -} - -function CreateInstance() { - if [ -z "$INPUT" ];then - echo "expected build params in stdin" - exit 1 - fi - - CC_FILE=$(getCloudConfig) - FLAVOR=$(flavor) - IMAGE=$(image) - INSTANCE_NAME=$(instanceName) - NET=$(getOpenStackNetworkID) - IMAGE_DETAILS=$(getOSImageDetails) - - OS_TYPE=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_type') - checkValNotNull "${OS_TYPE}" "os_type" || return $? - DISTRO=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_distro') - checkValNotNull "${DISTRO}" "os_distro" || return $? 
- VERSION=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_version') - checkValNotNull "${VERSION}" "os_version" || return $? - ARCH=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.architecture') - checkValNotNull "${ARCH}" "architecture" || return $? - GH_ARCH=${OS_TO_GH_ARCH_MAP[${ARCH}]} - - if [ -z "${GH_ARCH}" ];then - GH_ARCH=${ARCH} - fi - - SOURCE_ARGS="" - - if [ "${BOOT_FROM_VOLUME}" -eq 1 ];then - VOL_SIZE=$(getVolumeSizeFromFlavor "${FLAVOR}") - VOL_INFO=$(createVolumeFromImage "${IMAGE}" "${VOL_SIZE}" "${INSTANCE_NAME}") - if [ $? -ne 0 ];then - openstack volume delete "${INSTANCE_NAME}" || true - fi - SOURCE_ARGS="--volume ${INSTANCE_NAME}" - else - SOURCE_ARGS="--image ${IMAGE}" - fi - - set +e - - TAGS="--tag garm-controller-id=${GARM_CONTROLLER_ID} --tag garm-pool-id=${GARM_POOL_ID}" - PROPERTIES="--property os_type=${OS_TYPE} --property os_name=${DISTRO} --property os_version=${VERSION} --property os_arch=${GH_ARCH} --property pool_id=${GARM_POOL_ID}" - SRV_DETAILS=$(openstack server create --os-compute-api-version 2.52 ${SOURCE_ARGS} ${TAGS} ${PROPERTIES} --flavor "${FLAVOR}" --user-data="${CC_FILE}" --network="${NET}" "${INSTANCE_NAME}") - if [ $? -ne 0 ];then - openstack volume delete "${INSTANCE_NAME}" || true - exit 1 - fi - SRV_DETAILS=$(waitForServer "${INSTANCE_NAME}") - if [ $? -ne 0 ];then - CODE=$? - # cleanup - rm -f "${CC_FILE}" || true - openstack server delete "${INSTANCE_NAME}" || true - openstack volume delete "${INSTANCE_NAME}" || true - set -e - FAULT=$(echo "${SRV_DETAILS}"| jq -rc '.fault') - echo "Failed to create server: ${FAULT}" - exit $CODE - fi - set -e - rm -f "${CC_FILE}" || true - - SRV_ID=$(echo "${SRV_DETAILS}" | jq -r -c '.id') - STATUS=$(echo "${SRV_DETAILS}" | jq -r -c '.status') - FAULT=$(echo "${SRV_DETAILS}" | jq -r -c '.fault') - FAULT_VAL="" - if [ ! 
-z "${FAULT}" -a "${FAULT}" != "null" ];then - FAULT_VAL=$(echo "${FAULT}" | base64 -w0) - fi - - jq -rnc \ - --arg PROVIDER_ID ${SRV_ID} \ - --arg NAME "${INSTANCE_NAME}" \ - --arg OS_TYPE "${OS_TYPE}" \ - --arg OS_NAME "${DISTRO}" \ - --arg OS_VERSION "${VERSION}" \ - --arg ARCH "${GH_ARCH}" \ - --arg STATUS "${STATUS_MAP[${STATUS}]}" \ - --arg POOL_ID "${GARM_POOL_ID}" \ - --arg FAULT "${FAULT_VAL}" \ - '{"provider_id": $PROVIDER_ID, "name": $NAME, "os_type": $OS_TYPE, "os_name": $OS_NAME, "os_version": $OS_VERSION, "os_arch": $ARCH, "status": $STATUS, "pool_id": $POOL_ID, "provider_fault": $FAULT}' -} - -function DeleteInstance() { - local instance_id="${GARM_INSTANCE_ID}" - if [ -z "${instance_id}" ];then - echo "missing instance ID in env" - return 1 - fi - - set +e - instance_info=$(openstack server show "${instance_id}" -f json 2>&1) - if [ $? -ne 0 ];then - CODE=$? - set -e - if [ "${instance_info}" == "No server with a name or ID of*" ];then - return 0 - fi - return $CODE - fi - set -e - VOLUMES=$(echo "${instance_info}" | jq -r -c '.volumes_attached[] | .id') - - openstack server delete "${instance_id}" - for vol in "$VOLUMES";do - waitForVolume "${vol}" - openstack volume delete $vol || true - done -} - -function StartInstance() { - local instance_id="${GARM_INSTANCE_ID}" - if [ -z "${instance_id}" ];then - echo "missing instance ID in env" - return 1 - fi - - openstack server start "${instance_id}" -} - -function StopServer() { - local instance_id="${GARM_INSTANCE_ID}" - if [ -z "${instance_id}" ];then - echo "missing instance ID in env" - return 1 - fi - - openstack server stop "${instance_id}" -} - -function ListInstances() { - INSTANCES=$(openstack server list --os-compute-api-version 2.52 --tags garm-pool-id=${GARM_POOL_ID} --long -f json) - echo ${INSTANCES} | jq -r '[ - .[] | .Properties * { - provider_id: .ID, - name: .Name, - status: {"ACTIVE": "running", "SHUTOFF": "stopped", "BUILD": "pending_create", "ERROR": "error", "DELETING": 
"pending_delete"}[.Status] - }]' -} - -function GetInstance() { - INSTANCE=$(openstack server show --os-compute-api-version 2.52 ${GARM_INSTANCE_ID} -f json) - echo ${INSTANCES} | jq -r '.properties * { - provider_id: .id, - name: .name, - status: {"ACTIVE": "running", "SHUTOFF": "stopped", "BUILD": "pending_create", "ERROR": "error", "DELETING": "pending_delete"}[.status] - }' -} - -case "$GARM_COMMAND" in - "CreateInstance") - CreateInstance - ;; - "DeleteInstance") - DeleteInstance - ;; - "GetInstance") - GetInstance - ;; - "ListInstances") - ListInstances - ;; - "StartInstance") - StartInstance - ;; - "StopInstance") - StopServer - ;; - "RemoveAllInstances") - echo "RemoveAllInstances not implemented" - exit 1 - ;; - *) - echo "Invalid GARM provider command: \"$GARM_COMMAND\"" - exit 1 - ;; -esac - diff --git a/contrib/providers.d/openstack/keystonerc b/contrib/providers.d/openstack/keystonerc deleted file mode 100644 index 1b702dd7..00000000 --- a/contrib/providers.d/openstack/keystonerc +++ /dev/null @@ -1,16 +0,0 @@ -# OpenStack client config -export OS_REGION_NAME=RegionOne -export OS_AUTH_VERSION=3 -export OS_AUTH_URL=http://10.0.8.36:5000/v3 -export OS_PROJECT_DOMAIN_NAME=admin_domain -export OS_USERNAME=admin -export OS_AUTH_TYPE=password -export OS_USER_DOMAIN_NAME=admin_domain -export OS_PROJECT_NAME=admin -export OS_PASSWORD=Iegeehahth4suSie -export OS_IDENTITY_API_VERSION=3 - - -# GARM config -export OPENSTACK_PRIVATE_NETWORK="int_net" -export BOOT_FROM_VOLUME=1 diff --git a/doc/config.md b/doc/config.md new file mode 100644 index 00000000..8b4d3a05 --- /dev/null +++ b/doc/config.md @@ -0,0 +1,480 @@ +# Configuration + +The ```GARM``` configuration is a simple ```toml```. The sample config file in [the testdata folder](/testdata/config.toml) is fairly well commented and should be enough to get you started. The configuration file is split into several sections, each of which is documented in its own page. 
The sections are: + + + +- [Configuration](#configuration) + - [The default config section](#the-default-config-section) + - [The callback_url option](#the-callback_url-option) + - [The metadata_url option](#the-metadata_url-option) + - [The debug_server option](#the-debug_server-option) + - [The log_file option](#the-log_file-option) + - [Rotating log files](#rotating-log-files) + - [The enable_log_streamer option](#the-enable_log_streamer-option) + - [The logging section](#the-logging-section) + - [Database configuration](#database-configuration) + - [Provider configuration](#provider-configuration) + - [Providers](#providers) + - [Available external providers](#available-external-providers) + - [The metrics section](#the-metrics-section) + - [Common metrics](#common-metrics) + - [Enterprise metrics](#enterprise-metrics) + - [Organization metrics](#organization-metrics) + - [Repository metrics](#repository-metrics) + - [Provider metrics](#provider-metrics) + - [Pool metrics](#pool-metrics) + - [Runner metrics](#runner-metrics) + - [Github metrics](#github-metrics) + - [Enabling metrics](#enabling-metrics) + - [Configuring prometheus](#configuring-prometheus) + - [The JWT authentication config section](#the-jwt-authentication-config-section) + - [The API server config section](#the-api-server-config-section) + + + +## The default config section + +The `default` config section holds configuration options that don't need a category of their own, but are essential to the operation of the service. In this section we will detail each of the options available in the `default` section. + +```toml +[default] +# Uncomment this line if you'd like to log to a file instead of standard output. +# log_file = "/tmp/runner-manager.log" + +# Enable streaming logs via web sockets. Use garm-cli debug-log. +enable_log_streamer = false + +# Enable the golang debug server. See the documentation in the "doc" folder for more information. 
+debug_server = false +``` + +### The callback_url option + +Your runners will call back home with status updates as they install. Once they are set up, they will also send the GitHub agent ID they were allocated. You will need to configure the ```callback_url``` option in the ```garm``` server config. This URL needs to point to the following API endpoint: + + ```txt + POST /api/v1/callbacks/status + ``` + +Example of a runner sending status updates: + + ```bash + garm-cli runner show garm-DvxiVAlfHeE7 + +-----------------+------------------------------------------------------------------------------------+ + | FIELD | VALUE | + +-----------------+------------------------------------------------------------------------------------+ + | ID | 16b96ba2-d406-45b8-ab66-b70be6237b4e | + | Provider ID | garm-DvxiVAlfHeE7 | + | Name | garm-DvxiVAlfHeE7 | + | OS Type | linux | + | OS Architecture | amd64 | + | OS Name | ubuntu | + | OS Version | jammy | + | Status | running | + | Runner Status | idle | + | Pool ID | 8ec34c1f-b053-4a5d-80d6-40afdfb389f9 | + | Addresses | 10.198.117.120 | + | Status Updates | 2023-07-08T06:26:46: runner registration token was retrieved | + | | 2023-07-08T06:26:46: using cached runner found in /opt/cache/actions-runner/latest | + | | 2023-07-08T06:26:50: configuring runner | + | | 2023-07-08T06:26:56: runner successfully configured after 1 attempt(s) | + | | 2023-07-08T06:26:56: installing runner service | + | | 2023-07-08T06:26:56: starting service | + | | 2023-07-08T06:26:57: runner successfully installed | + +-----------------+------------------------------------------------------------------------------------+ + + ``` + +This URL must be set and must be accessible by the instance. If you wish to restrict access to it, a reverse proxy can be configured to accept requests only from networks in which the runners ```garm``` manages will be spun up. 
This URL doesn't need to be globally accessible, it just needs to be accessible by the instances. + +For example, in a scenario where you expose the API endpoint directly, this setting could look like the following: + + ```toml + callback_url = "https://garm.example.com/api/v1/callbacks" + ``` + +Authentication is done using a short-lived JWT token, that gets generated for a particular instance that we are spinning up. That JWT token grants access to the instance to only update its own status and to fetch metadata for itself. No other API endpoints will work with that JWT token. The validity of the token is equal to the pool bootstrap timeout value (default 20 minutes) plus the garm polling interval (5 minutes). + +There is a sample ```nginx``` config [in the testdata folder](/testdata/nginx-server.conf). Feel free to customize it in any way you see fit. + +### The metadata_url option + +The metadata URL is the base URL for any information an instance may need to fetch in order to finish setting itself up. As this URL may be placed behind a reverse proxy, you'll need to configure it in the ```garm``` config file. Ultimately this URL will need to point to the following ```garm``` API endpoint: + + ```bash + GET /api/v1/metadata + ``` + +This URL needs to be accessible only by the instances ```garm``` sets up. This URL will not be used by anyone else. To configure it in ```garm``` add the following line in the ```[default]``` section of your ```garm``` config: + + ```toml + metadata_url = "https://garm.example.com/api/v1/metadata" + ``` + +### The debug_server option + +GARM can optionally enable the golang profiling server. This is useful if you suspect garm may have a bottleneck in any way. To enable the profiling server, add the following section to the garm config: + +```toml +[default] + +debug_server = true +``` + +And restart garm.
You can then use the following command to start profiling: + +```bash +go tool pprof http://127.0.0.1:9997/debug/pprof/profile?seconds=120 +``` + +> **IMPORTANT NOTE on profiling when behind a reverse proxy**: The above command will hang for a fairly long time. Most reverse proxies will time out after about 60 seconds. To avoid this, you should only profile on localhost by connecting directly to garm. + +It's also advisable to exclude the debug server URLs from your reverse proxy and only make them available locally. + +Now that the debug server is enabled, here is a blog post on how to profile golang applications: https://blog.golang.org/profiling-go-programs + + +### The log_file option + +By default, GARM logs everything to standard output. + +You can optionally log to file by adding the following to your config file: + +```toml +[default] +# Use this if you'd like to log to a file instead of standard output. +log_file = "/tmp/runner-manager.log" +``` + +#### Rotating log files + +GARM automatically rotates the log if it reaches 500 MB in size or 28 days, whichever comes first. + +However, if you want to manually rotate the log file, you can send a `SIGHUP` signal to the GARM process. + +You can add the following to your systemd unit file to enable `reload`: + +```ini +[Service] +ExecReload=/bin/kill -HUP $MAINPID +``` + +Then you can simply run: + +```bash +systemctl reload garm +``` + +### The enable_log_streamer option + +This option allows you to stream garm logs directly to your terminal. Set this option to true, then you can use the following command to stream logs: + +```bash +garm-cli debug-log +``` + +An important note on enabling this option when behind a reverse proxy. The log streamer uses websockets to stream logs to you. You will need to configure your reverse proxy to allow websocket connections.
If you're using nginx, you will need to add the following to your nginx `server` config: + +```nginx +location /api/v1/ws { + proxy_pass http://garm_backend; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "Upgrade"; + proxy_set_header Host $host; +} +``` + +## The logging section + +GARM has switched to the `slog` package for logging, adding structured logging. As such, we added a dedicated `logging` section to the config to tweak the logging settings. We moved the `enable_log_streamer` and the `log_file` options from the `default` section to the `logging` section. They are still available in the `default` section for backwards compatibility, but they are deprecated and will be removed in a future release. + +An example of the new `logging` section: + +```toml +[logging] +# Uncomment this line if you'd like to log to a file instead of standard output. +# log_file = "/tmp/runner-manager.log" + +# enable_log_streamer enables streaming the logs over websockets +enable_log_streamer = true +# log_format is the output format of the logs. GARM uses structured logging and can +# output as "text" or "json" +log_format = "text" +# log_level is the logging level GARM will output. Available log levels are: +# * debug +# * info +# * warn +# * error +log_level = "debug" +# log_source will output information about the function that generated the log line. +log_source = false +``` + +By default GARM logs everything to standard output. You can optionally log to file by adding the `log_file` option to the `logging` section. The `enable_log_streamer` option allows you to stream GARM logs directly to your terminal. Set this option to `true`, then you can use the following command to stream logs: + +```bash +garm-cli debug-log +``` + +The `log_format`, `log_level` and `log_source` options allow you to tweak the logging output. The `log_format` option can be set to `text` or `json`. 
The `log_level` option can be set to `debug`, `info`, `warn` or `error`. The `log_source` option will output information about the function that generated the log line. All these options influence how the structured logging is output. + +This will allow you to ingest GARM logs in a central location such as an ELK stack or similar. + +## Database configuration + +GARM currently supports SQLite3. Support for other stores will be added in the future. + +```toml +[database] + # Turn on/off debugging for database queries. + debug = false + # Database backend to use. Currently supported backends are: + # * sqlite3 + backend = "sqlite3" + # the passphrase option is a temporary measure by which we encrypt the webhook + # secret that gets saved to the database, using AES256. In the future, secrets + # will be saved to something like Barbican or Vault, eliminating the need for + # this. This string needs to be 32 characters in size. + passphrase = "shreotsinWadquidAitNefayctowUrph" + [database.sqlite3] + # Path on disk to the sqlite3 database file. + db_file = "/home/runner/garm.db" +``` + +## Provider configuration + +GARM was designed to be extensible. Providers can be written as external executables which implement the needed interface to create/delete/list compute systems that are used by ```GARM``` to create runners. + +### Providers + +GARM delegates the functionality needed to create the runners to external executables. These executables can be either binaries or scripts. As long as they adhere to the needed interface, they can be used to create runners in any target IaaS. You might find this behavior familiar if you've ever had to deal with installing `CNIs` in `containerd`. The principle is the same. + +The configuration for an external provider is quite simple: + +```toml +# This is an example external provider. External providers are executables that +# implement the needed interface to create/delete/list compute systems that are used +# by GARM to create runners. 
+[[provider]] +name = "openstack_external" +description = "external openstack provider" +provider_type = "external" + [provider.external] + # config file passed to the executable via GARM_PROVIDER_CONFIG_FILE environment variable + config_file = "/etc/garm/providers.d/openstack/keystonerc" + # Absolute path to an executable that implements the provider logic. This executable can be + # anything (bash, a binary, python, etc). See documentation in this repo on how to write an + # external provider. + provider_executable = "/etc/garm/providers.d/openstack/garm-external-provider" + # This option will pass all environment variables that start with AWS_ to the provider. + # To pass in individual variables, you can add the entire name to the list. + environment_variables = ["AWS_"] +``` + +The external provider has three options: + +* `provider_executable` +* `config_file` +* `environment_variables` + +The ```provider_executable``` option is the absolute path to an executable that implements the provider logic. GARM will delegate all provider operations to this executable. This executable can be anything (bash, python, perl, go, etc). See [Writing an external provider](./external_provider.md) for more details. + +The ```config_file``` option is a path on disk to an arbitrary file, that is passed to the external executable via the environment variable ```GARM_PROVIDER_CONFIG_FILE```. This file is only relevant to the external provider. GARM itself does not read it. Let's take the [OpenStack provider](https://github.com/cloudbase/garm-provider-openstack) as an example. The [config file](https://github.com/cloudbase/garm-provider-openstack/blob/ac46d4d5a542bca96cd0309c89437d3382c3ea26/testdata/config.toml) contains access information for an OpenStack cloud as well as some provider specific options like whether or not to boot from volume and which tenant network to use. 
+The `environment_variables` option is a list of environment variables that will be passed to the external provider. By default GARM will pass a clean env to providers, consisting only of variables that the [provider interface](./external_provider.md) expects. However, in some situations, a provider may need access to certain environment variables set in the env of GARM itself. This might be needed to enable access to IAM roles (ec2) or managed identity (azure). This option takes a list of environment variables or prefixes of environment variables that will be passed to the provider. For example, if you want to pass all environment variables that start with `AWS_` to the provider, you can set this option to `["AWS_"]`. + +If you want to implement an external provider, you can use the file referenced by `config_file` for anything you need to pass into the binary when ```GARM``` calls it to execute a particular operation. + +#### Available external providers + +For non-testing purposes, these are the external providers currently available: + +* [OpenStack](https://github.com/cloudbase/garm-provider-openstack) +* [Azure](https://github.com/cloudbase/garm-provider-azure) +* [Kubernetes](https://github.com/mercedes-benz/garm-provider-k8s) - Thanks to the amazing folks at @mercedes-benz for sharing their awesome provider! +* [LXD](https://github.com/cloudbase/garm-provider-lxd) +* [Incus](https://github.com/cloudbase/garm-provider-incus) +* [Equinix Metal](https://github.com/cloudbase/garm-provider-equinix) +* [Amazon EC2](https://github.com/cloudbase/garm-provider-aws) +* [Google Cloud Platform (GCP)](https://github.com/cloudbase/garm-provider-gcp) +* [Oracle Cloud Infrastructure (OCI)](https://github.com/cloudbase/garm-provider-oci) + +Details on how to install and configure them are available in their respective repositories. + +If you wrote a provider and would like to add it to the above list, feel free to open a PR.
+ + +## The metrics section + +This is one of the features in GARM that I really love having. For one thing, it's community contributed and for another, it really adds value to the project. It allows us to create some pretty nice visualizations of what is happening with GARM. + +### Common metrics + +| Metric name | Type | Labels | Description | +|--------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| +| `garm_health` | Gauge | `controller_id`=<controller id>
`callback_url`=<callback url>
`controller_webhook_url`=<controller webhook url>
`metadata_url`=<metadata url>
`webhook_url`=<webhook url>
`name`=<hostname> | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. | +| `garm_webhooks_received` | Counter | `valid`=<valid request>
`reason`=<reason for invalid requests> | This is a counter that increments every time GARM receives a webhook from GitHub. | + +### Enterprise metrics + +| Metric name | Type | Labels | Description | +|---------------------------------------|-------|-------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------| +| `garm_enterprise_info` | Gauge | `id`=<enterprise id>
`name`=<enterprise name> | This is a gauge that is set to 1 and expose enterprise information | +| `garm_enterprise_pool_manager_status` | Gauge | `id`=<enterprise id>
`name`=<enterprise name>
`running`=<true\|false> | This is a gauge that is set to 1 if the enterprise pool manager is running and set to 0 if not | + +### Organization metrics + +| Metric name | Type | Labels | Description | +|-----------------------------------------|-------|-----------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------| +| `garm_organization_info` | Gauge | `id`=<organization id>
`name`=<organization name> | This is a gauge that is set to 1 and expose organization information | +| `garm_organization_pool_manager_status` | Gauge | `id`=<organization id>
`name`=<organization name>
`running`=<true\|false> | This is a gauge that is set to 1 if the organization pool manager is running and set to 0 if not | + +### Repository metrics + +| Metric name | Type | Labels | Description | +|---------------------------------------|-------|-------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------| +| `garm_repository_info` | Gauge | `id`=<repository id>
`name`=<repository name> | This is a gauge that is set to 1 and expose repository information | +| `garm_repository_pool_manager_status` | Gauge | `id`=<repository id>
`name`=<repository name>
`running`=<true\|false> | This is a gauge that is set to 1 if the repository pool manager is running and set to 0 if not | + +### Provider metrics + +| Metric name | Type | Labels | Description | +|----------------------|-------|-------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------| +| `garm_provider_info` | Gauge | `description`=<provider description>
`name`=<provider name>
`type`=<internal\|external> | This is a gauge that is set to 1 and expose provider information | + +### Pool metrics + +| Metric name | Type | Labels | Description | +|-------------------------------|-------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------| +| `garm_pool_info` | Gauge | `flavor`=<flavor>
`id`=<pool id>
`image`=<image name>
`os_arch`=<defined OS arch>
`os_type`=<defined OS name>
`pool_owner`=<owner name>
`pool_type`=<repository\|organization\|enterprise>
`prefix`=<prefix>
`provider`=<provider name>
`tags`=<concatenated list of pool tags>
| This is a gauge that is set to 1 and expose pool information | +| `garm_pool_status` | Gauge | `enabled`=<true\|false>
`id`=<pool id> | This is a gauge that is set to 1 if the pool is enabled and set to 0 if not | +| `garm_pool_bootstrap_timeout` | Gauge | `id`=<pool id> | This is a gauge that is set to the pool bootstrap timeout | +| `garm_pool_max_runners` | Gauge | `id`=<pool id> | This is a gauge that is set to the pool max runners | +| `garm_pool_min_idle_runners` | Gauge | `id`=<pool id> | This is a gauge that is set to the pool min idle runners | + +### Runner metrics + +| Metric name | Type | Labels | Description | +|--------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------| +| `garm_runner_status` | Gauge | `name`=<runner name>
`pool_owner`=<owner name>
`pool_type`=<repository\|organization\|enterprise>
`provider`=<provider name>
`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown>
`status`=<idle\|pending\|terminated\|installing\|failed\|active>
| This is a gauge value that gives us details about the runners garm spawns | +| `garm_runner_operations_total` | Counter | `provider`=<provider name>
`operation`=<CreateInstance\|DeleteInstance\|GetInstance\|ListInstances\|RemoveAllInstances\|Start\|Stop> | This is a counter that increments every time a runner operation is performed | +| `garm_runner_errors_total` | Counter | `provider`=<provider name>
`operation`=<CreateInstance\|DeleteInstance\|GetInstance\|ListInstances\|RemoveAllInstances\|Start\|Stop> | This is a counter that increments every time a runner operation errored | + +### Github metrics + +| Metric name | Type | Labels | Description | +|--------------------------------|---------|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------| +| `garm_github_operations_total` | Counter | `operation`=<ListRunners\|CreateRegistrationToken\|...>
`scope`=<Organization\|Repository\|Enterprise> | This is a counter that increments every time a github operation is performed | +| `garm_github_errors_total` | Counter | `operation`=<ListRunners\|CreateRegistrationToken\|...>
`scope`=<Organization\|Repository\|Enterprise> | This is a counter that increments every time a github operation errored | + +### Enabling metrics + +Metrics are disabled by default. To enable them, add the following to your config file: + +```toml +[metrics] + +# Toggle to disable authentication (not recommended) on the metrics endpoint. +# If you do disable authentication, I encourage you to put a reverse proxy in front +# of garm and limit which systems can access that particular endpoint. Ideally, you +# would enable some kind of authentication using the reverse proxy, if the built-in auth +# is not sufficient for your needs. +# +# Default: false +disable_auth = true + +# Toggle metrics. If set to false, the API endpoint for metrics collection will +# be disabled. +# +# Default: false +enable = true + +# period is the time interval when the /metrics endpoint will update internal metrics about +# controller specific objects (e.g. runners, pools, etc.) +# +# Default: "60s" +period = "30s" +``` + +You can choose to disable authentication if you wish, however it's not terribly difficult to set up, so I generally advise against disabling it. + +### Configuring prometheus + +The following section assumes that your garm instance is running at `garm.example.com` and has TLS enabled. + +First, generate a new JWT token valid only for the metrics endpoint: + +```bash +garm-cli metrics-token create +``` + +Note: The token validity is equal to the TTL you set in the [JWT config section](#the-jwt-authentication-config-section). + +Copy the resulting token, and add it to your prometheus config file. The following is an example of how to add garm as a target in your prometheus config file: + +```yaml +scrape_configs: + - job_name: "garm" + # Connect over https. If you don't have TLS enabled, change this to http. 
+ scheme: https + static_configs: + - targets: ["garm.example.com"] + authorization: + credentials: "superSecretTokenYouGeneratedEarlier" +``` + +## The JWT authentication config section + +This section configures the JWT authentication used by the API server. GARM is currently a single user system and that user has the right to do anything and everything GARM is capable of. As a result, the JWT auth we have does not include a refresh token. The token is valid for the duration of the time to live (TTL) set in the config file. Once the token expires, you will need to log in again. + +It is recommended that the secret be a long, randomly generated string. Changing the secret at any time will invalidate all existing tokens. + +```toml +[jwt_auth] +# A JWT token secret used to sign tokens. Obviously, this needs to be changed :). +secret = ")9gk_4A6KrXz9D2u`0@MPea*sd6W`%@5MAWpWWJ3P3EqW~qB!!(Vd$FhNc*eU4vG" + +# Time to live for tokens. Both the instances and you will use JWT tokens to +# authenticate against the API. However, this TTL is applied only to tokens you +# get when logging into the API. The tokens issued to the instances we manage, +# have a TTL based on the runner bootstrap timeout set on each pool. The minimum +# TTL for this token is 24h. +time_to_live = "8760h" +``` + +## The API server config section + +This section allows you to configure the GARM API server. The API server is responsible for serving all the API endpoints used by the `garm-cli`, the runners that phone home their status and by GitHub when it sends us webhooks. + +The config options are fairly straight forward. + +```toml +[apiserver] + # Bind the API to this IP + bind = "0.0.0.0" + # Bind the API to this port + port = 9997 + # Whether or not to set up TLS for the API endpoint. If this is set to true, + # you must have a valid apiserver.tls section. 
+ use_tls = false + # Set a list of allowed origins + # By default, if this option is omitted or empty, we will check + # only that the origin is the same as the originating server. + # A literal of "*" will allow any origin + cors_origins = ["*"] + [apiserver.tls] + # Path on disk to a x509 certificate bundle. + # NOTE: if your certificate is signed by an intermediary CA, this file + # must contain the entire certificate bundle needed for clients to validate + # the certificate. This usually means concatenating the certificate and the + # CA bundle you received. + certificate = "" + # The path on disk to the corresponding private key for the certificate. + key = "" +``` + +The GARM API server has the option to enable TLS, but I suggest you use a reverse proxy and enable TLS termination in that reverse proxy. There is an `nginx` sample in this repository with TLS termination enabled. + +You can of course enable TLS in both garm and the reverse proxy. The choice is yours. \ No newline at end of file diff --git a/doc/config_api_server.md b/doc/config_api_server.md deleted file mode 100644 index 7c2703f6..00000000 --- a/doc/config_api_server.md +++ /dev/null @@ -1,34 +0,0 @@ -# The API server config section - -This section allows you to configure the GARM API server. The API server is responsible for serving all the API endpoints used by the `garm-cli`, the runners that phone home their status and by GitHub when it sends us webhooks. - -The config options are fairly straight forward. - -```toml -[apiserver] - # Bind the API to this IP - bind = "0.0.0.0" - # Bind the API to this port - port = 9997 - # Whether or not to set up TLS for the API endpoint. If this is set to true, - # you must have a valid apiserver.tls section. - use_tls = false - # Set a list of allowed origins - # By default, if this option is omitted or empty, we will check - # only that the origin is the same as the originating server. 
- # A literal of "*" will allow any origin - cors_origins = ["*"] - [apiserver.tls] - # Path on disk to a x509 certificate bundle. - # NOTE: if your certificate is signed by an intermediary CA, this file - # must contain the entire certificate bundle needed for clients to validate - # the certificate. This usually means concatenating the certificate and the - # CA bundle you received. - certificate = "" - # The path on disk to the corresponding private key for the certificate. - key = "" -``` - -The GARM API server has the option to enable TLS, but I suggest you use a reverse proxy and enable TLS termination in that reverse proxy. There is an `nginx` sample in this repository with TLS termination enabled. - -You can of course enable TLS in both garm and the reverse proxy. The choice is yours. \ No newline at end of file diff --git a/doc/config_default.md b/doc/config_default.md deleted file mode 100644 index 00dc8a7d..00000000 --- a/doc/config_default.md +++ /dev/null @@ -1,152 +0,0 @@ -# The default config section - -The `default` config section holds configuration options that don't need a category of their own, but are essential to the operation of the service. In this section we will detail each of the options available in the `default` section. - -```toml -[default] -# Uncomment this line if you'd like to log to a file instead of standard output. -# log_file = "/tmp/runner-manager.log" - -# Enable streaming logs via web sockets. Use garm-cli debug-log. -enable_log_streamer = false - -# Enable the golang debug server. See the documentation in the "doc" folder for more information. -debug_server = false -``` - -## The callback_url option - -Your runners will call back home with status updates as they install. Once they are set up, they will also send the GitHub agent ID they were allocated. You will need to configure the ```callback_url``` option in the ```garm``` server config. 
This URL needs to point to the following API endpoint: - - ```txt - POST /api/v1/callbacks/status - ``` - -Example of a runner sending status updates: - - ```bash - garm-cli runner show garm-DvxiVAlfHeE7 - +-----------------+------------------------------------------------------------------------------------+ - | FIELD | VALUE | - +-----------------+------------------------------------------------------------------------------------+ - | ID | 16b96ba2-d406-45b8-ab66-b70be6237b4e | - | Provider ID | garm-DvxiVAlfHeE7 | - | Name | garm-DvxiVAlfHeE7 | - | OS Type | linux | - | OS Architecture | amd64 | - | OS Name | ubuntu | - | OS Version | jammy | - | Status | running | - | Runner Status | idle | - | Pool ID | 8ec34c1f-b053-4a5d-80d6-40afdfb389f9 | - | Addresses | 10.198.117.120 | - | Status Updates | 2023-07-08T06:26:46: runner registration token was retrieved | - | | 2023-07-08T06:26:46: using cached runner found in /opt/cache/actions-runner/latest | - | | 2023-07-08T06:26:50: configuring runner | - | | 2023-07-08T06:26:56: runner successfully configured after 1 attempt(s) | - | | 2023-07-08T06:26:56: installing runner service | - | | 2023-07-08T06:26:56: starting service | - | | 2023-07-08T06:26:57: runner successfully installed | - +-----------------+------------------------------------------------------------------------------------+ - - ``` - -This URL must be set and must be accessible by the instance. If you wish to restrict access to it, a reverse proxy can be configured to accept requests only from networks in which the runners ```garm``` manages will be spun up. This URL doesn't need to be globally accessible, it just needs to be accessible by the instances. 
- -For example, in a scenario where you expose the API endpoint directly, this setting could look like the following: - - ```toml - callback_url = "https://garm.example.com/api/v1/callbacks" - ``` - -Authentication is done using a short-lived JWT token, that gets generated for a particular instance that we are spinning up. That JWT token grants access to the instance to only update it's own status and to fetch metadata for itself. No other API endpoints will work with that JWT token. The validity of the token is equal to the pool bootstrap timeout value (default 20 minutes) plus the garm polling interval (5 minutes). - -There is a sample ```nginx``` config [in the testdata folder](/testdata/nginx-server.conf). Feel free to customize it whichever way you see fit. - -## The metadata_url option - -The metadata URL is the base URL for any information an instance may need to fetch in order to finish setting itself up. As this URL may be placed behind a reverse proxy, you'll need to configure it in the ```garm``` config file. Ultimately this URL will need to point to the following ```garm``` API endpoint: - - ```bash - GET /api/v1/metadata - ``` - -This URL needs to be accessible only by the instances ```garm``` sets up. This URL will not be used by anyone else. To configure it in ```garm``` add the following line in the ```[default]``` section of your ```garm``` config: - - ```toml - metadata_url = "https://garm.example.com/api/v1/metadata" - ``` - -## The debug_server option - -GARM can optionally enable the golang profiling server. This is useful if you suspect garm may be have a bottleneck in any way. To enable the profiling server, add the following section to the garm config: - -```toml -[default] - -debug_server = true -``` - -And restart garm. You can then use the following command to start profiling: - -```bash -go tool pprof http://127.0.0.1:9997/debug/pprof/profile?seconds=120 -``` - -Important note on profiling when behind a reverse proxy. 
The above command will hang for a fairly long time. Most reverse proxies will timeout after about 60 seconds. To avoid this, you should only profile on localhost by connecting directly to garm. - -It's also advisable to exclude the debug server URLs from your reverse proxy and only make them available locally. - -Now that the debug server is enabled, here is a blog post on how to profile golang applications: https://blog.golang.org/profiling-go-programs - - -## The log_file option - -By default, GARM logs everything to standard output. - -You can optionally log to file by adding the following to your config file: - -```toml -[default] -# Use this if you'd like to log to a file instead of standard output. -log_file = "/tmp/runner-manager.log" -``` - -### Rotating log files - -GARM automatically rotates the log if it reaches 500 MB in size or 28 days, whichever comes first. - -However, if you want to manually rotate the log file, you can send a `SIGHUP` signal to the GARM process. - -You can add the following to your systemd unit file to enable `reload`: - -```ini -[Service] -ExecReload=/bin/kill -HUP $MAINPID -``` - -Then you can simply: - -```bash -systemctl reload garm -``` - -## The enable_log_streamer option - -This option allows you to stream garm logs directly to your terminal. Set this option to true, then you can use the following command to stream logs: - -```bash -garm-cli debug-log -``` - -An important note on enabling this option when behind a reverse proxy. The log streamer uses websockets to stream logs to you. You will need to configure your reverse proxy to allow websocket connections. 
If you're using nginx, you will need to add the following to your nginx `server` config: - -```nginx -location /api/v1/ws { - proxy_pass http://garm_backend; - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection "Upgrade"; - proxy_set_header Host $host; -} -``` \ No newline at end of file diff --git a/doc/config_jwt_auth.md b/doc/config_jwt_auth.md deleted file mode 100644 index 7f07d311..00000000 --- a/doc/config_jwt_auth.md +++ /dev/null @@ -1,18 +0,0 @@ -# The JWT authentication config section - -This section configures the JWT authentication used by the API server. GARM is currently a single user system and that user has the right to do anything and everything GARM is capable of. As a result, the JWT auth we have does not include a refresh token. The token is valid for the duration of the time to live (TTL) set in the config file. Once the token expires, you will need to log in again. - -It is recommended that the secret be a long, randomly generated string. Changing the secret at any time will invalidate all existing tokens. - -```toml -[jwt_auth] -# A JWT token secret used to sign tokens. Obviously, this needs to be changed :). -secret = ")9gk_4A6KrXz9D2u`0@MPea*sd6W`%@5MAWpWWJ3P3EqW~qB!!(Vd$FhNc*eU4vG" - -# Time to live for tokens. Both the instances and you will use JWT tokens to -# authenticate against the API. However, this TTL is applied only to tokens you -# get when logging into the API. The tokens issued to the instances we manage, -# have a TTL based on the runner bootstrap timeout set on each pool. The minimum -# TTL for this token is 24h. -time_to_live = "8760h" -``` \ No newline at end of file diff --git a/doc/config_logging.md b/doc/config_logging.md deleted file mode 100644 index ae3b93ed..00000000 --- a/doc/config_logging.md +++ /dev/null @@ -1,35 +0,0 @@ -# The logging section - -GARM has switched to the `slog` package for logging, adding structured logging. 
As such, we added a dedicated `logging` section to the config to tweak the logging settings. We moved the `enable_log_streamer` and the `log_file` options from the `default` section to the `logging` section. They are still available in the `default` section for backwards compatibility, but they are deprecated and will be removed in a future release. - -An example of the new `logging` section: - -```toml -[logging] -# Uncomment this line if you'd like to log to a file instead of standard output. -# log_file = "/tmp/runner-manager.log" - -# enable_log_streamer enables streaming the logs over websockets -enable_log_streamer = true -# log_format is the output format of the logs. GARM uses structured logging and can -# output as "text" or "json" -log_format = "text" -# log_level is the logging level GARM will output. Available log levels are: -# * debug -# * info -# * warn -# * error -log_level = "debug" -# log_source will output information about the function that generated the log line. -log_source = false -``` - -By default GARM logs everything to standard output. You can optionally log to file by adding the `log_file` option to the `logging` section. The `enable_log_streamer` option allows you to stream GARM logs directly to your terminal. Set this option to `true`, then you can use the following command to stream logs: - -```bash -garm-cli debug-log -``` - -The `log_format`, `log_level` and `log_source` options allow you to tweak the logging output. The `log_format` option can be set to `text` or `json`. The `log_level` option can be set to `debug`, `info`, `warn` or `error`. The `log_source` option will output information about the function that generated the log line. All these options influence how the structured logging is output. - -This will allow you to ingest GARM logs in a central location such as an ELK stack or similar. 
\ No newline at end of file diff --git a/doc/config_metrics.md b/doc/config_metrics.md deleted file mode 100644 index 7e5318e3..00000000 --- a/doc/config_metrics.md +++ /dev/null @@ -1,118 +0,0 @@ -# The metrics section - -This is one of the features in GARM that I really love having. For one thing, it's community contributed and for another, it really adds value to the project. It allows us to create some pretty nice visualizations of what is happening with GARM. - -## Common metrics - -| Metric name | Type | Labels | Description | -|--------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| -| `garm_health` | Gauge | `controller_id`=<controller id>
`callback_url`=<callback url>
`controller_webhook_url`=<controller webhook url>
`metadata_url`=<metadata url>
`webhook_url`=<webhook url>
`name`=<hostname> | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. | -| `garm_webhooks_received` | Counter | `valid`=<valid request>
`reason`=<reason for invalid requests> | This is a counter that increments every time GARM receives a webhook from GitHub. | - -## Enterprise metrics - -| Metric name | Type | Labels | Description | -|---------------------------------------|-------|-------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------| -| `garm_enterprise_info` | Gauge | `id`=<enterprise id>
`name`=<enterprise name> | This is a gauge that is set to 1 and expose enterprise information | -| `garm_enterprise_pool_manager_status` | Gauge | `id`=<enterprise id>
`name`=<enterprise name>
`running`=<true\|false> | This is a gauge that is set to 1 if the enterprise pool manager is running and set to 0 if not | - -## Organization metrics - -| Metric name | Type | Labels | Description | -|-----------------------------------------|-------|-----------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------| -| `garm_organization_info` | Gauge | `id`=<organization id>
`name`=<organization name> | This is a gauge that is set to 1 and expose organization information | -| `garm_organization_pool_manager_status` | Gauge | `id`=<organization id>
`name`=<organization name>
`running`=<true\|false> | This is a gauge that is set to 1 if the organization pool manager is running and set to 0 if not | - -## Repository metrics - -| Metric name | Type | Labels | Description | -|---------------------------------------|-------|-------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------| -| `garm_repository_info` | Gauge | `id`=<repository id>
`name`=<repository name> | This is a gauge that is set to 1 and expose repository information | -| `garm_repository_pool_manager_status` | Gauge | `id`=<repository id>
`name`=<repository name>
`running`=<true\|false> | This is a gauge that is set to 1 if the repository pool manager is running and set to 0 if not | - -## Provider metrics - -| Metric name | Type | Labels | Description | -|----------------------|-------|-------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------| -| `garm_provider_info` | Gauge | `description`=<provider description>
`name`=<provider name>
`type`=<internal\|external> | This is a gauge that is set to 1 and expose provider information | - -## Pool metrics - -| Metric name | Type | Labels | Description | -|-------------------------------|-------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------| -| `garm_pool_info` | Gauge | `flavor`=<flavor>
`id`=<pool id>
`image`=<image name>
`os_arch`=<defined OS arch>
`os_type`=<defined OS name>
`pool_owner`=<owner name>
`pool_type`=<repository\|organization\|enterprise>
`prefix`=<prefix>
`provider`=<provider name>
`tags`=<concatenated list of pool tags>
| This is a gauge that is set to 1 and expose pool information | -| `garm_pool_status` | Gauge | `enabled`=<true\|false>
`id`=<pool id> | This is a gauge that is set to 1 if the pool is enabled and set to 0 if not | -| `garm_pool_bootstrap_timeout` | Gauge | `id`=<pool id> | This is a gauge that is set to the pool bootstrap timeout | -| `garm_pool_max_runners` | Gauge | `id`=<pool id> | This is a gauge that is set to the pool max runners | -| `garm_pool_min_idle_runners` | Gauge | `id`=<pool id> | This is a gauge that is set to the pool min idle runners | - -## Runner metrics - -| Metric name | Type | Labels | Description | -|--------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------| -| `garm_runner_status` | Gauge | `name`=<runner name>
`pool_owner`=<owner name>
`pool_type`=<repository\|organization\|enterprise>
`provider`=<provider name>
`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown>
`status`=<idle\|pending\|terminated\|installing\|failed\|active>
| This is a gauge value that gives us details about the runners garm spawns | -| `garm_runner_operations_total` | Counter | `provider`=<provider name>
`operation`=<CreateInstance\|DeleteInstance\|GetInstance\|ListInstances\|RemoveAllInstances\|Start\Stop> | This is a counter that increments every time a runner operation is performed | -| `garm_runner_errors_total` | Counter | `provider`=<provider name>
`operation`=<CreateInstance\|DeleteInstance\|GetInstance\|ListInstances\|RemoveAllInstances\|Start\Stop> | This is a counter that increments every time a runner operation errored | - -## Github metrics - -| Metric name | Type | Labels | Description | -|--------------------------------|---------|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------| -| `garm_github_operations_total` | Counter | `operation`=<ListRunners\|CreateRegistrationToken\|...>
`scope`=<Organization\|Repository\|Enterprise> | This is a counter that increments every time a github operation is performed | -| `garm_github_errors_total` | Counter | `operation`=<ListRunners\|CreateRegistrationToken\|...>
`scope`=<Organization\|Repository\|Enterprise> | This is a counter that increments every time a github operation errored | - -## Enabling metrics - -Metrics are disabled by default. To enable them, add the following to your config file: - -```toml -[metrics] - -# Toggle to disable authentication (not recommended) on the metrics endpoint. -# If you do disable authentication, I encourage you to put a reverse proxy in front -# of garm and limit which systems can access that particular endpoint. Ideally, you -# would enable some kind of authentication using the reverse proxy, if the built-in auth -# is not sufficient for your needs. -# -# Default: false -disable_auth = true - -# Toggle metrics. If set to false, the API endpoint for metrics collection will -# be disabled. -# -# Default: false -enable = true - -# period is the time interval when the /metrics endpoint will update internal metrics about -# controller specific objects (e.g. runners, pools, etc.) -# -# Default: "60s" -period = "30s" -``` - -You can choose to disable authentication if you wish, however it's not terribly difficult to set up, so I generally advise against disabling it. - -## Configuring prometheus - -The following section assumes that your garm instance is running at `garm.example.com` and has TLS enabled. - -First, generate a new JWT token valid only for the metrics endpoint: - -```bash -garm-cli metrics-token create -``` - -Note: The token validity is equal to the TTL you set in the [JWT config section](/doc/config_jwt_auth.md). - -Copy the resulting token, and add it to your prometheus config file. The following is an example of how to add garm as a target in your prometheus config file: - -```yaml -scrape_configs: - - job_name: "garm" - # Connect over https. If you don't have TLS enabled, change this to http. 
- scheme: https - static_configs: - - targets: ["garm.example.com"] - authorization: - credentials: "superSecretTokenYouGeneratedEarlier" -``` \ No newline at end of file diff --git a/doc/database.md b/doc/database.md deleted file mode 100644 index 91c06705..00000000 --- a/doc/database.md +++ /dev/null @@ -1,20 +0,0 @@ -# Database configuration - -GARM currently supports SQLite3. Support for other stores will be added in the future. - -```toml -[database] - # Turn on/off debugging for database queries. - debug = false - # Database backend to use. Currently supported backends are: - # * sqlite3 - backend = "sqlite3" - # the passphrase option is a temporary measure by which we encrypt the webhook - # secret that gets saved to the database, using AES256. In the future, secrets - # will be saved to something like Barbican or Vault, eliminating the need for - # this. This string needs to be 32 characters in size. - passphrase = "shreotsinWadquidAitNefayctowUrph" - [database.sqlite3] - # Path on disk to the sqlite3 database file. - db_file = "/home/runner/garm.db" -``` diff --git a/doc/events.md b/doc/events.md new file mode 100644 index 00000000..6bc61a9d --- /dev/null +++ b/doc/events.md @@ -0,0 +1,254 @@ +# GARM database events + +Starting with GARM version `v0.1.5`, we now have a new websocket endpoint that allows us to subscribe to some events that are emitted by the database watcher. Whenever a database entity is created, updated or deleted, the database watcher will notify all interested consumers that an event has occurred and as part of that event, we get a copy of the database entity that was affected. + +For example, if a new runner is created, the watcher will emit a `Create` event for the `Instances` entity and in the `Payload` field, we will have a copy of the `Instance` entity that was created.
Internally, this will be a golang struct, but when exported via the websocket endpoint, it will be a JSON object, with all sensitive info (passwords, keys, secrets in general) stripped out. + +This document will focus on the websocket endpoint and the events that are exported by it. + +# Entities and operations + +Virtually all database entities are exposed through the events endpoint. These entities are defined in the [database common package](https://github.com/cloudbase/garm/blob/56b0e6065a993fd89c74a8b4ab7de3487544e4e0/database/common/watcher.go#L12-L21). Each of the entity types represents a database table in GARM. + +Those entities are: + +* `repository` - represents a repository in the database +* `organization` - represents an organization in the database +* `enterprise` - represents an enterprise in the database +* `pool` - represents a pool in the database +* `user` - represents a user in the database. Currently GARM is not multi tenant so we just have the "admin" user +* `instance` - represents a runner instance in the database +* `job` - represents a recorded github workflow job in the database +* `controller` - represents a controller in the database. This is the GARM controller. +* `github_credentials` - represents a github credential in the database (PAT, Apps, etc). No sensitive info (token, keys, etc) is ever returned by the events endpoint. +* `github_endpoint` - represents a github endpoint in the database. This holds the github.com default endpoint and any GHES you may add. + +The operations hooked up to the events endpoint and the database watcher are: + +* `create` - emitted when a new entity is created +* `update` - emitted when an entity is updated +* `delete` - emitted when an entity is deleted + +# Event structure + +The event structure is defined in the [database common package](https://github.com/cloudbase/garm/blob/56b0e6065a993fd89c74a8b4ab7de3487544e4e0/database/common/watcher.go#L30-L34).
The structure for a change payload is marshaled into a JSON object as follows: + +```json +{ + "entity-type": "repository", + "operation": "create", + "payload": [object] +} +``` + +Where the `payload` will be a JSON representation of one of the entities defined above. Essentially, you can expect to receive a JSON identical to the one you would get if you made an API call to the GARM REST API for that particular entity. + +Note that in some cases, the `delete` operation will return the full object prior to the deletion of the entity, while others will only ever return the `ID` of the entity. This will probably be changed in future releases to only return the `ID` in case of a `delete` operation, for all entities. You should operate under the assumption that in the future, delete operations will only return the `ID` of the entity. + +# Subscribing to events + +By default the events endpoint returns no events. All events are filtered by default. To start receiving events, you need to emit a message on the websocket connection indicating the entities and/or operations you're interested in. + +This gives you the option to get fine grained control over what you receive at any given point in time. Of course, you can opt to receive everything and deal with the potential deluge (depends on how busy your GARM instance is) on your own. + +## The filter message + +The filter is defined as a JSON that you write over the websocket connection.
That JSON must adhere to the following schema: + +```json +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://github.com/cloudbase/garm/apiserver/events/options", + "$ref": "#/$defs/Options", + "$defs": { + "Filter": { + "properties": { + "operations": { + "items": { + "type": "string", + "enum": [ + "create", + "update", + "delete" + ] + }, + "type": "array", + "title": "operations", + "description": "A list of operations to filter on" + }, + "entity-type": { + "type": "string", + "enum": [ + "repository", + "organization", + "enterprise", + "pool", + "user", + "instance", + "job", + "controller", + "github_credentials", + "github_endpoint" + ], + "title": "entity type", + "description": "The type of entity to filter on", + "default": "repository" + } + }, + "additionalProperties": false, + "type": "object" + }, + "Options": { + "properties": { + "send-everything": { + "type": "boolean", + "title": "send everything", + "default": false + }, + "filters": { + "items": { + "$ref": "#/$defs/Filter" + }, + "type": "array", + "title": "filters", + "description": "A list of filters to apply to the events. This is ignored when send-everything is true" + } + }, + "additionalProperties": false, + "type": "object" + } + } +} +``` + +But I realize a JSON schema is not the best way to explain how to use the filter. The following examples should give you a better idea of how to use the filter. 
+ +### Example 1: Send all events + +```json +{ + "send-everything": true +} +``` + +### Example 2: Send only `create` events for `repository` entities + +```json +{ + "send-everything": false, + "filters": [ + { + "entity-type": "repository", + "operations": ["create"] + } + ] +} +``` + +### Example 3: Send `create` and `update` for repositories and `delete` for instances + +```json +{ + "send-everything": false, + "filters": [ + { + "entity-type": "repository", + "operations": ["create", "update"] + }, + { + "entity-type": "instance", + "operations": ["delete"] + } + ] +} +``` + +## Connecting to the events endpoint + +You can use any websocket client, written in any programming language to interact with the events endpoint. In the following example I'll show you how to do it from go. + +Before we start, we'll need a JWT token to access the events endpoint. Normally, if you use the CLI, you should have it in your `~/.local/share/garm-cli` folder. But if you know your username and password, we can fetch a fresh one using `curl`: + +```bash +# Read the password from the terminal +read -s PASSWD + +# Get the token +curl -s -X POST -d '{"username": "admin", "password": "'$PASSWD'"}' \ + https://garm.example.com/api/v1/auth/login | jq -r .token +``` + +Save the token, we'll need it for later. + +Now, let's write a simple go program that connects to the events endpoint and subscribes to all events. We'll use the reader that was added to [`garm-provider-common`](https://github.com/cloudbase/garm-provider-common) in version `v0.1.3`, to make this easier: + +```go +package main + +import ( + "context" + "fmt" + "os" + "os/signal" + "syscall" + + garmWs "github.com/cloudbase/garm-provider-common/util/websocket" + "github.com/gorilla/websocket" +) + +// List of signals to interrupt the program +var signals = []os.Signal{ + os.Interrupt, + syscall.SIGTERM, +} + +// printToConsoleHandler is a simple function that prints the message to the console.
+// In a real world implementation, you can use this function to decide how to properly +// handle the events. +func printToConsoleHandler(_ int, msg []byte) error { + fmt.Println(string(msg)) + return nil +} + +func main() { + // Set up the context to listen for signals. + ctx, stop := signal.NotifyContext(context.Background(), signals...) + defer stop() + + // This is the JWT token you got from the curl command above. + token := "superSecretJWTToken" + // The base URL of your GARM server + baseURL := "https://garm.example.com" + // This is the path to the events endpoint + pth := "/api/v1/ws/events" + + // Instantiate the websocket reader + reader, err := garmWs.NewReader(ctx, baseURL, pth, token, printToConsoleHandler) + if err != nil { + fmt.Println(err) + return + } + + // Start the loop. + if err := reader.Start(); err != nil { + fmt.Println(err) + return + } + + // Set the filter to receive all events. You can use a more fine grained filter if you wish. + reader.WriteMessage(websocket.TextMessage, []byte(`{"send-everything":true}`)) + + fmt.Println("Listening for events. Press Ctrl+C to stop.") + // Wait for the context to be done. 
+ <-ctx.Done() +} +``` + +If you run this program and change something in the GARM database, you should see the event being printed to the console: + +```bash +gabriel@rossak:/tmp/ex$ go run ./main.go +{"entity-type":"pool","operation":"update","payload":{"runner_prefix":"garm","id":"8ec34c1f-b053-4a5d-80d6-40afdfb389f9","provider_name":"lxd","max_runners":10,"min_idle_runners":0,"image":"ubuntu:22.04","flavor":"default","os_type":"linux","os_arch":"amd64","tags":[{"id":"76781c93-e354-402e-907a-785caab36207","name":"self-hosted"},{"id":"2ff4a89e-e3b4-4e78-b977-6c21e83cca3d","name":"x64"},{"id":"5b3ffec6-0402-4322-b2a9-fa7f692bbc00","name":"Linux"},{"id":"e95e106d-1a3d-11ee-bd1d-00163e1f621a","name":"ubuntu"},{"id":"3b54ae6c-5e9b-4a81-8e6c-0f78a7b37b04","name":"repo"}],"enabled":true,"instances":[],"repo_id":"70227434-e7c0-4db1-8c17-e9ae3683f61e","repo_name":"gsamfira/scripts","runner_bootstrap_timeout":20,"extra_specs":{"disable_updates":true,"enable_boot_debug":true},"github-runner-group":"","priority":10}} +``` + +In the above example, you can see an `update` event on a `pool` entity. The `payload` field contains the full, updated `pool` entity. diff --git a/doc/external_provider.md b/doc/external_provider.md index 428572cd..70b0374d 100644 --- a/doc/external_provider.md +++ b/doc/external_provider.md @@ -1,31 +1,31 @@ # Writing an external provider -External provider enables you to write a fully functional provider, using any scripting or programming language. Garm will call your executable to manage the lifecycle of the instances hosting the runners. This document describes the API that an executable needs to implement to be usable by ```garm```. +External provider enables you to write a fully functional provider, using any scripting or programming language. Garm will call your executable to manage the lifecycle of the instances hosting the runners. This document describes the API that an executable needs to implement to be usable by `garm`. 
## Environment variables -When ```garm``` calls your executable, a number of environment variables are set, depending on the operation. There are three environment variables that will always be set regardless of operation. Those variables are: +When `garm` calls your executable, a number of environment variables are set, depending on the operation. There are three environment variables that will always be set regardless of operation. Those variables are: -* ```GARM_COMMAND``` -* ```GARM_PROVIDER_CONFIG_FILE``` -* ```GARM_CONTROLLER_ID``` +* `GARM_COMMAND` +* `GARM_PROVIDER_CONFIG_FILE` +* `GARM_CONTROLLER_ID` The following are variables that are specific to some operations: -* ```GARM_POOL_ID``` -* ```GARM_INSTANCE_ID``` +* `GARM_POOL_ID` +* `GARM_INSTANCE_ID` ### The GARM_COMMAND variable -The ```GARM_COMMAND``` environment variable will be set to one of the operations defined in the interface. When your executable is called, you'll need to inspect this variable to know which operation you need to execute. +The `GARM_COMMAND` environment variable will be set to one of the operations defined in the interface. When your executable is called, you'll need to inspect this variable to know which operation you need to execute. ### The GARM_PROVIDER_CONFIG_FILE variable -The ```GARM_PROVIDER_CONFIG_FILE``` variable will contain a path on disk to a file that can contain whatever configuration your executable needs. For example, in the case of the [sample OpenStack external provider](../contrib/providers.d/openstack/keystonerc), this file contains variables that you would normally find in a ```keystonerc``` file, used to access an OpenStack cloud. But you can use it to add any extra configuration you need. +The `GARM_PROVIDER_CONFIG_FILE` variable will contain a path on disk to a file that can contain whatever configuration your executable needs. 
For example, in the case of the [OpenStack external provider](https://github.com/cloudbase/garm-provider-openstack), this file is a toml which contains provider specific configuration options. The provider author decides what this file needs to contain for the provider to function properly. -The config is opaque to ```garm``` itself. It only has meaning for your external provider. +GARM does not read this file in any way. It is simply passed to the executable via the environment variable. -In your executable, you could implement something like this: +The OpenStack provider mentioned above is written in Go, but it doesn't need to be. For example, if your provider is written in BASH, handling the config file could look something like this: ```bash if [ -f "${GARM_PROVIDER_CONFIG_FILE}" ];then @@ -54,37 +54,37 @@ esac ### The GARM_CONTROLLER_ID variable -The ```GARM_CONTROLLER_ID``` variable is set for all operations. +The `GARM_CONTROLLER_ID` variable is set for all operations. When garm first starts up, it generates a unique ID that identifies it as an instance. This ID is passed to the provider and should always be used to tag resources in whichever cloud you write your provider for. This ensures that if you have multiple garm installations, one particular deployment of garm will never touch any resources it did not create. -In most clouds you can attach ```tags``` to resources. You can use the controller ID as one of the tags during the ```CreateInstance``` operation. +In most clouds you can attach `tags` to resources. You can use the controller ID as one of the tags during the `CreateInstance` operation. ### The GARM_POOL_ID variable -The ```GARM_POOL_ID``` environment variable is a ```UUID4``` describing the pool in which a runner is created. This variable is set in two operations: +The `GARM_POOL_ID` environment variable is a `UUID4` describing the pool in which a runner is created. 
This variable is set in two operations: * CreateInstance * ListInstances -As with the ```GARM_CONTROLLER_ID```, this ID **must** also be attached as a tag or whichever mechanism your target cloud supports, to identify the pool to which the resources (in most cases the VMs) belong to. +As with the `GARM_CONTROLLER_ID`, this ID **must** also be attached as a tag or whichever mechanism your target cloud supports, to identify the pool to which the resources (in most cases the VMs) belong to. ### The GARM_INSTANCE_ID variable -The ```GARM_INSTANCE_ID``` environment variable is used in four operations: +The `GARM_INSTANCE_ID` environment variable is used in four operations: * GetInstance * DeleteInstance * Start * Stop -It contains the ```provider_id``` of the instance. The ```provider_id``` is a unique identifier, specific to the IaaS in which the compute resource was created. In OpenStack, it's an ```UUID4```, while in LXD, it's the virtual machine's name. +It contains the `provider_id` of the instance. The `provider_id` is a unique identifier, specific to the IaaS in which the compute resource was created. In OpenStack, it's an `UUID4`, while in LXD, it's the virtual machine's name. We need this ID whenever we need to execute an operation that targets one specific runner. ## Operations -The operations that a provider must implement are described in the ```Provider``` [interface available here](https://github.com/cloudbase/garm/blob/223477c4ddfb6b6f9079c444d2f301ef587f048b/runner/providers/external/execution/interface.go#L9-L27). The external provider implements this interface, and delegates each operation to your external executable. 
[These operations are](https://github.com/cloudbase/garm/blob/223477c4ddfb6b6f9079c444d2f301ef587f048b/runner/providers/external/execution/commands.go#L5-L13): +The operations that a provider must implement are described in the `Provider` [interface available here](https://github.com/cloudbase/garm/blob/223477c4ddfb6b6f9079c444d2f301ef587f048b/runner/providers/external/execution/interface.go#L9-L27). The external provider implements this interface, and delegates each operation to your external executable. [These operations are](https://github.com/cloudbase/garm/blob/223477c4ddfb6b6f9079c444d2f301ef587f048b/runner/providers/external/execution/commands.go#L5-L13): * CreateInstance * DeleteInstance @@ -96,30 +96,30 @@ The operations that a provider must implement are described in the ```Provider`` ## CreateInstance -The ```CreateInstance``` command has the most moving parts. The ideal external provider is one that will create all required resources for a fully functional instance, will start the instance. Waiting for the instance to start is not necessary. If the instance can reach the ```callback_url``` configured in ```garm```, it will update it's own status when it starts running the userdata script. +The `CreateInstance` command has the most moving parts. The ideal external provider is one that will create all required resources for a fully functional instance, will start the instance. Waiting for the instance to start is not necessary. If the instance can reach the `callback_url` configured in `garm`, it will update its own status when it starts running the userdata script. But aside from creating resources, the ideal external provider is also idempotent, and will clean up after itself in case of failure. If for any reason the executable will fail to create the instance, any dependency that it has created up to the point of failure, should be cleaned up before returning an error code.
-At the very least, it must be able to clean up those resources, if it is called with the ```DeleteInstance``` command by ```garm```. Garm will retry creating a failed instance. Before it tries again, it will attempt to run a ```DeleteInstance``` using the ```provider_id``` returned by your executable. +At the very least, it must be able to clean up those resources, if it is called with the `DeleteInstance` command by `garm`. Garm will retry creating a failed instance. Before it tries again, it will attempt to run a `DeleteInstance` using the `provider_id` returned by your executable. -If your executable failed before a ```provider_id``` could be supplied, ```garm``` will send the name of the instance as a ```GARM_INSTANCE_ID``` environment variable. +If your executable failed before a `provider_id` could be supplied, `garm` will send the name of the instance as a `GARM_INSTANCE_ID` environment variable. -Your external provider will need to be able to handle both. The instance name generated by ```garm``` will be unique, so it's fairly safe to use when deleting instances. +Your external provider will need to be able to handle both. The instance name generated by `garm` will be unique, so it's fairly safe to use when deleting instances. ### CreateInstance inputs -The ```CreateInstance``` command is the only command that needs to handle standard input. Garm will send the runner bootstrap information in stdin. The environment variables set for this command are: +The `CreateInstance` command is the only command that needs to handle standard input. Garm will send the runner bootstrap information in stdin. 
The environment variables set for this command are: * GARM_PROVIDER_CONFIG_FILE - Config file specific to your executable * GARM_COMMAND - the command we need to run -* GARM_CONTROLLER_ID - The unique ID of the ```garm``` installation +* GARM_CONTROLLER_ID - The unique ID of the `garm` installation * GARM_POOL_ID - The unique ID of the pool this node is a part of -The information sent in via standard input is a ```json``` serialized instance of the [BootstrapInstance structure](https://github.com/cloudbase/garm/blob/6b3ea50ca54501595e541adde106703d289bb804/params/params.go#L164-L217) +The information sent in via standard input is a `json` serialized instance of the [BootstrapInstance structure](https://github.com/cloudbase/garm/blob/6b3ea50ca54501595e541adde106703d289bb804/params/params.go#L164-L217) Here is a sample of that: - ```json +```json { "name": "garm-ny9HeeQYw2rl", "tools": [ @@ -194,20 +194,20 @@ Here is a sample of that: ], "pool_id": "9dcf590a-1192-4a9c-b3e4-e0902974c2c0" } - ``` +``` In your executable you can read in this blob, by using something like this: - ```bash +```bash # Test if the stdin file descriptor is opened if [ ! -t 0 ] then # Read in the information from standard in INPUT=$(cat -) fi - ``` +``` -Then you can easily parse it. If you're using ```bash```, you can use the amazing [jq json processor](https://stedolan.github.io/jq/). Other programming languages have suitable libraries that can handle ```json```. +Then you can easily parse it. If you're using `bash`, you can use the amazing [jq json processor](https://stedolan.github.io/jq/). Other programming languages have suitable libraries that can handle `json`. You will have to parse the bootstrap params, verify that the requested image exists, gather operating system information, CPU architecture information and using that information, you will need to select the appropriate tools for the arch/OS combination you are deploying. 
@@ -220,11 +220,11 @@ Examples of external providers written in Go can be found at the following locat ### CreateInstance outputs -On success, your executable is expected to print to standard output a json that can be deserialized into an ```Instance{}``` structure [defined here](https://github.com/cloudbase/garm/blob/6b3ea50ca54501595e541adde106703d289bb804/params/params.go#L90-L154). +On success, your executable is expected to print to standard output a json that can be deserialized into an `Instance{}` structure [defined here](https://github.com/cloudbase/garm/blob/6b3ea50ca54501595e541adde106703d289bb804/params/params.go#L90-L154). Not all fields are expected to be populated by the provider. The ones that should be set are: - ```json +```json { "provider_id": "88818ff3-1fca-4cb5-9b37-84bfc3511ea6", "name": "garm-ny9HeeQYw2rl", @@ -236,17 +236,17 @@ Not all fields are expected to be populated by the provider. The ones that shoul "pool_id": "41c4a43a-acee-493a-965b-cf340b2c775d", "provider_fault": "" } - ``` +``` -In case of error, ```garm``` expects at the very least to see a non-zero exit code. If possible, your executable should return as much information as possible via the above ```json```, with the ```status``` field set to ```error``` and the ```provider_fault``` set to a meaningful error message describing what has happened. That information will be visible when doing a: +In case of error, `garm` expects at the very least to see a non-zero exit code. If possible, your executable should return as much information as possible via the above `json`, with the `status` field set to `error` and the `provider_fault` set to a meaningful error message describing what has happened. That information will be visible when doing a: - ```bash +```bash garm-cli runner show - ``` +``` ## DeleteInstance -The ```DeleteInstance``` command will permanently remove an instance from the cloud provider. 
+The `DeleteInstance` command will permanently remove an instance from the cloud provider. The environment variables set for this command are: @@ -255,13 +255,13 @@ The environment variables set for this command are: * GARM_INSTANCE_ID * GARM_PROVIDER_CONFIG_FILE -This command is not expected to output anything. On success it should simply ```exit 0```. +This command is not expected to output anything. On success it should simply `exit 0`. If the target instance does not exist in the provider, this command is expected to be a no-op. ## GetInstance -The ```GetInstance``` command will return details about the instance, as seen by the provider. +The `GetInstance` command will return details about the instance, as seen by the provider. The environment variables set for this command are: @@ -270,13 +270,13 @@ The environment variables set for this command are: * GARM_INSTANCE_ID * GARM_PROVIDER_CONFIG_FILE -On success, this command is expected to return a valid ```json``` that can be deserialized into an ```Instance{}``` structure (see CreateInstance). If possible, IP addresses allocated to the VM should be returned in addition to the sample ```json``` printed above. +On success, this command is expected to return a valid `json` that can be deserialized into an `Instance{}` structure (see CreateInstance). If possible, IP addresses allocated to the VM should be returned in addition to the sample `json` printed above. On failure, this command is expected to return a non-zero exit code. ## ListInstances -The ```ListInstances``` command will print to standard output, a json that is deserializable into an **array** of ```Instance{}```. +The `ListInstances` command will print to standard output, a json that is deserializable into an **array** of `Instance{}`. 
The environment variables set for this command are: @@ -285,15 +285,15 @@ The environment variables set for this command are: * GARM_PROVIDER_CONFIG_FILE * GARM_POOL_ID -This command must list all instances that have been tagged with the value in ```GARM_POOL_ID```. +This command must list all instances that have been tagged with the value in `GARM_POOL_ID`. -On success, a ```json``` is expected on standard output. +On success, a `json` is expected on standard output. On failure, a non-zero exit code is expected. ## RemoveAllInstances -The ```RemoveAllInstances``` operation will remove all resources created in a cloud that have been tagged with the ```GARM_CONTROLLER_ID```. External providers should tag all resources they create with the garm controller ID. That tag can then be used to identify all resources when attempting to delete all instances. +The `RemoveAllInstances` operation will remove all resources created in a cloud that have been tagged with the `GARM_CONTROLLER_ID`. External providers should tag all resources they create with the garm controller ID. That tag can then be used to identify all resources when attempting to delete all instances. The environment variables set for this command are: @@ -309,7 +309,7 @@ Note: This command is currently not used by garm. ## Start -The ```Start``` operation will start the virtual machine in the selected cloud. +The `Start` operation will start the virtual machine in the selected cloud. The environment variables set for this command are: @@ -324,9 +324,9 @@ On failure, a non-zero exit code is expected. ## Stop -NOTE: This operation is currently not use by ```garm```, but should be implemented. +NOTE: This operation is currently not used by `garm`, but should be implemented. -The ```Stop``` operation will stop the virtual machine in the selected cloud. +The `Stop` operation will stop the virtual machine in the selected cloud.
Available environment variables: diff --git a/doc/github_credentials.md b/doc/github_credentials.md index 38f92056..a1d335ff 100644 --- a/doc/github_credentials.md +++ b/doc/github_credentials.md @@ -47,7 +47,7 @@ ubuntu@garm:~/garm$ garm-cli github endpoint list GARM has the option to use both [Personal Access Tokens (PAT)](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) or a [GitHub App](https://docs.github.com/en/apps/creating-github-apps/registering-a-github-app/registering-a-github-app). -If you'll use a PAT, you'll have to grant access for the following scopes: +If you'll use a PAT (classic), you'll have to grant access for the following scopes: * ```public_repo``` - for access to a repository * ```repo``` - for access to a private repository @@ -56,6 +56,16 @@ If you'll use a PAT, you'll have to grant access for the following scopes: * ```admin:repo_hook``` - if you want to allow GARM to install webhooks on repositories (optional) * ```admin:org_hook``` - if you want to allow GARM to install webhooks on organizations (optional) +Fine grained PATs are also supported as long as you grant the required privileges: + +* **Repository permissions**: + * `Administration: Read & write` - needed to generate JIT config/registration token, remove runners, etc. + * `Metadata: Read-only` - automatically enabled by above + * `Webhooks: Read & write` - needed to install webhooks on repositories +* **Organization permissions**: + * `Self-hosted runners: Read & write` - needed to manage runners in an organization + * `Webhooks: Read & write` - needed to install webhooks on organizations + If you plan to use github apps, you'll need to select the following permissions: * **Repository permissions**: diff --git a/doc/images/garm-dark.drawio.svg b/doc/images/garm-dark.drawio.svg new file mode 100644 index 00000000..d48f2616 --- /dev/null +++ b/doc/images/garm-dark.drawio.svg @@ -0,0 +1,4 @@ + + + +
GitHub/GHES
GitHub/GHES
GARM
GARM
Incus/LXD
Incus/LXD
Providers
Providers
k8s
k8s
AWS EC2
AWS EC2
Azure
Azure
LXD/Incus
LXD/Incus
Provider creates compute instance in target infrastructure
Provider creates compute instance in target infrastructure
GCP/openstack/etc
GCP/openstack/etc
Web hook endpoint records/updates job
Web hook endpoint records/updates job
Webhook Endpoint
Webhook Endpoint
Job queue
Job queue
Pool manager consumes jobs in "queued" state
Pool manager consumes jobs in "queued" state
Entities (repos/orgs/enterprises)
Entities (repos/orgs/enterprises)
Pool Manager
Pool Manager
Pool leverages provider to create instance
Pool leverages provider to create instance
Pools
(homogeneous set of ephemeral runners)
Pools...
Instances fetch their metadata and report installation progress
Instances fetch their metadata and report installation progress
garm-runnerN
garm-runnerN
Azure
Azure
garm-runnerN
garm-runnerN
AWS EC2
AWS EC2
garm-runnerN
garm-runnerN
k8s
k8s
garm-runnerN
garm-runnerN
Entities (repos/orgs/enterprises)
Entities (repos/orgs/enterprises)
Webhook signals new Job
Webhook signals new Job
Webhooks
Webhooks
Pool manager selects appropriate pool
Pool manager selects appropriate pool
garm-runner1
garm-runner1
garm-runnerN
garm-runnerN
Runner status is updated in the GARM DB
Runner status is updated in the GARM DB
Callback URLs (metadata, status updates, etc)
Callback URLs (metadata, status updates, etc)
Self-hosted runners
Self-hosted runners
garm-runner1
garm-runner1
garm-runner3
garm-runner3
garm-runner2
garm-runner2
garm-runnerN
garm-runnerN
The GitHub runner registers itself in the target entity
The GitHub runner registers itself in the target entity
WebSocket (logs/events)
WebSocket (logs/events)
\ No newline at end of file diff --git a/doc/images/garm-light.drawio.svg b/doc/images/garm-light.drawio.svg new file mode 100644 index 00000000..754eafd9 --- /dev/null +++ b/doc/images/garm-light.drawio.svg @@ -0,0 +1,4 @@ + + + +
GitHub/GHES
GitHub/GHES
GARM
GARM
Incus/LXD
Incus/LXD
Providers
Providers
k8s
k8s
AWS EC2
AWS EC2
Azure
Azure
LXD/Incus
LXD/Incus
Provider creates compute instance in target infrastructure
Provider creates compute instance in target infrastructure
GCP/openstack/etc
GCP/openstack/etc
Web hook endpoint records/updates job
Web hook endpoint records/updates job
Webhook Endpoint
Webhook Endpoint
Job queue
Job queue
Pool manager consumes jobs in "queued" state
Pool manager consumes jobs in "queued" state
Entities (repos/orgs/enterprises)
Entities (repos/orgs/enterprises)
Pool Manager
Pool Manager
Pool leverages provider to create instance
Pool leverages provider to create instance
Pools
(homogeneous set of ephemeral runners)
Pools...
Instances fetch their metadata and report installation progress
Instances fetch their metadata and report installation progress
garm-runnerN
garm-runnerN
Azure
Azure
garm-runnerN
garm-runnerN
AWS EC2
AWS EC2
garm-runnerN
garm-runnerN
k8s
k8s
garm-runnerN
garm-runnerN
Entities (repos/orgs/enterprises)
Entities (repos/orgs/enterprises)
Webhook signals new Job
Webhook signals new Job
Webhooks
Webhooks
Pool manager selects appropriate pool
Pool manager selects appropriate pool
garm-runner1
garm-runner1
garm-runnerN
garm-runnerN
Runner status is updated in the GARM DB
Runner status is updated in the GARM DB
Callback URLs (metadata, status updates, etc)
Callback URLs (metadata, status updates, etc)
Self-hosted runners
Self-hosted runners
garm-runner1
garm-runner1
garm-runner3
garm-runner3
garm-runner2
garm-runner2
garm-runnerN
garm-runnerN
The GitHub runner registers itself in the target entity
The GitHub runner registers itself in the target entity
WebSocket (logs/events)
WebSocket (logs/events)
\ No newline at end of file diff --git a/doc/labels.md b/doc/labels.md index c3b7f82a..12daf605 100644 --- a/doc/labels.md +++ b/doc/labels.md @@ -12,4 +12,4 @@ Before version 2.305.0 of the runner and before JIT runners were introduced, the This made scheduling and using runners a bit awkward in some situations. For example, in large organizations with many teams, often times workflows would simply target the `self-hosted` label. This would match all runners regardless of any other custom labels. This had the side effect that workflows would potentially use expensive runners for simple jobs or would select low resource runners for tasks that would require a lot of resources. -Version 2.305.0 of the runner introduced the `--no-default-labels` flag when registering the runner. When JIT is not available (GHES version < 3.10), GARM will now register the runner with the `--no-default-labels` flag. If you still need the default labels, you can add them to the pool specification. +Version 2.305.0 of the runner introduced the `--no-default-labels` flag when registering the runner. When JIT is not available (GHES version < 3.10), GARM will now register the runner with the `--no-default-labels` flag. If you still need the default labels, you can add them when creating the pool, using the `--tags` command line option. diff --git a/doc/providers.md b/doc/providers.md deleted file mode 100644 index 1fbd71b9..00000000 --- a/doc/providers.md +++ /dev/null @@ -1,60 +0,0 @@ -# Provider configuration - -GARM was designed to be extensible. Providers can be written as external executables. External providers are executables that implement the needed interface to create/delete/list compute systems that are used by ```GARM``` to create runners. - -- [External provider](#external-provider) - - [Available external providers](#available-external-providers) - -## External provider - -The external provider is a special kind of provider.
It delegates the functionality needed to create the runners to external executables. These executables can be either binaries or scripts. As long as they adhere to the needed interface, they can be used to create runners in any target IaaS. This is identical to what ```containerd``` does with ```CNIs```. - -There are currently two sample external providers available in the [contrib folder of this repository](../contrib/providers.d/). The providers are written in ```bash``` and are meant as examples of how a provider could be written in ```bash```. Production ready providers would need more error checking and idempotency, but they serve as an example of what can be done. As it stands, they are functional. - -The configuration for an external provider is quite simple: - -```toml -# This is an example external provider. External providers are executables that -# implement the needed interface to create/delete/list compute systems that are used -# by GARM to create runners. -[[provider]] -name = "openstack_external" -description = "external openstack provider" -provider_type = "external" - [provider.external] - # config file passed to the executable via GARM_PROVIDER_CONFIG_FILE environment variable - config_file = "/etc/garm/providers.d/openstack/keystonerc" - # Absolute path to an executable that implements the provider logic. This executable can be - # anything (bash, a binary, python, etc). See documentation in this repo on how to write an - # external provider. - provider_executable = "/etc/garm/providers.d/openstack/garm-external-provider" -``` - -The external provider has two options: - -* ```provider_executable``` -* ```config_file``` - -The ```provider_executable``` option is the absolute path to an executable that implements the provider logic. GARM will delegate all provider operations to this executable. This executable can be anything (bash, python, perl, go, etc). See [Writing an external provider](./external_provider.md) for more details. 
- -The ```config_file``` option is a path on disk to an arbitrary file, that is passed to the external executable via the environment variable ```GARM_PROVIDER_CONFIG_FILE```. This file is only relevant to the external provider. GARM itself does not read it. In the case of the sample OpenStack provider, this file contains access information for an OpenStack cloud (what you would typically find in a ```keystonerc``` file) as well as some provider specific options like whether or not to boot from volume and which tenant network to use. You can check out the [sample config file](../contrib/providers.d/openstack/keystonerc) in this repository. - -If you want to implement an external provider, you can use this file for anything you need to pass into the binary when ```GARM``` calls it to execute a particular operation. - -### Available external providers - -For non testing purposes, there are two external providers currently available: - -* [OpenStack](https://github.com/cloudbase/garm-provider-openstack) -* [Azure](https://github.com/cloudbase/garm-provider-azure) -* [Kubernetes](https://github.com/mercedes-benz/garm-provider-k8s) - Thanks to the amazing folks at @mercedes-benz for sharing their awesome provider! -* [LXD](https://github.com/cloudbase/garm-provider-lxd) -* [Incus](https://github.com/cloudbase/garm-provider-incus) -* [Equinix Metal](https://github.com/cloudbase/garm-provider-equinix) -* [Amazon EC2](https://github.com/cloudbase/garm-provider-aws) -* [Google Cloud Platform (GCP)](https://github.com/cloudbase/garm-provider-gcp) -* [Oracle Cloud Infrastructure (OCI)](https://github.com/cloudbase/garm-provider-oci) - -Details on how to install and configure them are available in their respective repositories. - -If you wrote a provider and would like to add it to the above list, feel free to open a PR. 
diff --git a/doc/quickstart.md b/doc/quickstart.md index d4be4865..603bd12c 100644 --- a/doc/quickstart.md +++ b/doc/quickstart.md @@ -2,24 +2,21 @@ -- [Quick start](#quick-start) - - [Create the config folder](#create-the-config-folder) - - [The config file](#the-config-file) - - [The provider section](#the-provider-section) - - [Starting the service](#starting-the-service) - - [Using Docker](#using-docker) - - [Setting up GARM as a system service](#setting-up-garm-as-a-system-service) - - [Initializing GARM](#initializing-garm) - - [Setting up the webhook](#setting-up-the-webhook) - - [Creating a GitHub endpoint Optional](#creating-a-github-endpoint-optional) - - [Adding credentials](#adding-credentials) - - [Define a repo](#define-a-repo) - - [Create a pool](#create-a-pool) + - [Create the config folder](#create-the-config-folder) + - [The config file](#the-config-file) + - [The provider section](#the-provider-section) + - [Starting the service](#starting-the-service) + - [Using Docker](#using-docker) + - [Setting up GARM as a system service](#setting-up-garm-as-a-system-service) + - [Initializing GARM](#initializing-garm) + - [Setting up the webhook](#setting-up-the-webhook) + - [Creating a GitHub endpoint Optional](#creating-a-github-endpoint-optional) + - [Adding credentials](#adding-credentials) + - [Define a repo](#define-a-repo) + - [Create a pool](#create-a-pool) -This doc will be updated at a future date with the exact permissions needed in case you want to use a fine grained PAT. - ## Create the config folder All of our config files and data will be stored in `/etc/garm`. Let's create that folder: @@ -28,11 +25,11 @@ All of our config files and data will be stored in `/etc/garm`. Let's create tha sudo mkdir -p /etc/garm ``` -Coincidentally, this is also where the docker container [looks for the config](../Dockerfile#L29) when it starts up. You can either use `Docker` or you can set up garm directly on your system. I'll show you both ways. 
In both cases, we need to first create the config folder and a proper config file. +Coincidentally, this is also where the docker container [looks for the config](/Dockerfile#L29) when it starts up. You can either use `Docker` or you can set up garm directly on your system. We'll walk you through both options. In both cases, we need to first create the config folder and a proper config file. ## The config file -There is a full config file, with detailed comments for each option, in the [testdata folder](../testdata/config.toml). You can use that as a reference. But for the purposes of this guide, we'll be using a minimal config file and add things on as we proceed. +There is a full config file, with detailed comments for each option, in the [testdata folder](/testdata/config.toml). You can use that as a reference. But for the purposes of this guide, we'll be using a minimal config file and add things on as we proceed. Open `/etc/garm/config.toml` in your favorite editor and paste the following: @@ -73,7 +70,7 @@ time_to_live = "8760h" db_file = "/etc/garm/garm.db" ``` -This is a minimal config, with no providers defined. In this example we have the [default](./config_default.md), [logging](./config_logging.md), [metrics](./config_metrics.md), [jwt_auth](./config_jwt_auth.md), [apiserver](./config_api_server.md) and [database](./database.md) sections. Each are documented separately. Feel free to read through the available docs if, for example you need to enable TLS without using an nginx reverse proxy or if you want to enable the debug server, the log streamer or a log file. +This is a minimal config, with no providers defined. 
In this example we have the [default](/doc/config.md#the-default-config-section), [logging](/doc/config.md#the-logging-section), [metrics](/doc/config.md#the-metrics-section), [jwt_auth](/doc/config.md#the-jwt-authentication-config-section), [apiserver](/doc/config.md#the-api-server-config-section) and [database](/doc/config.md#database-configuration) sections. Each are documented separately. Feel free to read through the available docs if, for example you need to enable TLS without using an nginx reverse proxy or if you want to enable the debug server, the log streamer or a log file. In this sample config we: @@ -100,15 +97,13 @@ This is where you have a decision to make. GARM has a number of providers you ca * [Google Cloud Platform (GCP)](https://github.com/cloudbase/garm-provider-gcp) * [Oracle Cloud Infrastructure (OCI)](https://github.com/cloudbase/garm-provider-oci) -All currently available providers are `external`. - The easiest provider to set up is probably the LXD or Incus provider. Incus is a fork of LXD so the functionality is identical (for now). For the purpose of this document, we'll continue with LXD. You don't need an account on an external cloud. You can just use your machine. You will need to have LXD installed and configured. There is an excellent [getting started guide](https://documentation.ubuntu.com/lxd/en/latest/getting_started/) for LXD. Follow the instructions there to install and configure LXD, then come back here. Once you have LXD installed and configured, you can add the provider section to your config file. If you're connecting to the `local` LXD installation, the [config snippet for the LXD provider](https://github.com/cloudbase/garm-provider-lxd/blob/4ee4e6fc579da4a292f40e0f7deca1e396e223d0/testdata/garm-provider-lxd.toml) will work out of the box. We'll be connecting using the unix socket so no further configuration will be needed. -Go ahead and create a new config somwhere where GARM can access it and paste that entire snippet. 
For the purposes of this doc, we'll assume you created a new file called `/etc/garm/garm-provider-lxd.toml`. That config file will be used by the provider itself. Remember, the providers are external executables that are called by GARM. They may have their own configs. +Go ahead and create a new config in a location where GARM can access it and paste that entire snippet. For the purposes of this doc, we'll assume you created a new file called `/etc/garm/garm-provider-lxd.toml`. That config file will be used by the provider itself. Remember, the providers are external executables that are called by GARM. They have their own configs which are relevant only to those executables, not GARM itself. We now need to define the provider in the GARM config file and tell GARM how it can find both the provider binary and the provider specific config file. To do that, open the GARM config file `/etc/garm/config.toml` in your favorite editor and paste the following config snippet at the end: @@ -141,7 +136,7 @@ docker run -d \ ghcr.io/cloudbase/garm:v0.1.4 ``` -You will notice we also mounted the LXD unix socket from the host inside the container where the config you pasted expects to find it. If you plan to use an external provider that does not need to connect to LXD over a unix socket, feel free to remove that mount. +You will notice that we also mounted the LXD unix socket from the host inside the container where the config you pasted expects to find it. If you plan to use an external provider that does not need to connect to LXD over a unix socket, feel free to remove that mount. Check the logs to make sure everything is working as expected: @@ -171,7 +166,7 @@ Adding the `garm` user to the LXD group will allow it to connect to the LXD unix Next, download the latest release from the [releases page](https://github.com/cloudbase/garm/releases). 
```bash -wget -q -O - https://github.com/cloudbase/garm/releases/download/v0.1.4/garm-linux-amd64.tgz | tar xzf - -C /usr/local/bin/ +wget -q -O - https://github.com/cloudbase/garm/releases/download/v0.1.5/garm-linux-amd64.tgz | tar xzf - -C /usr/local/bin/ ``` We'll be running under an unprivileged user. If we want to be able to listen on any port under `1024`, we'll have to set some capabilities on the binary: @@ -204,7 +199,7 @@ Copy the sample `systemd` service file: ```bash wget -O /etc/systemd/system/garm.service \ - https://raw.githubusercontent.com/cloudbase/garm/v0.1.4/contrib/garm.service + https://raw.githubusercontent.com/cloudbase/garm/v0.1.5/contrib/garm.service ``` Reload the `systemd` daemon and start the service: @@ -228,19 +223,18 @@ signal.NotifyContext(context.Background, [interrupt terminated]) 2023/07/17 22:21:33 Loading provider lxd_local 2023/07/17 22:21:33 registering prometheus metrics collectors 2023/07/17 22:21:33 setting up metric routes -2023/07/17 22:21:35 ignoring unknown event ``` Excellent! We have a working GARM installation. Now we need to initialize the controller and set up the webhook in GitHub. ## Initializing GARM -Before we can start using GARM, we need initialize it. This will create the `admin` user and generate a unique controller ID that will identify this GARM installation. This process allows us to use multiple GARM installations with the same GitHub account. GARM will use the controller ID to identify the runners it creates. This way we won't run the risk of accidentally removing runners we don't manage. +Before we can start using GARM, we need to initialize it. This will create the `admin` user and generate a unique controller ID that will identify this GARM installation. This process allows us to use multiple GARM installations with the same GitHub account, if we want or need to. GARM will use the controller ID to identify the runners it creates.
This way we won't run the risk of accidentally removing runners we don't manage. To initialize GARM, we'll use the `garm-cli` tool. You can download the latest release from the [releases page](https://github.com/cloudbase/garm/releases): ```bash -wget -q -O - https://github.com/cloudbase/garm/releases/download/v0.1.4/garm-cli-linux-amd64.tgz | tar xzf - -C /usr/local/bin/ +wget -q -O - https://github.com/cloudbase/garm/releases/download/v0.1.5/garm-cli-linux-amd64.tgz | tar xzf - -C /usr/local/bin/ ``` Now we can initialize GARM: @@ -300,9 +294,9 @@ garm-cli profile switch another_garm ## Setting up the webhook -There are two options when it comes to setting up the webhook in GitHub. You can manually set up the webhook in the GitHub UI, and then use the resulting secret when creating the entity (repo, org, enterprise), or you can let GARM do it automatically if the app or PAT you're using has the [required privileges](./github_credentials.md). +There are two options when it comes to setting up the webhook in GitHub. You can manually set up the webhook in the GitHub UI, and then use the resulting secret when creating the entity (repo, org, enterprise), or you can let GARM do it automatically if the app or PAT you're using has the [required privileges](/doc/github_credentials.md). -If you want to manually set up the webhooks, have a look at the [webhooks doc](./webhooks.md) for more information. +If you want to manually set up the webhooks, have a look at the [webhooks doc](/doc/webhooks.md) for more information. In this guide, I'll show you how to do it automatically when adding a new repo, assuming you have the required privileges. Note, you'll still have to manually set up webhooks if you want to use GARM at the enterprise level. Automatic webhook management is only available for repos and orgs. @@ -339,7 +333,7 @@ In this exampe, we add a new github endpoint called `example`. 
The `ca-cert-path Before we can add a new entity, we need github credentials to interact with that entity (manipulate runners, create webhooks, etc). Credentials are tied to a specific github endpoint. In this section we'll be adding credentials that are valid for either [github.com](https://github.com) or your own GHES server (if you added one in the previous section). -When creating a new entity (repo, org, enterprise) using the credentials you define here, GARM will automatically associate that entity with the gitHub endpoint the credentials use. +When creating a new entity (repo, org, enterprise) using the credentials you define here, GARM will automatically associate that entity with the github endpoint that the credentials use. If you want to swap the credentials for an entity, the new credentials will need to be associated with the same endpoint as the old credentials. @@ -431,7 +425,7 @@ gabriel@rock:~$ garm-cli repo ls +--------------------------------------+----------+--------------+------------------+--------------------+------------------+ | ID | OWNER | NAME | CREDENTIALS NAME | POOL BALANCER TYPE | POOL MGR RUNNING | +--------------------------------------+----------+--------------+------------------+--------------------+------------------+ -| 0c91d9fd-2417-45d4-883c-05daeeaa8272 | gsamfira | scripts | gabriel | pack | true | +| 0c91d9fd-2417-45d4-883c-05daeeaa8272 | gsamfira | scripts | gabriel | roundrobin | true | +--------------------------------------+----------+--------------+------------------+--------------------+------------------+ ``` @@ -516,7 +510,7 @@ gabriel@rock:~$ garm-cli pool ls -a +--------------------------------------+---------------------------+--------------+-----------------+------------------+-------+---------+---------------+----------+ ``` -This pool is enabled, but the `min-idle-runners` option is set to 0. This means that it will not create any lingering runners. It will only create runners when a job is started. 
If your provider is slow to boot up new instances, you may want to set this to a value higher than 0. +This pool is enabled, but the `min-idle-runners` option is set to 0. This means that it will not create any idle runners. It will only create runners when a job is started and a webhook is sent to our GARM server. Optionally, you can set `min-idle-runners` to a value greater than 0, but keep in mind that depending on the provider you use, this may incur cost. For the purposes of this guide, we'll increase it to 1 so we have a runner created. @@ -582,7 +576,7 @@ gabriel@rossak:~$ lxc list If we wait for a bit and run: ```bash -gabriel@rossak:~$ garm-cli runner show garm-tdtD6zpsXhj1 +gabriel@rossak:~$ garm-cli runner show garm-tdtD6zpsXhj1 +-----------------+------------------------------------------------------------------------------------------------------+ | FIELD | VALUE | +-----------------+------------------------------------------------------------------------------------------------------+ @@ -626,4 +620,6 @@ gabriel@rossak:~$ garm-cli job ls There are no jobs sent yet to my GARM install, but once you start sending jobs, you'll see them here as well. -That's it! You now have a working GARM installation. You can add more repos, orgs or enterprises and create more pools. You can also add more providers for different clouds and credentials with access to different GitHub resources. +That's it! Now you have a working GARM installation. You can add more repos, orgs or enterprises and create more pools. You can also add more providers for different clouds and credentials with access to different GitHub resources. + +Check out the [Using GARM](/doc/using_garm.md) guide for more details on how to use GARM. diff --git a/doc/using_garm.md b/doc/using_garm.md index 6f386c00..ba8cf2d6 100644 --- a/doc/using_garm.md +++ b/doc/using_garm.md @@ -3,7 +3,6 @@ This document will walk you through the various commands and options available in GARM. 
It is assumed that you have already installed GARM and have it running. If you haven't, please check out the [quickstart](/doc/quickstart.md) document for instructions on how to install GARM. While using the GARM cli, you will most likely spend most of your time listing pools and runners, but we will cover most of the available commands and options. Some of them we'll skip (like the `init` or `profile` subcommands), as they've been covered in the [quickstart](/doc/quickstart.md) document. - - [Using GARM](#using-garm) @@ -42,13 +41,14 @@ While using the GARM cli, you will most likely spend most of your time listing p - [Showing runner info](#showing-runner-info) - [Deleting a runner](#deleting-a-runner) - [The debug-log command](#the-debug-log-command) + - [The debug-events command](#the-debug-events-command) - [Listing recorded jobs](#listing-recorded-jobs) ## Controller operations -The `controller` is essentially GARM itself. Every deployment of GARM will have its own controller ID which will be used to tag runners in github. The controller is responsible for managing runners, webhooks, repositories, organizations and enterprises. There are a few settings at the controller level which you can tweak and we will cover them below. +The `controller` is essentially GARM itself. Every deployment of GARM will have its own controller ID which will be used to tag runners in github. The controller is responsible for managing runners, webhooks, repositories, organizations and enterprises. There are a few settings at the controller level which you can tweak, which we will cover below. 
### Listing controller info @@ -56,16 +56,18 @@ You can list the controller info by running the following command: ```bash garm-cli controller show -+------------------------+----------------------------------------------------------------------------+ -| FIELD | VALUE | -+------------------------+----------------------------------------------------------------------------+ -| Controller ID | a4dd5f41-8e1e-42a7-af53-c0ba5ff6b0b3 | -| Hostname | garm | -| Metadata URL | https://garm.example.com/api/v1/metadata | -| Callback URL | https://garm.example.com/api/v1/callbacks | -| Webhook Base URL | https://garm.example.com/webhooks | -| Controller Webhook URL | https://garm.example.com/webhooks/a4dd5f41-8e1e-42a7-af53-c0ba5ff6b0b3 | -+------------------------+----------------------------------------------------------------------------+ ++-------------------------+----------------------------------------------------------------------------+ +| FIELD | VALUE | ++-------------------------+----------------------------------------------------------------------------+ +| Controller ID | a4dd5f41-8e1e-42a7-af53-c0ba5ff6b0b3 | +| Hostname | garm | +| Metadata URL | https://garm.example.com/api/v1/metadata | +| Callback URL | https://garm.example.com/api/v1/callbacks | +| Webhook Base URL | https://garm.example.com/webhooks | +| Controller Webhook URL | https://garm.example.com/webhooks/a4dd5f41-8e1e-42a7-af53-c0ba5ff6b0b3 | +| Minimum Job Age Backoff | 30 | +| Version | v0.1.5 | ++-------------------------+----------------------------------------------------------------------------+ ``` There are several things of interest in this output. @@ -76,12 +78,14 @@ There are several things of interest in this output. * `Callback URL` - This URL is configured by the user, and is the URL that is presented to the runners via userdata when they get set up. 
Runners will connect to this URL and send status updates and system information (OS version, OS name, github runner agent ID, etc) to the controller. Runners must be able to connect to this URL. * `Webhook Base URL` - This is the base URL for webhooks. It is configured by the user in the GARM config file. This URL can be called into by GitHub itself when hooks get triggered by a workflow. GARM needs to know when a new job is started in order to schedule the creation of a new runner. Job webhooks sent to this URL will be recorded by GARM and acted upon. While you can configure this URL directly in your GitHub repo settings, it is advised to use the `Controller Webhook URL` instead, as it is unique to each controller, and allows you to potentially install multiple GARM controller inside the same repo. Github must be able to connect to this URL. * `Controller Webhook URL` - This is the URL that GitHub will call into when a webhook is triggered. This URL is unique to each GARM controller and is the preferred URL to use in order to receive webhooks from GitHub. It serves the same purpose as the `Webhook Base URL`, but is unique to each controller, allowing you to potentially install multiple GARM controllers inside the same repo. Github must be able to connect to this URL. +* `Minimum Job Age Backoff` - This is the job age in seconds, after which GARM will consider spinning up a new runner to handle it. By default GARM waits for 30 seconds after receiving a new job, before it spins up a runner. This delay is there to allow any existing idle runners (managed by GARM or not) to pick up the job, before reacting to it. This way we avoid being too eager and spinning up a runner for a job that would have been picked up by an existing runner anyway. You can set this to 0 if you want GARM to react immediately. +* `Version` - This is the version of GARM that is running. We will see the `Controller Webhook URL` later when we set up the GitHub repo to send webhooks to GARM.
### Updating controller settings -Like we've mentioned before, there are 3 URLs that are very important for normal operations: +As we've mentioned before, there are 3 URLs that are very important for normal operations: * `metadata_url` - Must be reachable by runners * `callback_url` - Must be reachable by runners @@ -112,7 +116,7 @@ GARM uses providers to create runners. These providers are external executables ### Listing configured providers -Once configured (see [provider configuration](/doc/providers.md)), you can list the configured providers by running the following command: +Once configured (see [provider configuration](/doc/config.md#providers)), you can list the configured providers by running the following command: ```bash ubuntu@garm:~$ garm-cli provider list @@ -141,7 +145,7 @@ Each of these providers can be used to set up a runner pool for a repository, or GARM can be used to manage runners for repos, orgs and enterprises hosted on `github.com` or on a GitHub Enterprise Server. -Endpoints are the way that GARM identifies where the credentials and entities you create, are located and where the API endpoints for the GitHub API can be reached, along with a possible CA certificate that validates the connection. There is a default endpoint for `github.com`, so you don't need to add it. But if you're using GHES, you'll need to add an endpoint for it. +Endpoints are the way that GARM identifies where the credentials and entities you create are located and where the API endpoints for the GitHub API can be reached, along with a possible CA certificate that validates the connection. There is a default endpoint for `github.com`, so you don't need to add it, unless you're using GHES. 
### Creating a GitHub Endpoint @@ -214,7 +218,7 @@ garm-cli github endpoint show github.com ### Deleting a GitHub Endpoint -You can delete an endpoint unless one of the following conditions is met: +You can delete an endpoint unless any of the following conditions are met: * The endpoint is the default endpoint for `github.com` * The endpoint is in use by a repository, organization or enterprise @@ -237,7 +241,7 @@ There are two types of credentials: * PAT - Personal Access Token * App - GitHub App -To add each of these types of credentials requires slightly different command line arguments (obviously). I'm going to give you an example of both. +To add each of these types of credentials, slightly different command line arguments (obviously) are required. I'm going to give you an example of both. To add a PAT, you can run the following command: @@ -314,7 +318,7 @@ To delete a credential, you can run the following command: garm-cli github credentials delete 2 ``` -Note, you may not delete credentials that are currently associated with a repository, organization or enterprise. You will need to first replace the credentials on the entity, and then you can delete the credentials. +> **NOTE**: You may not delete credentials that are currently associated with a repository, organization or enterprise. You will need to first replace the credentials on the entity, and then you can delete the credentials. ## Repositories @@ -377,7 +381,7 @@ garm-cli repository delete be3a0673-56af-4395-9ebf-4521fea67567 This will remove the repository from GARM, and if a webhook was installed, will also clean up the webhook from the repository. -Note: GARM will not remove a webhook that points to the `Base Webhook URL`. It will only remove webhooks that are namespaced to the running controller. +> **NOTE**: GARM will not remove a webhook that points to the `Base Webhook URL`. It will only remove webhooks that are namespaced to the running controller. 
## Organizations @@ -403,9 +407,9 @@ ubuntu@garm:~$ garm-cli organization add \ This will add the organization `gsamfira` to GARM, and install a webhook for it. The webhook will be validated against the secret that was generated. The only difference between adding an organization and adding a repository is that you use the `organization` subcommand instead of the `repository` subcommand, and the `--name` option represents the `name` of the organization. -Managing webhooks for organizations is similar to managing webhooks for repositories. You can list, show, install and uninstall webhooks for organizations using the `garm-cli organization webhook` subcommand. We won't go into details here, as it's similar to managing webhooks for repositories. +Managing webhooks for organizations is similar to managing webhooks for repositories. You can *list*, *show*, *install* and *uninstall* webhooks for organizations using the `garm-cli organization webhook` subcommand. We won't go into details here, as it's similar to managing webhooks for repositories. -All the other operations that exist on repositories, like listing, removing, etc, also exist for organizations and enterprises. Have a look at the help for the `garm-cli organization` subcommand for more details. +All the other operations that exist on repositories, like listing, removing, etc, also exist for organizations and enterprises. Check out the help for the `garm-cli organization` subcommand for more details. ## Enterprises @@ -493,11 +497,11 @@ To manually add a webhook, see the [webhooks](/doc/webhooks.md) section. Now that we have a repository, organization or enterprise added to GARM, we can create a runner pool for it. A runner pool is a collection of runners of the same type, that are managed by GARM and are used to run workflows for the repository, organization or enterprise. 
-You can create multiple pools of runners for the same entity (repository, organization or enterprise), and you can create pools of runners of different types. For example, you can have a pool of runners that are created on AWS, and another pool of runners that are created on Azure, k8s, LXD, etc. For repositories or organizations with complex needs, you can set up a number of pools that cover a wide range of needs, based on cost, capability (GPUs, FPGAs, etc) or sheer raw computing power. You don't have to pick just one and managing all of them is done using the exact same commands, as we'll show below. +You can create multiple pools of runners for the same entity (repository, organization or enterprise), and each pool can define a different type of runner. For example, you can have a pool of runners that are created on AWS, and another pool of runners that are created on Azure, k8s, LXD, etc. For repositories or organizations with complex needs, you can set up a number of pools that cover a wide range of needs, based on cost, capability (GPUs, FPGAs, etc) or sheer raw computing power. You don't have to pick just one, especially since managing all of them is done using the exact same commands, as we'll show below. -Before we create a pool, we have to decide on which provider we want to use. We've listed the providers above, so let's pick one and create a pool of runners for our repository. For the purpose of this example, we'll use the `incus` provider. We'll show you how to create a pool using this provider, but keep in mind that adding another pool using a different provider is done using the exact same commands. The only difference will be in the `--image`, `--flavor` and `--extra-specs` options that you'll use when creating the pool. +Before we create a pool, we have to decide which provider we want to use. We've listed the providers above, so let's pick one and create a pool of runners for our repository.
For the purpose of this example, we'll use the `incus` provider. We'll show you how to create a pool using this provider, but keep in mind that adding another pool using a different provider is done using the exact same commands. The only difference will be in the `--image`, `--flavor` and `--extra-specs` options that you'll use when creating the pool. -Out of those three options, only the `--image` and `--flavor` are mandatory. The `--extra-specs` option is optional and is used to pass additional information to the provider when creating the pool. The `--extra-specs` option is provider specific, and you'll have to consult the provider documentation to see what options are available. +Out of those three options, only the `--image` and `--flavor` are mandatory. The `--extra-specs` flag is optional and is used to pass additional information to the provider when creating the pool. The `--extra-specs` option is provider specific, and you'll have to consult the provider documentation to see what options are available. But I digress. Let's create a pool of runners using the `incus` provider, for the `gabriel-samfira/garm` repository we created above: @@ -532,7 +536,7 @@ garm-cli pool add \ +--------------------------+----------------------------------------+ ``` -Let's unpack the command a bit and explain what happened above. We added a new pool of runners to GARM, that belongs to the `gabriel-samfira/garm` repository. We used the `incus` provider to create the pool, and we specified the `--image` and `--flavor` options to tell the provider what kind of runners we want to create. On Incus and LXD, the flavor maps to a `profile` and the image maps to an incus or LXD image, as you would normally use when spinning up a new container or VM using the `incus launch` command. +Let's unpack the command and explain what happened above. We added a new pool of runners to GARM, that belongs to the `gabriel-samfira/garm` repository. 
We used the `incus` provider to create the pool, and we specified the `--image` and `--flavor` options to tell the provider what kind of runners we want to create. On Incus and LXD, the flavor maps to a `profile`. The profile can specify the resources allocated to a container or VM (RAM, CPUs, disk space, etc). The image maps to an incus or LXD image, as you would normally use when spinning up a new container or VM using the `incus launch` command. We also specified the `--min-idle-runners` option to tell GARM to always keep at least 1 runner idle in the pool. This is useful for repositories that have a lot of workflows that run often, and we want to make sure that we always have a runner ready to pick up a job. @@ -692,7 +696,7 @@ root@incus:~# incus list +-------------------+---------+----------------------+-----------------------------------------------+-----------+-----------+ ``` -Awesome! This runner will be able to pick up bobs that match the labels we've set on the pool. +Awesome! This runner will be able to pick up jobs that match the labels we've set on the pool. ## Runners @@ -781,6 +785,24 @@ time=2024-02-12T08:36:31.251Z level=INFO msg=access_log method=GET uri=/api/v1/i This will bring a real-time log to your terminal. While this feature should be fairly secure, I encourage you to only expose it within networks you know are secure. This can be done by configuring a reverse proxy in front of GARM that only allows connections to the websocket endpoint from certain locations. +## The debug-events command + +Starting with GARM v0.1.5, a new command has been added to the CLI that consumes database events recorded by GARM. Whenever something is updated in the database, a new event is generated. These events are generated by the database watcher and are also exported via a websocket endpoint. This websocket endpoint is meant to be consumed by applications that wish to integrate with GARM and want to avoid having to poll the API.
+ +This command is not meant to be used to integrate GARM events; it is merely a debug tool that allows you to see what events are being generated by GARM. To use it, you can run: + +```bash +garm-cli debug-events --filters='{"send-everything": true}' +``` + +This command will send all events to your CLI as they happen. You can also filter by entity or operation like so: + +```bash +garm-cli debug-events --filters='{"filters": [{"entity-type": "instance", "operations": ["create", "delete"]}, {"entity-type": "pool"}, {"entity-type": "controller"}]}' +``` + +The payloads that get sent to your terminal are described in the [events](/doc/events.md) section, but the short description is that you get the operation type (create, update, delete), the entity type (instance, pool, repo, etc) and the JSON payload as you normally would when you fetch them through the API. Sensitive info like tokens or passwords is never returned. + ## Listing recorded jobs GARM will record any job that comes in and for which we have a pool configured. If we don't have a pool for a particular job, then that job is ignored. There is no point in recording jobs that we can't do anything about. It would just bloat the database for no reason.