From aadc60601cac119598c97dc210bcee5868e83754 Mon Sep 17 00:00:00 2001
From: Callum Dunster
Date: Tue, 17 Sep 2024 11:27:08 +0200
Subject: [PATCH] Add self-hosted runner and manual execution of performance tests (#88)

* chore(ci): add performance tests workflow with manual trigger
* chore(ci): set the tests reporter to be influx-file
* chore(ci): update duration of performance tests to 120 seconds
* chore(ci): add extra-args option for scenarios that need it
* chore(ci): add printing of telegraf errors and warnings
* chore(ci): get Run ID from logs and set as output and summary
* chore(ci): add step to upload logs as artifacts
* chore(ci): add link to logs in job summary
* chore(ci): increase log level of TryCP tests to info
* chore(ci): add script to flake to call telegraf as expected in the CI

  This is so that we use the telegraf version from the lockfile.

* docs(ci): remove out-dated TODO
* chore(ci): split the app_install scenario in two

  This tests installing the minimally-small and large hApps separately.
---
 .github/workflows/performance.yaml | 152 ++++++++++++++++++++
 .github/workflows/test.yaml        |   1 -
 flake.nix                          |   5 +
 telegraf/runner-telegraf.conf      | 216 +++++++++++++++++++++++++++++
 4 files changed, 373 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/performance.yaml
 create mode 100644 telegraf/runner-telegraf.conf

diff --git a/.github/workflows/performance.yaml b/.github/workflows/performance.yaml
new file mode 100644
index 00000000..d9197d3e
--- /dev/null
+++ b/.github/workflows/performance.yaml
@@ -0,0 +1,152 @@
+name: "Performance Tests"
+
+on:
+  workflow_dispatch:
+
+env:
+  INFLUX_TOKEN: ${{ secrets.INFLUX_TOKEN }}
+  WT_METRICS_DIR: "${{ github.workspace }}/telegraf/metrics"
+
+jobs:
+  local-test:
+    runs-on: [self-hosted, wind-tunnel]
+    strategy:
+      fail-fast: false
+      matrix:
+        # To run a local test with default configuration, add the scenario name to this array.
+        scenario: [ zome_call_single_value, single_write_many_read, write_read, write_query, local_signals, write_validated ]
+        # To run a local test with additional configuration, add the scenario name and `extra-args` as an `include` item.
+        include:
+          - scenario: dht_sync_lag
+            extra-args: "--agents 2 --behaviour write:1 --behaviour record_lag:1"
+
+          # Test how long it takes to install a minimally-small hApp.
+          - scenario: app_install
+            extra-args: "--behaviour minimal:1"
+
+          # Test how long it takes to install a large hApp.
+          - scenario: app_install
+            extra-args: "--behaviour large:1"
+
+          - scenario: first_call
+            extra-args: "--agents 1 --behaviour local:1"
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run - ${{ matrix.scenario }}
+        id: run_test
+        run: |
+          # Start a sandbox conductor and run it in the background
+          nix develop .#ci -c bash -c "hc s clean && echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &"
+
+          RUST_LOG=info nix run .#${{ matrix.scenario }} -- --connection-string ws://localhost:8888 --duration 120 --no-progress --reporter influx-file ${{ matrix.extra-args }} > >(tee logs/scenario-stdout.log) 2> >(tee logs/scenario-stderr.log >&2)
+
+          pkill hc && pkill holochain && pkill lair-keystore
+
+          RUN_ID=$(grep -m1 "#RunId" logs/scenario-stdout.log | sed 's/#RunId: \[\(.\+\)\]/\1/')
+          echo "RUN_ID=$RUN_ID" >> "$GITHUB_OUTPUT"
+          echo "# Run ID: $RUN_ID" >> $GITHUB_STEP_SUMMARY
+
+      - name: Run Telegraf to upload influx metrics
+        run: |
+          if ! nix run .#ci-telegraf
+          then
+            echo "::group::Telegraf errors"
+            status=1
+            # Print errors as such in GitHub logs.
+            grep "E!" logs/telegraf-stderr.log | xargs -l echo "::error ::"
logs/telegraf-stderr.log | xargs -l echo "::error ::" + echo "::endgroup::" + fi + + echo "::group::Telegraf warnings" + # Print warnings as such in GitHub logs. + grep "W!" logs/telegraf-stderr.log | xargs -l echo "::warning ::" + echo "::endgroup::" + + exit ${status-0} + + - name: Upload logs as artifacts + if: success() || failure() + id: upload-artifact + uses: actions/upload-artifact@v4 + with: + name: "logs_${{ matrix.scenario }}_${{ steps.run_test.outputs.RUN_ID }}" + path: | + logs/scenario-stdout.log + logs/scenario-stderr.log + logs/telegraf-stdout.log + logs/telegraf-stderr.log + + - name: Output Path to logs in summary + run: | + echo "# Logs: [${{ steps.upload-artifact.outputs.artifact-id }}](${{ steps.upload-artifact.outputs.artifact-url }})" >> $GITHUB_STEP_SUMMARY + + trycp-test: + runs-on: [self-hosted, wind-tunnel] + strategy: + fail-fast: false + matrix: + # To run a test with TryCP and default configuration, add the scenario name to this array. + scenario: [ trycp_write_validated, remote_call_rate, validation_receipts ] + # To run a test with TryCP and additional configuration, add the scenario name and `extra-args` as an `include` item. + include: + - scenario: two_party_countersigning + extra-args: "--behaviour initiate:1 --behaviour participate:1" + steps: + - uses: actions/checkout@v4 + + - name: Run - ${{ matrix.scenario }} + id: run_test + run: | + set -x + + # Start local network services + nix develop .#ci -c bash -c "hc-run-local-services --bootstrap-port 4422 --signal-port 4423 &" + # Start a TryCP instance + nix develop .#ci -c bash -c "source ./scripts/trycp.sh && start_trycp &" + + RUST_LOG=info CONDUCTOR_CONFIG="CI" TRYCP_RUST_LOG="info" MIN_PEERS=2 nix run .#${{ matrix.scenario }} -- --targets targets-ci.yaml --instances-per-target 2 --duration 120 --no-progress --reporter influx-file ${{ matrix.extra-args }} > >(tee logs/scenario-stdout.log) 2> >(tee logs/scenario-stderr.log >&2) + + # Stop the TryCP instance + nix develop .#ci -c bash -c "source ./scripts/trycp.sh && stop_trycp" + # Stop local network services + pkill hc-run-local + + RUN_ID=$(grep -m1 "#RunId" logs/scenario-stdout.log | sed 's/#RunId: \[\(.\+\)\]/\1/') + echo "RUN_ID=$RUN_ID" >> "$GITHUB_OUTPUT" + echo "# Run ID: $RUN_ID" >> $GITHUB_STEP_SUMMARY + + - name: Run Telegraf to upload influx metrics + run: | + if ! nix run .#ci-telegraf + then + echo "::group::Telegraf errors" + status=1 + # Print errors as such in GitHub logs. + grep "E!" logs/telegraf-stderr.log | xargs -l echo "::error ::" + echo "::endgroup::" + fi + + echo "::group::Telegraf warnings" + # Print warnings as such in GitHub logs. + grep "W!" 
logs/telegraf-stderr.log | xargs -l echo "::warning ::" + echo "::endgroup::" + + exit ${status-0} + + - name: Upload logs as artifacts + if: success() || failure() + id: upload-artifact + uses: actions/upload-artifact@v4 + with: + name: "logs_${{ matrix.scenario }}_${{ steps.run_test.outputs.RUN_ID }}" + path: | + logs/scenario-stdout.log + logs/scenario-stderr.log + logs/telegraf-stdout.log + logs/telegraf-stderr.log + logs/${{ steps.run_test.outputs.RUN_ID }}/ + + - name: Output Path to logs in summary + run: | + echo "# Logs: [${{ steps.upload-artifact.outputs.artifact-id }}](${{ steps.upload-artifact.outputs.artifact-url }})" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9c804247..3ee858da 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -72,7 +72,6 @@ jobs: # Start a sandbox conductor and run it in the background nix develop .#ci -c bash -c "hc s clean && echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &" - # TODO using `localhost` is resolving to an IPv6 address, but why is that giving a connection refused? # Run the scenario for 5 seconds RUST_LOG=info nix run .#zome_call_single_value -- --connection-string ws://localhost:8888 --duration 5 --no-progress diff --git a/flake.nix b/flake.nix index d0d79503..ea139cff 100644 --- a/flake.nix +++ b/flake.nix @@ -111,6 +111,11 @@ packages = { default = config.workspace.workspace; inherit (config.workspace) workspace; + ci-telegraf = pkgs.writeShellApplication { + name = "ci-telegraf"; + runtimeInputs = [ pkgs.telegraf ]; + text = "telegraf --config telegraf/runner-telegraf.conf --once > >(tee logs/telegraf-stdout.log) 2> >(tee logs/telegraf-stderr.log >&2)"; + }; }; checks = { diff --git a/telegraf/runner-telegraf.conf b/telegraf/runner-telegraf.conf new file mode 100644 index 00000000..7b6ff6a8 --- /dev/null +++ b/telegraf/runner-telegraf.conf @@ -0,0 +1,216 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply surround +# them with ${}. For strings the variable must be within quotes (ie, "${STR_VAR}"), +# for numbers and booleans they should be plain (ie, ${INT_VAR}, ${BOOL_VAR}) + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "10s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## Maximum number of unwritten metrics per output. Increasing this value + ## allows for longer periods of output downtime without dropping metrics at the + ## cost of higher maximum memory usage. + metric_buffer_limit = 1000000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. 
+  ## This can be used to avoid many plugins querying things like sysfs at the
+  ## same time, which can have a measurable effect on the system.
+  collection_jitter = "0s"
+
+  ## Collection offset is used to shift the collection by the given amount.
+  ## This can be used to avoid many plugins querying constrained devices
+  ## at the same time by manually scheduling them in time.
+  # collection_offset = "0s"
+
+  ## Default flushing interval for all outputs. Maximum flush_interval will be
+  ## flush_interval + flush_jitter
+  flush_interval = "60s"
+  ## Jitter the flush interval by a random amount. This is primarily to avoid
+  ## large write spikes for users running a large number of telegraf instances.
+  ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
+  flush_jitter = "0s"
+
+  ## Collected metrics are rounded to the precision specified. Precision is
+  ## specified as an interval with an integer + unit (e.g. 0s, 10ms, 2us, 4s).
+  ## Valid time units are "ns", "us" (or "µs"), "ms", "s".
+  ##
+  ## By default or when set to "0s", precision will be set to the same
+  ## timestamp order as the collection interval, with the maximum being 1s:
+  ##   ie, when interval = "10s", precision will be "1s"
+  ##       when interval = "250ms", precision will be "1ms"
+  ##
+  ## Precision will NOT be used for service inputs. It is up to each individual
+  ## service input to set the timestamp at the appropriate precision.
+  precision = "0s"
+
+  ## Log at debug level.
+  # debug = false
+  ## Log only error level messages.
+  # quiet = false
+
+  ## Log target controls the destination for logs and can be one of "file",
+  ## "stderr" or, on Windows, "eventlog". When set to "file", the output file
+  ## is determined by the "logfile" setting.
+  # logtarget = "file"
+
+  ## Name of the file to be logged to when using the "file" logtarget. If set to
+  ## the empty string then logs are written to stderr.
+  # logfile = ""
+
+  ## The logfile will be rotated after the time interval specified. When set
+  ## to 0 no time based rotation is performed. Logs are rotated only when
+  ## written to, if there is no log activity rotation may be delayed.
+  # logfile_rotation_interval = "0h"
+
+  ## The logfile will be rotated when it becomes larger than the specified
+  ## size. When set to 0 no size based rotation is performed.
+  # logfile_rotation_max_size = "0MB"
+
+  ## Maximum number of rotated archives to keep, any older logs are deleted.
+  ## If set to -1, no archives are removed.
+  # logfile_rotation_max_archives = 5
+
+  ## Pick a timezone to use when logging or type 'local' for local time.
+  ## Example: America/Chicago
+  # log_with_timezone = ""
+
+  ## Override default hostname, if empty use os.Hostname()
+  hostname = ""
+  ## If set to true, do not set the "host" tag in the telegraf agent.
+  omit_hostname = false
+
+  ## Method of translating SNMP objects. Can be "netsnmp" (deprecated) which
+  ## translates by calling external programs snmptranslate and snmptable,
+  ## or "gosmi" which translates using the built-in gosmi library.
+  # snmp_translator = "netsnmp"
+
+  ## Name of the file to load the state of plugins from and store the state to.
+  ## If uncommented and not empty, this file will be used to save the state of
+  ## stateful plugins on termination of Telegraf. If the file exists on start,
+  ## the state in the file will be restored for the plugins.
+  # statefile = ""
+
+
+###############################################################################
+#                            OUTPUT PLUGINS                                   #
+###############################################################################
+
+
+# Configuration for sending metrics to InfluxDB 2.0
+[[outputs.influxdb_v2]]
+  ## The URLs of the InfluxDB cluster nodes.
+  ##
+  ## Multiple URLs can be specified for a single cluster, only ONE of the
+  ## urls will be written to each interval.
+  ## ex: urls = ["https://us-west-2-1.aws.cloud2.influxdata.com"]
+  urls = ["https://ifdb.holochain.org"]
+
+  ## Token for authentication.
+  token = "${INFLUX_TOKEN}"
+
+  ## Organization is the name of the organization you wish to write to.
+  organization = "holo"
+
+  ## Destination bucket to write into.
+  bucket = "windtunnel"
+
+  ## The value of this tag will be used to determine the bucket. If this
+  ## tag is not set the 'bucket' option is used as the default.
+  # bucket_tag = ""
+
+  ## If true, the bucket tag will not be added to the metric.
+  # exclude_bucket_tag = false
+
+  ## Timeout for HTTP messages.
+  # timeout = "5s"
+
+  ## Additional HTTP headers
+  # http_headers = {"X-Special-Header" = "Special-Value"}
+
+  ## HTTP Proxy override, if unset values the standard proxy environment
+  ## variables are consulted to determine which proxy, if any, should be used.
+  # http_proxy = "http://corporate.proxy:3128"
+
+  ## HTTP User-Agent
+  # user_agent = "telegraf"
+
+  ## Content-Encoding for write request body, can be set to "gzip" to
+  ## compress body or "identity" to apply no encoding.
+  # content_encoding = "gzip"
+
+  ## Enable or disable uint support for writing uints influxdb 2.0.
+  # influx_uint_support = false
+
+  ## HTTP/2 Timeouts
+  ## The following values control the HTTP/2 client's timeouts. These settings
+  ## are generally not required unless a user is seeing issues with client
+  ## disconnects. If a user does see issues, then it is suggested to set these
+  ## values to "15s" for ping timeout and "30s" for read idle timeout and
+  ## retry.
+  ##
+  ## Note that the timer for read_idle_timeout begins at the end of the last
+  ## successful write and not at the beginning of the next write.
+  # ping_timeout = "0s"
+  # read_idle_timeout = "0s"
+
+  ## Optional TLS Config for use on HTTP connections.
+  # tls_ca = "/etc/telegraf/ca.pem"
+  # tls_cert = "/etc/telegraf/cert.pem"
+  # tls_key = "/etc/telegraf/key.pem"
+  ## Use TLS but skip chain & host verification
+  # insecure_skip_verify = false
+
+
+###############################################################################
+#                             INPUT PLUGINS                                   #
+###############################################################################
+
+
+[[inputs.file]]
+  ## Files to parse each interval. Accept standard unix glob matching rules,
+  ## as well as ** to match recursive files and directories.
+  files = ["${WT_METRICS_DIR}/*.influx"]
+
+  ## Character encoding to use when interpreting the file contents. Invalid
+  ## characters are replaced using the unicode replacement character. When set
+  ## to the empty string the data is not decoded to text.
+  ##   ex: character_encoding = "utf-8"
+  ##       character_encoding = "utf-16le"
+  ##       character_encoding = "utf-16be"
+  ##       character_encoding = ""
+  character_encoding = "utf-8"
+
+  ## Data format to consume.
+  ## Each data format has its own unique set of configuration options, read
+  ## more about them here:
+  ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
+  data_format = "influx"
+
+
+  ## Name a tag containing the name of the file the data was parsed from. Leave empty
+  ## to disable. Cautious when file name variation is high, this can increase the cardinality
+  ## significantly. Read more about cardinality here:
+  ## https://docs.influxdata.com/influxdb/cloud/reference/glossary/#series-cardinality
+  # file_tag = ""
+
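Usage note: the new workflow is manual-only (workflow_dispatch), so a run has to be started by hand. A minimal sketch of how to trigger it, assuming the GitHub CLI is installed and authenticated against this repository (the workflow file name comes from this patch; the branch name is only an example):

    # Start the performance workflow on a chosen branch, then follow the run
    gh workflow run performance.yaml --ref main
    gh run watch

The Telegraf upload step can also be exercised locally with `nix run .#ci-telegraf` once a scenario has written *.influx files into the directory that WT_METRICS_DIR points at.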