Add self-hosted runner and manual execution of performance tests (#88)
* chore(ci): add performance tests workflow with manual trigger

* chore(ci): set the tests reporter to be influx-file

* chore(ci): update duration of performance tests to 120 seconds

* chore(ci): add extra-args option for scenarios that need it

* chore(ci): add printing of telegraf errors and warnings

* chore(ci): get Run ID from logs and set as output and summary

* chore(ci): add step to upload logs as artifacts

* chore(ci): add link to logs in job summary

* chore(ci): increase log level of TryCP tests to info

* chore(ci): add script to flake to call telegraf as expected in the CI
This is so that we use the telegraf version from the lockfile.

* docs(ci): remove out-dated TODO

* chore(ci): split the app_install scenario in two
This tests installing the minimally-small and large hApps separately.
cdunster authored Sep 17, 2024
1 parent ed10372 commit aadc606
Showing 4 changed files with 373 additions and 1 deletion.
152 changes: 152 additions & 0 deletions .github/workflows/performance.yaml
@@ -0,0 +1,152 @@
name: "Performance Tests"

on:
workflow_dispatch:

env:
INFLUX_TOKEN: ${{ secrets.INFLUX_TOKEN }}
WT_METRICS_DIR: "${{ github.workspace }}/telegraf/metrics"

jobs:
local-test:
runs-on: [self-hosted, wind-tunnel]
strategy:
fail-fast: false
matrix:
# To run a local test with default configuration, add the scenario name to this array.
scenario: [ zome_call_single_value, single_write_many_read, write_read, write_query, local_signals, write_validated ]
# To run a local test with additional configuration, add the scenario name and `extra-args` as an `include` item.
include:
- scenario: dht_sync_lag
extra-args: "--agents 2 --behaviour write:1 --behaviour record_lag:1"

# Test how long it takes to install a minimally-small hApp.
- scenario: app_install
extra-args: "--behaviour minimal:1"

# Test how long it takes to install a large hApp.
- scenario: app_install
extra-args: "--behaviour large:1"

- scenario: first_call
extra-args: "--agents 1 --behaviour local:1"
steps:
- uses: actions/checkout@v4

- name: Run - ${{ matrix.scenario }}
id: run_test
run: |
# Start a sandbox conductor and run it in the background
nix develop .#ci -c bash -c "hc s clean && echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &"
RUST_LOG=info nix run .#${{ matrix.scenario }} -- --connection-string ws://localhost:8888 --duration 120 --no-progress --reporter influx-file ${{ matrix.extra-args }} > >(tee logs/scenario-stdout.log) 2> >(tee logs/scenario-stderr.log >&2)
pkill hc && pkill holochain && pkill lair-keystore
RUN_ID=$(grep -m1 "#RunId" logs/scenario-stdout.log | sed 's/#RunId: \[\(.\+\)\]/\1/')
echo "RUN_ID=$RUN_ID" >> "$GITHUB_OUTPUT"
echo "# Run ID: $RUN_ID" >> $GITHUB_STEP_SUMMARY
- name: Run Telegraf to upload influx metrics
run: |
if ! nix run .#ci-telegraf
then
echo "::group::Telegraf errors"
status=1
# Print errors as such in GitHub logs.
grep "E!" logs/telegraf-stderr.log | xargs -l echo "::error ::"
echo "::endgroup::"
fi
echo "::group::Telegraf warnings"
# Print warnings as such in GitHub logs.
grep "W!" logs/telegraf-stderr.log | xargs -l echo "::warning ::"
echo "::endgroup::"
exit ${status-0}
- name: Upload logs as artifacts
if: success() || failure()
id: upload-artifact
uses: actions/upload-artifact@v4
with:
name: "logs_${{ matrix.scenario }}_${{ steps.run_test.outputs.RUN_ID }}"
path: |
logs/scenario-stdout.log
logs/scenario-stderr.log
logs/telegraf-stdout.log
logs/telegraf-stderr.log
- name: Output Path to logs in summary
run: |
echo "# Logs: [${{ steps.upload-artifact.outputs.artifact-id }}](${{ steps.upload-artifact.outputs.artifact-url }})" >> $GITHUB_STEP_SUMMARY
trycp-test:
runs-on: [self-hosted, wind-tunnel]
strategy:
fail-fast: false
matrix:
# To run a test with TryCP and default configuration, add the scenario name to this array.
scenario: [ trycp_write_validated, remote_call_rate, validation_receipts ]
# To run a test with TryCP and additional configuration, add the scenario name and `extra-args` as an `include` item.
include:
- scenario: two_party_countersigning
extra-args: "--behaviour initiate:1 --behaviour participate:1"
steps:
- uses: actions/checkout@v4

- name: Run - ${{ matrix.scenario }}
id: run_test
run: |
set -x
# Start local network services
nix develop .#ci -c bash -c "hc-run-local-services --bootstrap-port 4422 --signal-port 4423 &"
# Start a TryCP instance
nix develop .#ci -c bash -c "source ./scripts/trycp.sh && start_trycp &"
RUST_LOG=info CONDUCTOR_CONFIG="CI" TRYCP_RUST_LOG="info" MIN_PEERS=2 nix run .#${{ matrix.scenario }} -- --targets targets-ci.yaml --instances-per-target 2 --duration 120 --no-progress --reporter influx-file ${{ matrix.extra-args }} > >(tee logs/scenario-stdout.log) 2> >(tee logs/scenario-stderr.log >&2)
# Stop the TryCP instance
nix develop .#ci -c bash -c "source ./scripts/trycp.sh && stop_trycp"
# Stop local network services
pkill hc-run-local
RUN_ID=$(grep -m1 "#RunId" logs/scenario-stdout.log | sed 's/#RunId: \[\(.\+\)\]/\1/')
echo "RUN_ID=$RUN_ID" >> "$GITHUB_OUTPUT"
echo "# Run ID: $RUN_ID" >> $GITHUB_STEP_SUMMARY
- name: Run Telegraf to upload influx metrics
run: |
if ! nix run .#ci-telegraf
then
echo "::group::Telegraf errors"
status=1
# Print errors as such in GitHub logs.
grep "E!" logs/telegraf-stderr.log | xargs -l echo "::error ::"
echo "::endgroup::"
fi
echo "::group::Telegraf warnings"
# Print warnings as such in GitHub logs.
grep "W!" logs/telegraf-stderr.log | xargs -l echo "::warning ::"
echo "::endgroup::"
exit ${status-0}
- name: Upload logs as artifacts
if: success() || failure()
id: upload-artifact
uses: actions/upload-artifact@v4
with:
name: "logs_${{ matrix.scenario }}_${{ steps.run_test.outputs.RUN_ID }}"
path: |
logs/scenario-stdout.log
logs/scenario-stderr.log
logs/telegraf-stdout.log
logs/telegraf-stderr.log
logs/${{ steps.run_test.outputs.RUN_ID }}/
- name: Output Path to logs in summary
run: |
echo "# Logs: [${{ steps.upload-artifact.outputs.artifact-id }}](${{ steps.upload-artifact.outputs.artifact-url }})" >> $GITHUB_STEP_SUMMARY
1 change: 0 additions & 1 deletion .github/workflows/test.yaml
@@ -72,7 +72,6 @@ jobs:
# Start a sandbox conductor and run it in the background
nix develop .#ci -c bash -c "hc s clean && echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &"
# TODO using `localhost` is resolving to an IPv6 address, but why is that giving a connection refused?
# Run the scenario for 5 seconds
RUST_LOG=info nix run .#zome_call_single_value -- --connection-string ws://localhost:8888 --duration 5 --no-progress
5 changes: 5 additions & 0 deletions flake.nix
@@ -111,6 +111,11 @@
packages = {
default = config.workspace.workspace;
inherit (config.workspace) workspace;
ci-telegraf = pkgs.writeShellApplication {
name = "ci-telegraf";
runtimeInputs = [ pkgs.telegraf ];
text = "telegraf --config telegraf/runner-telegraf.conf --once > >(tee logs/telegraf-stdout.log) 2> >(tee logs/telegraf-stderr.log >&2)";
};
};

checks = {
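The `ci-telegraf` wrapper defined above is what the workflow's "Run Telegraf to upload influx metrics" steps call via `nix run .#ci-telegraf`, so the Telegraf binary comes from the flake lockfile rather than whatever happens to be installed on the runner. A minimal local invocation might look like the sketch below (the token value is a placeholder; the wrapper assumes it is run from the repository root with a `logs/` directory present, since it tees into `logs/telegraf-*.log`):

export INFLUX_TOKEN="<influx-token>"           # placeholder; provided by a secret in CI
export WT_METRICS_DIR="$PWD/telegraf/metrics"  # matches the workflow's env block
mkdir -p logs "$WT_METRICS_DIR"
nix run .#ci-telegraf                          # single --once pass over ${WT_METRICS_DIR}/*.influx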
216 changes: 216 additions & 0 deletions telegraf/runner-telegraf.conf
@@ -0,0 +1,216 @@
# Telegraf Configuration
#
# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.
#
# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.
#
# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.
#
# Environment variables can be used anywhere in this config file, simply surround
# them with ${}. For strings the variable must be within quotes (ie, "${STR_VAR}"),
# for numbers and booleans they should be plain (ie, ${INT_VAR}, ${BOOL_VAR})


# Configuration for telegraf agent
[agent]
## Default data collection interval for all inputs
interval = "10s"
## Rounds collection interval to 'interval'
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true

## Telegraf will send metrics to outputs in batches of at most
## metric_batch_size metrics.
## This controls the size of writes that Telegraf sends to output plugins.
metric_batch_size = 1000

## Maximum number of unwritten metrics per output. Increasing this value
## allows for longer periods of output downtime without dropping metrics at the
## cost of higher maximum memory usage.
metric_buffer_limit = 1000000

## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
## This can be used to avoid many plugins querying things like sysfs at the
## same time, which can have a measurable effect on the system.
collection_jitter = "0s"

## Collection offset is used to shift the collection by the given amount.
## This can be used to avoid many plugins querying constrained devices
## at the same time by manually scheduling them in time.
# collection_offset = "0s"

## Default flushing interval for all outputs. Maximum flush_interval will be
## flush_interval + flush_jitter
flush_interval = "60s"
## Jitter the flush interval by a random amount. This is primarily to avoid
## large write spikes for users running a large number of telegraf instances.
## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "0s"

## Collected metrics are rounded to the precision specified. Precision is
## specified as an interval with an integer + unit (e.g. 0s, 10ms, 2us, 4s).
## Valid time units are "ns", "us" (or "µs"), "ms", "s".
##
## By default or when set to "0s", precision will be set to the same
## timestamp order as the collection interval, with the maximum being 1s:
## ie, when interval = "10s", precision will be "1s"
## when interval = "250ms", precision will be "1ms"
##
## Precision will NOT be used for service inputs. It is up to each individual
## service input to set the timestamp at the appropriate precision.
precision = "0s"

## Log at debug level.
# debug = false
## Log only error level messages.
# quiet = false

## Log target controls the destination for logs and can be one of "file",
## "stderr" or, on Windows, "eventlog". When set to "file", the output file
## is determined by the "logfile" setting.
# logtarget = "file"

## Name of the file to be logged to when using the "file" logtarget. If set to
## the empty string then logs are written to stderr.
# logfile = ""

## The logfile will be rotated after the time interval specified. When set
## to 0 no time-based rotation is performed. Logs are rotated only when
## written to; if there is no log activity, rotation may be delayed.
# logfile_rotation_interval = "0h"

## The logfile will be rotated when it becomes larger than the specified
## size. When set to 0 no size based rotation is performed.
# logfile_rotation_max_size = "0MB"

## Maximum number of rotated archives to keep, any older logs are deleted.
## If set to -1, no archives are removed.
# logfile_rotation_max_archives = 5

## Pick a timezone to use when logging or type 'local' for local time.
## Example: America/Chicago
# log_with_timezone = ""

## Override default hostname, if empty use os.Hostname()
hostname = ""
## If set to true, do not set the "host" tag in the telegraf agent.
omit_hostname = false

## Method of translating SNMP objects. Can be "netsnmp" (deprecated) which
## translates by calling external programs snmptranslate and snmptable,
## or "gosmi" which translates using the built-in gosmi library.
# snmp_translator = "netsnmp"

## Name of the file to load the state of plugins from and store the state to.
## If uncommented and not empty, this file will be used to save the state of
## stateful plugins on termination of Telegraf. If the file exists on start,
## the state in the file will be restored for the plugins.
# statefile = ""


###############################################################################
# OUTPUT PLUGINS #
###############################################################################


# Configuration for sending metrics to InfluxDB 2.0
[[outputs.influxdb_v2]]
## The URLs of the InfluxDB cluster nodes.
##
## Multiple URLs can be specified for a single cluster; only ONE of the
## urls will be written to each interval.
## ex: urls = ["https://us-west-2-1.aws.cloud2.influxdata.com"]
urls = ["https://ifdb.holochain.org"]

## Token for authentication.
token = "${INFLUX_TOKEN}"

## Organization is the name of the organization you wish to write to.
organization = "holo"

## Destination bucket to write into.
bucket = "windtunnel"

## The value of this tag will be used to determine the bucket. If this
## tag is not set the 'bucket' option is used as the default.
# bucket_tag = ""

## If true, the bucket tag will not be added to the metric.
# exclude_bucket_tag = false

## Timeout for HTTP messages.
# timeout = "5s"

## Additional HTTP headers
# http_headers = {"X-Special-Header" = "Special-Value"}

## HTTP Proxy override; if unset, the standard proxy environment
## variables are consulted to determine which proxy, if any, should be used.
# http_proxy = "http://corporate.proxy:3128"

## HTTP User-Agent
# user_agent = "telegraf"

## Content-Encoding for write request body, can be set to "gzip" to
## compress body or "identity" to apply no encoding.
# content_encoding = "gzip"

## Enable or disable uint support for writing uints to InfluxDB 2.0.
# influx_uint_support = false

## HTTP/2 Timeouts
## The following values control the HTTP/2 client's timeouts. These settings
## are generally not required unless a user is seeing issues with client
## disconnects. If a user does see issues, then it is suggested to set these
## values to "15s" for ping timeout and "30s" for read idle timeout and
## retry.
##
## Note that the timer for read_idle_timeout begins at the end of the last
## successful write and not at the beginning of the next write.
# ping_timeout = "0s"
# read_idle_timeout = "0s"

## Optional TLS Config for use on HTTP connections.
# tls_ca = "/etc/telegraf/ca.pem"
# tls_cert = "/etc/telegraf/cert.pem"
# tls_key = "/etc/telegraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false


###############################################################################
# INPUT PLUGINS #
###############################################################################


[[inputs.file]]
## Files to parse each interval. Accept standard unix glob matching rules,
## as well as ** to match recursive files and directories.
files = ["${WT_METRICS_DIR}/*.influx"]

## Character encoding to use when interpreting the file contents. Invalid
## characters are replaced using the unicode replacement character. When set
## to the empty string the data is not decoded to text.
## ex: character_encoding = "utf-8"
## character_encoding = "utf-16le"
## character_encoding = "utf-16be"
## character_encoding = ""
character_encoding = "utf-8"

## Data format to consume.
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
data_format = "influx"


## Name a tag containing the name of the file the data was parsed from. Leave empty
## to disable. Be cautious when file name variation is high, as this can increase the
## cardinality significantly. Read more about cardinality here:
## https://docs.influxdata.com/influxdb/cloud/reference/glossary/#series-cardinality
# file_tag = ""
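For reference, the `inputs.file` plugin above consumes InfluxDB line-protocol files that the scenarios' `influx-file` reporter writes into `${WT_METRICS_DIR}`. A hypothetical example of creating such a file by hand (the measurement, tag, and field names are made up for illustration):

mkdir -p "$WT_METRICS_DIR"
# Line protocol: measurement,tag=value field=value [timestamp in ns], one metric per line.
cat > "$WT_METRICS_DIR/example.influx" <<'EOF'
zome_call_duration,scenario=zome_call_single_value value_ms=42.0 1726560000000000000
EOF
# `nix run .#ci-telegraf` would then parse this file and forward the metric to InfluxDB.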
