From 8d23fa510d9aa7abf5f33fc01fa92d6157e54b1c Mon Sep 17 00:00:00 2001 From: Callum Dunster Date: Thu, 12 Sep 2024 15:14:18 +0200 Subject: [PATCH] chore(ci): use a job matrix instead of a single job --- .github/workflows/performance.yaml | 147 ++------------------ telegraf/runner-telegraf.conf | 216 +++++++++++++++++++++++++++++ 2 files changed, 228 insertions(+), 135 deletions(-) create mode 100644 telegraf/runner-telegraf.conf diff --git a/.github/workflows/performance.yaml b/.github/workflows/performance.yaml index 55d13f26..9757fbdd 100644 --- a/.github/workflows/performance.yaml +++ b/.github/workflows/performance.yaml @@ -5,151 +5,28 @@ on: branches: [ main, develop ] workflow_dispatch: +env: + INFLUX_TOKEN: ${{ secrets.INFLUX_TOKEN }} + WT_METRICS_DIR: "${{ github.workspace }}/telegraf/metrics" + jobs: test: runs-on: [self-hosted, wind-tunnel] + strategy: + fail-fast: false + matrix: + scenario: [ zome_call_single_value ] steps: - uses: actions/checkout@v4 - - name: Smoke test - zome_call_single_value - run: | - # Start a sandbox conductor and run it in the background - nix develop .#ci -c bash -c "hc s clean && echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &" - - RUST_LOG=info nix run .#zome_call_single_value -- --connection-string ws://localhost:8888 --duration 120 --no-progress --reporter influx-file - - pkill hc && pkill holochain && pkill lair-keystore - - - name: Smoke test - single_write_many_read - run: | - # Start a sandbox conductor and run it in the background - nix develop .#ci -c bash -c "hc s clean && echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &" - - RUST_LOG=info nix run .#single_write_many_read -- --connection-string ws://localhost:8888 --duration 120 --no-progress --reporter influx-file - - pkill hc && pkill holochain && pkill lair-keystore - - - name: Smoke test - dht_sync_lag - run: | - # Start a sandbox conductor and run it in the background - nix develop .#ci -c bash -c "hc s 
clean && echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &" - - RUST_LOG=info nix run .#dht_sync_lag -- --connection-string ws://localhost:8888 --agents 2 --behaviour write:1 --behaviour record_lag:1 --duration 120 --no-progress --reporter influx-file - - pkill hc && pkill holochain && pkill lair-keystore - - - name: Smoke test - app_install - run: | - # Start a sandbox conductor and run it in the background - nix develop .#ci -c bash -c "hc s clean && echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &" - - RUST_LOG=info nix run .#app_install -- --connection-string ws://localhost:8888 --agents 2 --behaviour minimal:1 --behaviour large:1 --duration 120 --no-progress --reporter influx-file - - pkill hc && pkill holochain && pkill lair-keystore - - - name: Smoke test - first_call + - name: Smoke test - ${{ matrix.scenario }} run: | # Start a sandbox conductor and run it in the background nix develop .#ci -c bash -c "hc s clean && echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &" - RUST_LOG=info nix run .#first_call -- --connection-string ws://localhost:8888 --agents 1 --behaviour local:1 --duration 120 --no-progress --reporter influx-file + RUST_LOG=info nix run .#${{ matrix.scenario }} -- --connection-string ws://localhost:8888 --duration 120 --no-progress --reporter influx-file pkill hc && pkill holochain && pkill lair-keystore - - name: Smoke test - write_read - run: | - # Start a sandbox conductor and run it in the background - nix develop .#ci -c bash -c "hc s clean && echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &" - - RUST_LOG=info nix run .#write_read -- --connection-string ws://localhost:8888 --duration 120 --no-progress --reporter influx-file - - pkill hc && pkill holochain && pkill lair-keystore - - - name: Smoke test - write_query - run: | - # Start a sandbox conductor and run it in the background - nix develop .#ci -c bash -c "hc s clean && 
echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &" - - RUST_LOG=info nix run .#write_query -- --connection-string ws://localhost:8888 --duration 120 --no-progress --reporter influx-file - - pkill hc && pkill holochain && pkill lair-keystore - - - name: Smoke test - local_signals - run: | - # Start a sandbox conductor and run it in the background - nix develop .#ci -c bash -c "hc s clean && echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &" - - RUST_LOG=info nix run .#local_signals -- --connection-string ws://localhost:8888 --duration 120 --no-progress --reporter influx-file - - pkill hc && pkill holochain && pkill lair-keystore - - - name: Smoke test - write_validated - run: | - # Start a sandbox conductor and run it in the background - nix develop .#ci -c bash -c "hc s clean && echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &" - - RUST_LOG=info nix run .#write_validated -- --connection-string ws://localhost:8888 --duration 120 --no-progress --reporter influx-file - - pkill hc && pkill holochain && pkill lair-keystore - - - name: Smoke test - trycp_write_validated - run: | - set -x - - # Start local network services - nix develop .#ci -c bash -c "hc-run-local-services --bootstrap-port 4422 --signal-port 4423 &" - # Start a TryCP instance - nix develop .#ci -c bash -c "source ./scripts/trycp.sh && start_trycp &" - - RUST_LOG=warn CONDUCTOR_CONFIG="CI" MIN_PEERS=2 nix run .#trycp_write_validated -- --targets targets-ci.yaml --instances-per-target 2 --duration 120 --no-progress --reporter influx-file - - # Stop the TryCP instance - nix develop .#ci -c bash -c "source ./scripts/trycp.sh && stop_trycp" - # Stop local network services - pkill hc-run-local - - - name: Smoke test - remote_call_rate - run: | - set -x - - # Start local network services - nix develop .#ci -c bash -c "hc-run-local-services --bootstrap-port 4422 --signal-port 4423 &" - # Start a TryCP instance - nix develop 
.#ci -c bash -c "source ./scripts/trycp.sh && start_trycp &" - - RUST_LOG=warn CONDUCTOR_CONFIG="CI" MIN_PEERS=2 nix run .#remote_call_rate -- --targets targets-ci.yaml --instances-per-target 2 --duration 120 --no-progress --reporter influx-file - - # Stop the TryCP instance - nix develop .#ci -c bash -c "source ./scripts/trycp.sh && stop_trycp" - # Stop local network services - pkill hc-run-local - - - name: Smoke test - two_party_countersigning - run: | - # Start local network services - nix develop .#ci -c bash -c "hc-run-local-services --bootstrap-port 4422 --signal-port 4423 &" - # Start a TryCP instance - nix develop .#ci -c bash -c "source ./scripts/trycp.sh && start_trycp &" - - RUST_LOG=warn CONDUCTOR_CONFIG="CI" MIN_PEERS=2 nix run .#two_party_countersigning -- --targets targets-ci.yaml --behaviour initiate:1 --behaviour participate:1 --instances-per-target 2 --duration 120 --no-progress --reporter influx-file - - # Stop the TryCP instance - nix develop .#ci -c bash -c "source ./scripts/trycp.sh && stop_trycp" - # Stop local network services - pkill hc-run-local - - - name: Smoke test - validation_receipts - run: | - set -x - - # Start local network services - nix develop .#ci -c bash -c "hc-run-local-services --bootstrap-port 4422 --signal-port 4423 &" - # Start a TryCP instance - nix develop .#ci -c bash -c "source ./scripts/trycp.sh && start_trycp &" - - RUST_LOG=warn CONDUCTOR_CONFIG="CI" MIN_PEERS=2 nix run .#validation_receipts -- --targets targets-ci.yaml --instances-per-target 2 --duration 120 --no-progress --reporter influx-file - - # Stop the TryCP instance - nix develop .#ci -c bash -c "source ./scripts/trycp.sh && stop_trycp" - # Stop local network services - pkill hc-run-local + - name: Run Telegraf to upload influx metrics + run: nix run nixpkgs#telegraf -- --config telegraf/runner-telegraf.conf --once diff --git a/telegraf/runner-telegraf.conf b/telegraf/runner-telegraf.conf new file mode 100644 index 00000000..7486953a --- /dev/null +++ 
b/telegraf/runner-telegraf.conf @@ -0,0 +1,216 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply surround +# them with ${}. For strings the variable must be within quotes (ie, "${STR_VAR}"), +# for numbers and booleans they should be plain (ie, ${INT_VAR}, ${BOOL_VAR}) + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "10s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## Maximum number of unwritten metrics per output. Increasing this value + ## allows for longer periods of output downtime without dropping metrics at the + ## cost of higher maximum memory usage. + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Collection offset is used to shift the collection by the given amount. + ## This can be used to avoid many plugins querying constrained devices + ## at the same time by manually scheduling them in time. + # collection_offset = "0s" + + ## Default flushing interval for all outputs.
Maximum flush_interval will be + ## flush_interval + flush_jitter + flush_interval = "10s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## Collected metrics are rounded to the precision specified. Precision is + ## specified as an interval with an integer + unit (e.g. 0s, 10ms, 2us, 4s). + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + ## + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s: + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + precision = "0s" + + ## Log at debug level. + # debug = false + ## Log only error level messages. + # quiet = false + + ## Log target controls the destination for logs and can be one of "file", + ## "stderr" or, on Windows, "eventlog". When set to "file", the output file + ## is determined by the "logfile" setting. + # logtarget = "file" + + ## Name of the file to be logged to when using the "file" logtarget. If set to + ## the empty string then logs are written to stderr. + # logfile = "" + + ## The logfile will be rotated after the time interval specified. When set + ## to 0 no time based rotation is performed. Logs are rotated only when + ## written to, if there is no log activity rotation may be delayed. + # logfile_rotation_interval = "0h" + + ## The logfile will be rotated when it becomes larger than the specified + ## size. When set to 0 no size based rotation is performed. + # logfile_rotation_max_size = "0MB" + + ## Maximum number of rotated archives to keep, any older logs are deleted. 
+ ## If set to -1, no archives are removed. + # logfile_rotation_max_archives = 5 + + ## Pick a timezone to use when logging or type 'local' for local time. + ## Example: America/Chicago + # log_with_timezone = "" + + ## Override default hostname, if empty use os.Hostname() + hostname = "" + ## If set to true, do not set the "host" tag in the telegraf agent. + omit_hostname = false + + ## Method of translating SNMP objects. Can be "netsnmp" (deprecated) which + ## translates by calling external programs snmptranslate and snmptable, + ## or "gosmi" which translates using the built-in gosmi library. + # snmp_translator = "netsnmp" + + ## Name of the file to load the state of plugins from and store the state to. + ## If uncommented and not empty, this file will be used to save the state of + ## stateful plugins on termination of Telegraf. If the file exists on start, + ## the state in the file will be restored for the plugins. + # statefile = "" + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + + +# Configuration for sending metrics to InfluxDB 2.0 +[[outputs.influxdb_v2]] + ## The URLs of the InfluxDB cluster nodes. + ## + ## Multiple URLs can be specified for a single cluster, only ONE of the + ## urls will be written to each interval. + ## ex: urls = ["https://us-west-2-1.aws.cloud2.influxdata.com"] + urls = ["https://ifdb.holochain.org"] + + ## Token for authentication. + token = "${INFLUX_TOKEN}" + + ## Organization is the name of the organization you wish to write to. + organization = "holo" + + ## Destination bucket to write into. + bucket = "windtunnel" + + ## The value of this tag will be used to determine the bucket. If this + ## tag is not set the 'bucket' option is used as the default. + # bucket_tag = "" + + ## If true, the bucket tag will not be added to the metric.
+ # exclude_bucket_tag = false + + ## Timeout for HTTP messages. + # timeout = "5s" + + ## Additional HTTP headers + # http_headers = {"X-Special-Header" = "Special-Value"} + + ## HTTP Proxy override, if unset, the standard proxy environment + ## variables are consulted to determine which proxy, if any, should be used. + # http_proxy = "http://corporate.proxy:3128" + + ## HTTP User-Agent + # user_agent = "telegraf" + + ## Content-Encoding for write request body, can be set to "gzip" to + ## compress body or "identity" to apply no encoding. + # content_encoding = "gzip" + + ## Enable or disable uint support for writing uints to InfluxDB 2.0. + # influx_uint_support = false + + ## HTTP/2 Timeouts + ## The following values control the HTTP/2 client's timeouts. These settings + ## are generally not required unless a user is seeing issues with client + ## disconnects. If a user does see issues, then it is suggested to set these + ## values to "15s" for ping timeout and "30s" for read idle timeout and + ## retry. + ## + ## Note that the timer for read_idle_timeout begins at the end of the last + ## successful write and not at the beginning of the next write. + # ping_timeout = "0s" + # read_idle_timeout = "0s" + + ## Optional TLS Config for use on HTTP connections. + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + + +[[inputs.file]] + ## Files to parse each interval. Accept standard unix glob matching rules, + ## as well as ** to match recursive files and directories. + files = ["${WT_METRICS_DIR}/*.influx"] + + ## Character encoding to use when interpreting the file contents.
Invalid + ## characters are replaced using the unicode replacement character. When set + ## to the empty string the data is not decoded to text. + ## ex: character_encoding = "utf-8" + ## character_encoding = "utf-16le" + ## character_encoding = "utf-16be" + ## character_encoding = "" + character_encoding = "utf-8" + + ## Data format to consume. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" + + + ## Name a tag containing the name of the file the data was parsed from. Leave empty + ## to disable. Be cautious when file name variation is high, this can increase the cardinality + ## significantly. Read more about cardinality here: + ## https://docs.influxdata.com/influxdb/cloud/reference/glossary/#series-cardinality + # file_tag = "" +