Complete redo of workflow, switch to compute-benchmarks
ianayl committed Sep 6, 2024
1 parent 6d14a32 commit 940e3be
Showing 8 changed files with 356 additions and 106 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/sycl-linux-run-tests.yml
@@ -367,7 +367,7 @@ jobs:
- name: Run sycl-bench microbenchmarks
id: run_benchmarks
if: inputs.tests_selector == 'benchmark'
run: ./devops/scripts/sycl-bench.sh https://github.com/ianayl/sycl-bench
run: ./devops/scripts/benchmarking/benchmark.sh
- name: Upload sycl-bench microbenchmark results
if: inputs.tests_selector == 'benchmark' && steps.run_benchmarks.outcome == 'success'
uses: actions/upload-artifact@v4
70 changes: 70 additions & 0 deletions devops/scripts/benchmarking/aggregate.py
@@ -0,0 +1,70 @@
import csv
import sys
from pathlib import Path
import heapq

import common

class StreamingMedian:

    def __init__(self):
        # Values larger than the current median live in a min-heap
        self.minheap_larger = []
        # Values smaller than the current median live here, stored negated,
        # since heapq only provides a min-heap
        self.maxheap_smaller = []

def add(self, n: float):
if len(self.maxheap_smaller) == 0 or -self.maxheap_smaller[0] >= n:
heapq.heappush(self.maxheap_smaller, -n)
else:
heapq.heappush(self.minheap_larger, n)

if len(self.maxheap_smaller) > len(self.minheap_larger) + 1:
heapq.heappush(self.minheap_larger,
-heapq.heappop(self.maxheap_smaller))
elif len(self.maxheap_smaller) < len(self.minheap_larger):
heapq.heappush(self.maxheap_smaller,
-heapq.heappop(self.minheap_larger))

def get_median(self) -> float:
if len(self.maxheap_smaller) == len(self.minheap_larger):
return (-self.maxheap_smaller[0] + self.minheap_larger[0]) / 2.0
else:
return -self.maxheap_smaller[0]


def aggregate_median(benchmark: str):

    def csv_samples() -> list[Path]:
        # TODO check that the path below is a valid directory
        cache_dir = Path(f"{common.PERF_RES_PATH}/{benchmark}")
        # TODO check for time range; What time range do I want?
        return [f for f in cache_dir.glob(f"{benchmark}-*.csv") if f.is_file()]

# Calculate median of every desired metric:
aggregate_s = dict()
for sample_path in csv_samples():
with open(sample_path, mode='r') as sample_file:
for s in csv.DictReader(sample_file):
if s["TestCase"] not in aggregate_s:
aggregate_s[s["TestCase"]] = \
{ metric: StreamingMedian() for metric in common.metrics_variance }
for metric in common.metrics_variance:
aggregate_s[s["TestCase"]][metric].add(common.sanitize(s[metric]))

with open(f"{common.PERF_RES_PATH}/{benchmark}/{benchmark}-median.csv", 'w') as output_csv:
writer = csv.DictWriter(output_csv,
fieldnames=["TestCase", *common.metrics_variance.keys()])
writer.writeheader()
for test_case in aggregate_s:
writer.writerow({ "TestCase": test_case } |
{ metric: aggregate_s[test_case][metric].get_median()
for metric in common.metrics_variance })


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <benchmark name>")
        exit(1)
    common.load_configs()
    aggregate_median(sys.argv[1])
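A quick standalone sketch (not part of this commit) of how the StreamingMedian helper above behaves, assuming aggregate.py is importable:

    from aggregate import StreamingMedian

    sm = StreamingMedian()
    for x in [3.0, 1.0, 4.0, 1.5, 9.0]:
        sm.add(x)
    print(sm.get_median())  # 3.0 -- median of the five samples seen so far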
26 changes: 26 additions & 0 deletions devops/scripts/benchmarking/benchmark-ci.conf
@@ -0,0 +1,26 @@
# Git branch settings for llvm-ci-perf-results
PERF_RES_GIT_REPO="https://github.com/intel-sandbox/llvm-ci-perf-results"
PERF_RES_BRANCH="test-compute-bench"
# Path where llvm-ci-perf-results are cloned
PERF_RES_PATH="./llvm-ci-perf-res"

# Git branch settings for compute-benchmarks
COMPUTE_BENCH_GIT_REPO="https://github.com/ianayl/compute-benchmarks"
COMPUTE_BENCH_BRANCH="update-sycl"

# Path to compile and build compute-benchmarks
COMPUTE_BENCH_PATH="./compute-benchmarks"

# Path to temporarily store compute-benchmark results
OUTPUT_PATH="."

# Metrics to check for regressions, and the allowed variance for each, as a Python dict
METRICS_VARIANCE='{"Median": 0.5}'
#METRICS_VARIANCE='{"Median": 0.5, "StdDev": 4.0}'

# Metrics to record using aggregate.py
METRICS_RECORDED='["Median", "StdDev"]'

# Minimum number of cached result files required before regression checking is performed
AVERAGE_THRESHOLD=7
# TODO reconsider this
170 changes: 170 additions & 0 deletions devops/scripts/benchmarking/benchmark.sh
@@ -0,0 +1,170 @@
#!/bin/sh

#
# benchmark.sh: Benchmark dpcpp using compute-benchmarks
#

usage () {
    >&2 echo "Usage: $0 [-p <compute-benchmarks path>] [-r <compute-benchmarks git repo>] [-b <compute-benchmarks branch>]
	-p  Path to clone and build compute-benchmarks in
	-r  Git repository to clone compute-benchmarks from
	-b  Branch of compute-benchmarks to check out
This script builds and runs benchmarks from compute-benchmarks."
    exit 1
}

clone_perf_res() {
echo "### Cloning llvm-ci-perf-res ($PERF_RES_GIT_REPO:$PERF_RES_BRANCH) ###"
mkdir -p "$(dirname $PERF_RES_PATH)"
    git clone -b "$PERF_RES_BRANCH" "$PERF_RES_GIT_REPO" "$PERF_RES_PATH" || exit $?
}

clone_compute_bench() {
echo "### Cloning compute-benchmarks ($COMPUTE_BENCH_GIT_REPO:$COMPUTE_BENCH_BRANCH) ###"
mkdir -p "$(dirname $COMPUTE_BENCH_PATH)"
    git clone -b "$COMPUTE_BENCH_BRANCH" \
        --recurse-submodules "$COMPUTE_BENCH_GIT_REPO" \
        "$COMPUTE_BENCH_PATH" || exit $?
}

build_compute_bench() {
echo "### Building compute-benchmarks ($COMPUTE_BENCH_GIT_REPO:$COMPUTE_BENCH_BRANCH) ###"
mkdir $COMPUTE_BENCH_PATH/build && cd $COMPUTE_BENCH_PATH/build &&
cmake .. -DBUILD_SYCL=ON && cmake --build .
compute_bench_build_stat=$?
cd -
[ "$compute_bench_build_stat" -ne 0 ] && exit $compute_bench_build_stat
}
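
# Note: print_bench_res below relies on a get_csv_col_index helper that is not
# defined anywhere in this commit; a minimal sketch of what it is assumed to
# look like (sets tmp_csv_col_i to the 1-based index of a named CSV column):
get_csv_col_index() {
    # Usage: get_csv_col_index <csv file> <column name>
    tmp_csv_col_i="$(head -n 1 "$1" | tr ',' '\n' | grep -nx "$2" | cut -d: -f1)"
}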

print_bench_res() {
# Usage: print_bench_res <benchmark output .csv file> <benchmark status code> <summary file>
if [ ! -s $1 ]; then
printf "NO OUTPUT! (Status $2)\n" | tee -a $3
return # Do not proceed if file is empty
fi

get_csv_col_index $1 run-time-mean
tmp_run_time_mean_i=$tmp_csv_col_i
get_csv_col_index $1 run-time-median
tmp_run_time_median_i=$tmp_csv_col_i
get_csv_col_index $1 run-time-throughput
tmp_run_time_throughput_i=$tmp_csv_col_i

# `sycl-bench` output seems to like inserting the header multiple times.
# Here we cache the header to make sure it prints only once:
tmp_header_title="$(cat $1 | head -n 1 | sed 's/^\# Benchmark name/benchmark/')"
tmp_result="$(cat $1 | grep '^[^\#]')"

printf "%s\n%s" "$tmp_header_title" "$tmp_result" \
| awk -F',' -v me="$tmp_run_time_mean_i" \
-v md="$tmp_run_time_median_i" \
-v th="$tmp_run_time_throughput_i" \
'{printf "%-57s %-13s %-15s %-20s\n", $1, $me, $md, $th }' \
| tee -a $3 # Print to summary file
}

###
STATUS_SUCCESS=0
STATUS_FAILED=1
###

samples_under_threshold () {
mkdir -p $1
file_count="$(find $1 -maxdepth 1 -type f | wc -l )"
[ "$file_count" -lt "$AVERAGE_THRESHOLD" ]
}

check_regression() {
if samples_under_threshold "$PERF_RES_PATH/$1"; then
echo "Not enough samples to construct an average, performance check skipped!"
return $STATUS_SUCCESS
fi
BENCHMARKING_ROOT="$BENCHMARKING_ROOT" python "$BENCHMARKING_ROOT/compare.py" "$1" "$2"
return $?
# return $STATUS_FAILED
}

cache() {
mv "$2" "$PERF_RES_PATH/$1/"
}

# Check for a regression, and cache if no regression found
check_and_cache() {
echo "Checking $testcase..."
if check_regression $1 $2; then
echo "Caching $testcase..."
cache $1 $2
else
echo "Not caching!"
fi
}

process_benchmarks() {
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"
mkdir -p "$PERF_RES_PATH"

echo "### Running and processing selected benchmarks ###"
if [ -z "$TESTS_CONFIG" ]; then
echo "Setting tests to run via cli is not currently supported."
exit $STATUS_FAILED
else
# Ignore lines in the test config starting with #'s
grep "^[^#]" "$TESTS_CONFIG" | while read -r testcase; do
echo "# Running $testcase..."
test_csv_output="$OUTPUT_PATH/$testcase-$TIMESTAMP.csv"
            # tail -n +8 filters out initial debug prints that are not in csv format
            "$COMPUTE_BENCH_PATH/build/bin/$testcase" --csv | tail -n +8 > "$test_csv_output"
            if [ "$?" -eq 0 ] && [ -s "$test_csv_output" ]; then
                check_and_cache "$testcase" "$test_csv_output"
            else
                echo "ERROR @ $testcase"
fi
done
fi
}

cleanup() {
rm -r $COMPUTE_BENCH_PATH
}

load_configs() {
# This script needs to know where the "BENCHMARKING_ROOT" directory is,
# containing all the configuration files and the compare script.
#
# If this is not provided, this function tries to guess where the files
# are based on how the script is called, and verifies that all necessary
# configs and scripts are reachable.
[ -z "$BENCHMARKING_ROOT" ] && BENCHMARKING_ROOT="$(dirname $0)"

BENCHMARK_CI_CONFIG="$BENCHMARKING_ROOT/benchmark-ci.conf"
TESTS_CONFIG="$BENCHMARKING_ROOT/enabled_tests.conf"
COMPARE_PATH="$BENCHMARKING_ROOT/compare.py"

for file in "$BENCHMARK_CI_CONFIG" "$TESTS_CONFIG" "$COMPARE_PATH"; do
if [ ! -f "$file" ]; then
echo "$(basename $file) not found, please provide path to BENCHMARKING_ROOT."
exit -1
fi
done

. $BENCHMARK_CI_CONFIG
}

load_configs

# CLI overrides for configuration options
while getopts "p:b:r:" opt; do
case $opt in
p) COMPUTE_BENCH_PATH=$OPTARG ;;
r) COMPUTE_BENCH_GIT_REPO=$OPTARG ;;
b) COMPUTE_BENCH_BRANCH=$OPTARG ;;
\?) usage ;;
esac
done

[ ! -d "$PERF_RES_PATH" ] && clone_perf_res
[ ! -d "$COMPUTE_BENCH_PATH" ] && clone_compute_bench
[ ! -d "$COMPUTE_BENCH_PATH/build" ] && build_compute_bench
process_benchmarks
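For reference, a hypothetical invocation of the script above using the CLI overrides (the paths and branch name are illustrative, mirroring the defaults in benchmark-ci.conf):

    BENCHMARKING_ROOT=./devops/scripts/benchmarking \
        ./devops/scripts/benchmarking/benchmark.sh -p ./compute-benchmarks -b update-sycl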
43 changes: 43 additions & 0 deletions devops/scripts/benchmarking/common.py
@@ -0,0 +1,43 @@
import os
import re
import ast

PERF_RES_PATH, metrics_variance, metrics_recorded = None, None, None

def sanitize(stat: str) -> float:
# Get rid of %
if stat[-1] == '%':
stat = stat[:-1]
return float(stat)


def load_configs():
BENCHMARKING_ROOT = os.getenv("BENCHMARKING_ROOT")
if BENCHMARKING_ROOT is None:
        # Fall back to guessing BENCHMARKING_ROOT from this script's location
BENCHMARKING_ROOT = os.path.dirname(os.path.abspath(__file__))

benchmarking_ci_conf_path = f"{BENCHMARKING_ROOT}/benchmark-ci.conf"
if not os.path.isfile(benchmarking_ci_conf_path):
        raise Exception("Please provide a path to a valid BENCHMARKING_ROOT.")

global PERF_RES_PATH, metrics_variance, metrics_recorded
perf_res_re = re.compile(r'^PERF_RES_PATH=(.*)$', re.M)
m_variance_re = re.compile(r'^METRICS_VARIANCE=(.*)$', re.M)
m_recorded_re = re.compile(r'^METRICS_RECORDED=(.*)$', re.M)

with open(benchmarking_ci_conf_path, 'r') as configs_file:
configs_str = configs_file.read()

for m_variance in m_variance_re.findall(configs_str):
metrics_variance = ast.literal_eval(m_variance.strip()[1:-1])
if not isinstance(metrics_variance, dict):
raise TypeError("Error in benchmark-ci.conf: METRICS_VARIANCE is not a python dict.")

for m_recorded in m_recorded_re.findall(configs_str):
metrics_recorded = ast.literal_eval(m_recorded.strip()[1:-1])
if not isinstance(metrics_recorded, list):
raise TypeError("Error in benchmark-ci.conf: METRICS_RECORDED is not a python list.")

for perf_res in perf_res_re.findall(configs_str):
PERF_RES_PATH = str(perf_res[1:-1])
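As a rough sanity check of the parsing above (not part of the commit): with the benchmark-ci.conf shown earlier, calling load_configs() would be expected to leave the module globals as

    import common
    common.load_configs()
    # common.PERF_RES_PATH    == "./llvm-ci-perf-res"
    # common.metrics_variance == {"Median": 0.5}
    # common.metrics_recorded == ["Median", "StdDev"]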
43 changes: 43 additions & 0 deletions devops/scripts/benchmarking/compare.py
@@ -0,0 +1,43 @@
import csv
import sys
from pathlib import Path

import common

# TODO compare_to(metric) instead?
def compare_to_median(test_name: str, test_csv_path: str):
median = dict()
with open(f"{common.PERF_RES_PATH}/{test_name}/{test_name}-median.csv", mode='r') as median_csv:
for stat in csv.DictReader(median_csv):
median[stat["TestCase"]] = \
{ metric: float(stat[metric]) for metric in common.metrics_variance }

# TODO read status codes from a config file
status = 0
failure_counts = { metric: 0 for metric in common.metrics_variance }
with open(test_csv_path, mode='r') as sample_csv:
for sample in csv.DictReader(sample_csv):
# Ignore test cases we haven't profiled before
if sample["TestCase"] not in median:
continue
test_median = median[sample["TestCase"]]
for metric, threshold in common.metrics_variance.items():
max_tolerated = test_median[metric] * (1 + threshold)
if common.sanitize(sample[metric]) > max_tolerated:
print("vvv FAILED vvv")
print(sample['TestCase'])
print(f"{metric}: {metric} {common.sanitize(sample[metric])} -- Historic avg. {test_median[metric]} (max tolerance {threshold*100}% -- {max_tolerated})")
print("^^^^^^^^^^^^^^")
status = 1
failure_counts[metric] += 1
if status != 0:
print(f"Failure counts: {failure_counts}")
return status


if __name__ == "__main__":
if len(sys.argv) < 3:
print(f"Usage: {sys.argv[0]} <test name> <test csv path>")
exit(-1)
common.load_configs()
exit(compare_to_median(sys.argv[1], sys.argv[2]))
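A worked example of the tolerance check above, with made-up numbers (a sketch, not part of the commit):

    threshold = 0.5            # from METRICS_VARIANCE='{"Median": 0.5}'
    historic_median = 100.0    # hypothetical cached median for some test case
    max_tolerated = historic_median * (1 + threshold)   # 150.0
    # A new sample of 140.0 passes; 160.0 would be flagged as a regression
    assert not 140.0 > max_tolerated
    assert 160.0 > max_tolerated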
3 changes: 3 additions & 0 deletions devops/scripts/benchmarking/enabled_tests.conf
@@ -0,0 +1,3 @@
# Test cases to be enabled:
api_overhead_benchmark_sycl
memory_benchmark_sycl