From 3b5454fd7f1a6726b4d2edf75dedccb6410aeaad Mon Sep 17 00:00:00 2001
From: "Li, Ian"
Date: Wed, 18 Sep 2024 13:02:16 -0700
Subject: [PATCH] Fixed aggregate, improved logging

---
 .../workflows/sycl-benchmark-aggregate.yml    |  2 +-
 .github/workflows/sycl-linux-benchmark.yml    |  7 ++--
 devops/scripts/benchmarking/aggregate.py      |  6 +--
 devops/scripts/benchmarking/benchmark-ci.conf | 13 ++++--
 devops/scripts/benchmarking/benchmark.sh      | 41 +++++++++++++++----
 devops/scripts/benchmarking/common.py         | 10 +++++
 devops/scripts/benchmarking/compare.py        | 11 +++--
 7 files changed, 66 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/sycl-benchmark-aggregate.yml b/.github/workflows/sycl-benchmark-aggregate.yml
index d85fd78577491..da20d0186680e 100644
--- a/.github/workflows/sycl-benchmark-aggregate.yml
+++ b/.github/workflows/sycl-benchmark-aggregate.yml
@@ -57,7 +57,7 @@ jobs:
         with:
           # TODO populate default values before loading configuration
          path: ${{ env.PERF_RES_PATH }}
-          repository: ${{ env.PERF_RES_GIT_REPO }}
+          repository: intel-sandbox/llvm-ci-perf-results #${{ env.PERF_RES_GIT_REPO }}
           branch: ${{ env.PERF_RES_BRANCH }}
       - name: Run aggregator on cloned data
         run: |
diff --git a/.github/workflows/sycl-linux-benchmark.yml b/.github/workflows/sycl-linux-benchmark.yml
index 45c728bb56aa6..1a4d4680684cb 100644
--- a/.github/workflows/sycl-linux-benchmark.yml
+++ b/.github/workflows/sycl-linux-benchmark.yml
@@ -51,7 +51,7 @@ on:
       cache_results:
         type: boolean
         default: false
-        required: False
+        required: false
 
   workflow_dispatch:
     inputs:
@@ -109,7 +109,7 @@ on:
       cache_results:
         type: boolean
         default: false
-        required: False
+        required: false
 
 permissions:
   contents: write
@@ -242,8 +242,7 @@ jobs:
           clang++ --version
           ls
           export CMPLR_ROOT=$PWD/toolchain
-          ./devops/scripts/benchmarking/benchmark.sh
-          exit $?
+          ./devops/scripts/benchmarking/benchmark.sh ${{ inputs.cache_results == true && '-s' || '' }}
       - name: debug -- delete after
         env:
           SSH_KEY: ${{secrets.ACTIONS_DEPLOY_KEY}}
diff --git a/devops/scripts/benchmarking/aggregate.py b/devops/scripts/benchmarking/aggregate.py
index 771ae56136f8e..80b5e81f8ea47 100644
--- a/devops/scripts/benchmarking/aggregate.py
+++ b/devops/scripts/benchmarking/aggregate.py
@@ -40,13 +40,13 @@ def csv_samples() -> list[str]:
         with Path(f"{common.PERF_RES_PATH}/{benchmark}") as cache_dir:
             # TODO check for time range; What time range do I want?
             return filter(lambda f: f.is_file() and
-                                    common.valid_timestamp(str(f)[-13:]) and str(f)[-13:] > cutoff,
+                                    common.valid_timestamp(str(f)[-17:-4]) and str(f)[-17:-4] > cutoff,
                                     cache_dir.glob(f"{benchmark}-*_*.csv"))
 
     # Calculate median of every desired metric:
     aggregate_s = dict()
     for sample_path in csv_samples():
-        with open(sample_path, mode='r') as sample_file:
+        with open(sample_path, 'r') as sample_file:
             for s in csv.DictReader(sample_file):
                 if s["TestCase"] not in aggregate_s:
                     aggregate_s[s["TestCase"]] = \
@@ -71,5 +71,5 @@ def csv_samples() -> list[str]:
     if not common.valid_timestamp(sys.argv[2]):
         print(f"Bad cutoff timestamp, please use YYMMDD_HHMMSS.")
         exit(1)
-    
+    common.load_configs()
     aggregate_median(sys.argv[1], sys.argv[2])
diff --git a/devops/scripts/benchmarking/benchmark-ci.conf b/devops/scripts/benchmarking/benchmark-ci.conf
index b08d82fcad1a5..adf023d9a1a2a 100644
--- a/devops/scripts/benchmarking/benchmark-ci.conf
+++ b/devops/scripts/benchmarking/benchmark-ci.conf
@@ -1,5 +1,5 @@
 # Git branch settings for llvm-ci-perf-results
-PERF_RES_GIT_REPO="https://github.com/ianayl/llvm-ci-perf-results"
+PERF_RES_GIT_REPO="https://github.com/intel-sandbox/llvm-ci-perf-results"
 PERF_RES_BRANCH="test-compute-bench"
 # Path where llvm-ci-perf-results are cloned
 PERF_RES_PATH="./llvm-ci-perf-res"
@@ -17,7 +17,7 @@ COMPUTE_BENCH_COMPILE_FLAGS="-j2"
 OUTPUT_PATH="."
 
 # Metrics to benchmark, and their allowed variance as a Python dictionary
-METRICS_VARIANCE='{"Median": 0.5}'
+METRICS_VARIANCE='{"Median": 0.15}'
 #METRICS_VARIANCE='{"Median": 0.5, "StdDev": 4.0}'
 
 # Metrics to record using aggregate.py
@@ -25,9 +25,14 @@ METRICS_RECORDED='["Median", "StdDev"]'
 # Threshold to store benchmark files before benchmarking
 # TODO reconsider this
-AVERAGE_THRESHOLD=7
+AVERAGE_THRESHOLD=3
 
 # Default period of time to aggregate for the average
 AVERAGE_CUTOFF_RANGE="7 days ago"
 
 # Format of timestamps used (linux `date` format string)
-TIMESTAMP_FORMAT='%Y%m%d_%H%M%S'
\ No newline at end of file
+TIMESTAMP_FORMAT='%Y%m%d_%H%M%S'
+
+# Log file for test cases that perform over the allowed variance
+BENCHMARK_SLOW_LOG="./benchmarks-over_tolerance.log"
+# Log file for test cases that errored / failed to build
+BENCHMARK_ERROR_LOG="./benchmarks-errored.log"
\ No newline at end of file
diff --git a/devops/scripts/benchmarking/benchmark.sh b/devops/scripts/benchmarking/benchmark.sh
index 5ad82f3c5414a..7fbabda307072 100755
--- a/devops/scripts/benchmarking/benchmark.sh
+++ b/devops/scripts/benchmarking/benchmark.sh
@@ -73,7 +73,7 @@ build_compute_bench() {
 ###
 
 STATUS_SUCCESS=0
-STATUS_FAILED=1
+STATUS_ERROR=1
 
 ###
 samples_under_threshold () {
@@ -89,7 +89,6 @@ check_regression() {
     fi
     BENCHMARKING_ROOT="$BENCHMARKING_ROOT" python "$BENCHMARKING_ROOT/compare.py" "$1" "$2"
     return $?
-    # return $STATUS_FAILED
 }
 
 cache() {
@@ -100,10 +99,14 @@ check_and_cache() {
 
     echo "Checking $testcase..."
     if check_regression $1 $2; then
-        echo "Caching $testcase..."
-        cache $1 $2
+        if [ "$CACHE_RESULTS" -eq "1" ]; then
+            echo "Caching $testcase..."
+            cache $1 $2
+        fi
     else
-        echo "Not caching!"
+        if [ "$CACHE_RESULTS" -eq "1" ]; then
+            echo "Not caching!"
+        fi
     fi
 }
 
@@ -113,8 +116,9 @@ process_benchmarks() {
     echo "### Running and processing selected benchmarks ###"
     if [ -z "$TESTS_CONFIG" ]; then
         echo "Setting tests to run via cli is not currently supported."
-        exit $STATUS_FAILED
+        exit $STATUS_ERROR
     else
+        rm "$BENCHMARK_ERROR_LOG" "$BENCHMARK_SLOW_LOG" 2> /dev/null
         # Ignore lines in the test config starting with #'s
         grep "^[^#]" "$TESTS_CONFIG" | while read -r testcase; do
             echo "# Running $testcase..."
@@ -124,16 +128,32 @@ process_benchmarks() {
             if [ "$?" -eq 0 ] && [ -s "$test_csv_output" ]; then
                 check_and_cache $testcase $test_csv_output
             else
+                # TODO consider capturing error for logging
                 echo "ERROR @ $test_case"
+                echo "-- $testcase: error $?" >> "$BENCHMARK_ERROR_LOG"
             fi
         done
     fi
 }
 
+process_results() {
+    if [ -s "$BENCHMARK_SLOW_LOG" ]; then
+        printf "\n### Tests performing over the allowed variance: ###\n"
+        cat "$BENCHMARK_SLOW_LOG"
+        echo ""
+    fi
+    if [ -s "$BENCHMARK_ERROR_LOG" ]; then
+        printf "\n### Tests that failed to run: ###\n"
+        cat "$BENCHMARK_ERROR_LOG"
+        echo ""
+    fi
+    [ ! -s "$BENCHMARK_SLOW_LOG" ] && [ ! -s "$BENCHMARK_ERROR_LOG" ]
+}
+
 cleanup() {
     echo "### Cleaning up compute-benchmark builds from prior runs ###"
     rm -rf $COMPUTE_BENCH_PATH
-    #rm -rf $PERF_RES_PATH
+    rm -rf $PERF_RES_PATH
     [ ! -z "$_exit_after_cleanup" ] && exit
 }
 
@@ -163,10 +183,11 @@ load_configs() {
 load_configs
 
 COMPUTE_BENCH_COMPILE_FLAGS=""
+CACHE_RESULTS="0"
 TIMESTAMP="$(date +"$TIMESTAMP_FORMAT")"
 
 # CLI overrides to configuration options
-while getopts "p:b:r:f:cC" opt; do
+while getopts "p:b:r:f:cCs" opt; do
     case $opt in
         p) COMPUTE_BENCH_PATH=$OPTARG ;;
         r) COMPUTE_BENCH_GIT_REPO=$OPTARG ;;
@@ -176,6 +197,7 @@ while getopts "p:b:r:f:cC" opt; do
         # performing cleanup
         c) _cleanup=1 ;;
         C) _cleanup=1 && _exit_after_cleanup=1 ;;
+        s) CACHE_RESULTS="1" ;;
         \?) usage ;;
     esac
 done
@@ -189,4 +211,5 @@ fi
 [ ! -d "$PERF_RES_PATH" ] && clone_perf_res
 [ ! -d "$COMPUTE_BENCH_PATH" ] && clone_compute_bench
 [ ! -d "$COMPUTE_BENCH_PATH/build" ] && build_compute_bench
-process_benchmarks
\ No newline at end of file
+process_benchmarks
+process_results
\ No newline at end of file
diff --git a/devops/scripts/benchmarking/common.py b/devops/scripts/benchmarking/common.py
index 56e5a2ffa166e..b30bf82301639 100644
--- a/devops/scripts/benchmarking/common.py
+++ b/devops/scripts/benchmarking/common.py
@@ -3,6 +3,7 @@
 import ast
 
 PERF_RES_PATH, metrics_variance, metrics_recorded = None, None, None
+BENCHMARK_SLOW_LOG, BENCHMARK_ERROR_LOG = None, None
 
 def sanitize(stat: str) -> float:
     # Get rid of %
@@ -22,9 +23,12 @@ def load_configs():
         raise Exception(f"Please provide path to a valid BENCHMARKING_ROOT.")
 
     global PERF_RES_PATH, metrics_variance, metrics_recorded
+    global BENCHMARK_ERROR_LOG, BENCHMARK_SLOW_LOG
     perf_res_re = re.compile(r'^PERF_RES_PATH=(.*)$', re.M)
     m_variance_re = re.compile(r'^METRICS_VARIANCE=(.*)$', re.M)
     m_recorded_re = re.compile(r'^METRICS_RECORDED=(.*)$', re.M)
+    b_slow_re = re.compile(r'^BENCHMARK_SLOW_LOG=(.*)$', re.M)
+    b_error_re = re.compile(r'^BENCHMARK_ERROR_LOG=(.*)$', re.M)
 
     with open(benchmarking_ci_conf_path, 'r') as configs_file:
         configs_str = configs_file.read()
@@ -41,6 +45,12 @@ def load_configs():
 
     for perf_res in perf_res_re.findall(configs_str):
         PERF_RES_PATH = str(perf_res[1:-1])
+
+    for b_slow_log in b_slow_re.findall(configs_str):
+        BENCHMARK_SLOW_LOG = str(b_slow_log[1:-1])
+
+    for b_error_log in b_error_re.findall(configs_str):
+        BENCHMARK_ERROR_LOG = str(b_error_log[1:-1])
 
 
 def valid_timestamp(timestamp: str) -> bool:
diff --git a/devops/scripts/benchmarking/compare.py b/devops/scripts/benchmarking/compare.py
index e2e9b12b0a8e4..c0565cfea1b65 100644
--- a/devops/scripts/benchmarking/compare.py
+++ b/devops/scripts/benchmarking/compare.py
@@ -18,7 +18,7 @@ def compare_to_median(test_name: str, test_csv_path: str):
         exit(-1)
 
     median = dict()
-    with open(median_path, mode='r') as median_csv:
+    with open(median_path, 'r') as median_csv:
         for stat in csv.DictReader(median_csv):
             median[stat["TestCase"]] = \
                 { metric: float(stat[metric]) for metric in common.metrics_variance }
@@ -26,7 +26,7 @@ def compare_to_median(test_name: str, test_csv_path: str):
     # TODO read status codes from a config file
     status = 0
     failure_counts = { metric: 0 for metric in common.metrics_variance }
-    with open(test_csv_path, mode='r') as sample_csv:
+    with open(test_csv_path, 'r') as sample_csv:
         for sample in csv.DictReader(sample_csv):
             # Ignore test cases we haven't profiled before
             if sample["TestCase"] not in median:
@@ -37,8 +37,13 @@ def compare_to_median(test_name: str, test_csv_path: str):
                 if common.sanitize(sample[metric]) > max_tolerated:
                     print("vvv FAILED vvv")
                     print(sample['TestCase'])
-                    print(f"{metric}: {metric} {common.sanitize(sample[metric])} -- Historic avg. {test_median[metric]} (max tolerance {threshold*100}% -- {max_tolerated})")
+                    print(f"{metric}: {common.sanitize(sample[metric])} -- Historic avg. {test_median[metric]} (max tolerance {threshold*100}%: {max_tolerated})")
                     print("^^^^^^^^^^^^^^")
+                    with open(common.BENCHMARK_SLOW_LOG, 'a') as slow_log:
+                        slow_log.write(
+                            f"-- {test_name}::{sample['TestCase']}\n"
+                            f"   {metric}: {common.sanitize(sample[metric])} -- Historic avg. {test_median[metric]} (max tol. {threshold*100}%: {max_tolerated})\n"
+                        )
                     status = 1
                     failure_counts[metric] += 1
     if status != 0:
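
Note (editor's addition, not part of the patch): the sketch below illustrates the kind of tolerance check compare.py applies to each sample before a test case is written to BENCHMARK_SLOW_LOG. It assumes max_tolerated is the historic median scaled by the allowed variance from METRICS_VARIANCE; the test-case name and all numbers are made up for illustration.

    # Hypothetical, standalone illustration of the tolerance check -- not repository code.
    # Assumption: max_tolerated = historic_median * (1 + allowed_variance), with the
    # allowed variance taken from METRICS_VARIANCE='{"Median": 0.15}' in benchmark-ci.conf.
    metrics_variance = {"Median": 0.15}
    historic_median = {"Median": 104.0}                       # e.g. aggregated by aggregate.py
    sample = {"TestCase": "example_case", "Median": "123.5"}  # freshly measured CSV row

    def sanitize(stat: str) -> float:
        # Strip any '%' sign and parse the remaining number (mirrors common.sanitize).
        return float(stat.replace("%", ""))

    for metric, threshold in metrics_variance.items():
        max_tolerated = historic_median[metric] * (1 + threshold)
        if sanitize(sample[metric]) > max_tolerated:
            # compare.py reports such a case and appends it to BENCHMARK_SLOW_LOG,
            # which benchmark.sh's process_results later prints and uses to fail the run.
            print(f"-- {sample['TestCase']}: {metric} {sanitize(sample[metric])} "
                  f"over tolerance ({threshold * 100}%: {max_tolerated})")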