From 3b5454fd7f1a6726b4d2edf75dedccb6410aeaad Mon Sep 17 00:00:00 2001
From: "Li, Ian"
Date: Wed, 18 Sep 2024 13:02:16 -0700
Subject: [PATCH] Fixed aggregate, improved logging

---
 .../workflows/sycl-benchmark-aggregate.yml    |  2 +-
 .github/workflows/sycl-linux-benchmark.yml    |  7 ++--
 devops/scripts/benchmarking/aggregate.py      |  6 +--
 devops/scripts/benchmarking/benchmark-ci.conf | 13 ++++--
 devops/scripts/benchmarking/benchmark.sh      | 41 +++++++++++++++----
 devops/scripts/benchmarking/common.py         | 10 +++++
 devops/scripts/benchmarking/compare.py        | 11 +++--
 7 files changed, 66 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/sycl-benchmark-aggregate.yml b/.github/workflows/sycl-benchmark-aggregate.yml
index d85fd78577491..da20d0186680e 100644
--- a/.github/workflows/sycl-benchmark-aggregate.yml
+++ b/.github/workflows/sycl-benchmark-aggregate.yml
@@ -57,7 +57,7 @@ jobs:
         with:
           # TODO populate default values before loading configuration
          path: ${{ env.PERF_RES_PATH }}
-          repository: ${{ env.PERF_RES_GIT_REPO }}
+          repository: intel-sandbox/llvm-ci-perf-results #${{ env.PERF_RES_GIT_REPO }}
           branch: ${{ env.PERF_RES_BRANCH }}
       - name: Run aggregator on cloned data
         run: |
diff --git a/.github/workflows/sycl-linux-benchmark.yml b/.github/workflows/sycl-linux-benchmark.yml
index 45c728bb56aa6..1a4d4680684cb 100644
--- a/.github/workflows/sycl-linux-benchmark.yml
+++ b/.github/workflows/sycl-linux-benchmark.yml
@@ -51,7 +51,7 @@ on:
       cache_results:
         type: boolean
         default: false
-        required: False
+        required: false
 
   workflow_dispatch:
     inputs:
@@ -109,7 +109,7 @@ on:
       cache_results:
         type: boolean
         default: false
-        required: False
+        required: false
 
 permissions:
   contents: write
@@ -242,8 +242,7 @@ jobs:
           clang++ --version
           ls
           export CMPLR_ROOT=$PWD/toolchain
-          ./devops/scripts/benchmarking/benchmark.sh
-          exit $?
+          ./devops/scripts/benchmarking/benchmark.sh ${{ inputs.cache_results == true && '-s' || '' }}
       - name: debug -- delete after
         env:
           SSH_KEY: ${{secrets.ACTIONS_DEPLOY_KEY}}
diff --git a/devops/scripts/benchmarking/aggregate.py b/devops/scripts/benchmarking/aggregate.py
index 771ae56136f8e..80b5e81f8ea47 100644
--- a/devops/scripts/benchmarking/aggregate.py
+++ b/devops/scripts/benchmarking/aggregate.py
@@ -40,13 +40,13 @@ def csv_samples() -> list[str]:
         with Path(f"{common.PERF_RES_PATH}/{benchmark}") as cache_dir:
             # TODO check for time range; What time range do I want?
             return filter(lambda f: f.is_file() and
-                                    common.valid_timestamp(str(f)[-13:]) and str(f)[-13:] > cutoff,
+                                    common.valid_timestamp(str(f)[-17:-4]) and str(f)[-17:-4] > cutoff,
                                     cache_dir.glob(f"{benchmark}-*_*.csv"))
 
     # Calculate median of every desired metric:
     aggregate_s = dict()
     for sample_path in csv_samples():
-        with open(sample_path, mode='r') as sample_file:
+        with open(sample_path, 'r') as sample_file:
             for s in csv.DictReader(sample_file):
                 if s["TestCase"] not in aggregate_s:
                     aggregate_s[s["TestCase"]] = \
@@ -71,5 +71,5 @@ def csv_samples() -> list[str]:
     if not common.valid_timestamp(sys.argv[2]):
         print(f"Bad cutoff timestamp, please use YYMMDD_HHMMSS.")
         exit(1)
-    
+    common.load_configs()
     aggregate_median(sys.argv[1], sys.argv[2])
diff --git a/devops/scripts/benchmarking/benchmark-ci.conf b/devops/scripts/benchmarking/benchmark-ci.conf
index b08d82fcad1a5..adf023d9a1a2a 100644
--- a/devops/scripts/benchmarking/benchmark-ci.conf
+++ b/devops/scripts/benchmarking/benchmark-ci.conf
@@ -1,5 +1,5 @@
 # Git branch settings for llvm-ci-perf-results
-PERF_RES_GIT_REPO="https://github.com/ianayl/llvm-ci-perf-results"
+PERF_RES_GIT_REPO="https://github.com/intel-sandbox/llvm-ci-perf-results"
 PERF_RES_BRANCH="test-compute-bench"
 # Path where llvm-ci-perf-results are cloned
 PERF_RES_PATH="./llvm-ci-perf-res"
@@ -17,7 +17,7 @@ COMPUTE_BENCH_COMPILE_FLAGS="-j2"
 OUTPUT_PATH="."
 
 # Metrics to benchmark, and their allowed variance as a Python dictionary
-METRICS_VARIANCE='{"Median": 0.5}'
+METRICS_VARIANCE='{"Median": 0.15}'
 #METRICS_VARIANCE='{"Median": 0.5, "StdDev": 4.0}'
 
 # Metrics to record using aggregate.py
@@ -25,9 +25,14 @@ METRICS_RECORDED='["Median", "StdDev"]'
 # Threshold to store benchmark files before benchmarking
 # TODO reconsider this
-AVERAGE_THRESHOLD=7
+AVERAGE_THRESHOLD=3
 
 # Default period of time to aggregate for the average
 AVERAGE_CUTOFF_RANGE="7 days ago"
 
 # Format of timestamps used (linux `date` format string)
-TIMESTAMP_FORMAT='%Y%m%d_%H%M%S'
\ No newline at end of file
+TIMESTAMP_FORMAT='%Y%m%d_%H%M%S'
+
+# Log file for test cases that perform over the allowed variance
+BENCHMARK_SLOW_LOG="./benchmarks-over_tolerance.log"
+# Log file for test cases that errored / failed to build
+BENCHMARK_ERROR_LOG="./benchmarks-errored.log"
\ No newline at end of file
diff --git a/devops/scripts/benchmarking/benchmark.sh b/devops/scripts/benchmarking/benchmark.sh
index 5ad82f3c5414a..7fbabda307072 100755
--- a/devops/scripts/benchmarking/benchmark.sh
+++ b/devops/scripts/benchmarking/benchmark.sh
@@ -73,7 +73,7 @@ build_compute_bench() {
 ###
 
 STATUS_SUCCESS=0
-STATUS_FAILED=1
+STATUS_ERROR=1
 
 ###
 samples_under_threshold () {
@@ -89,7 +89,6 @@ check_regression() {
     fi
     BENCHMARKING_ROOT="$BENCHMARKING_ROOT" python "$BENCHMARKING_ROOT/compare.py" "$1" "$2"
     return $?
-    # return $STATUS_FAILED
 }
 
 cache() {
@@ -100,10 +99,14 @@ check_and_cache() {
 
     echo "Checking $testcase..."
     if check_regression $1 $2; then
-        echo "Caching $testcase..."
-        cache $1 $2
+        if [ "$CACHE_RESULTS" -eq "1" ]; then
+            echo "Caching $testcase..."
+            cache $1 $2
+        fi
     else
-        echo "Not caching!"
+        if [ "$CACHE_RESULTS" -eq "1" ]; then
+            echo "Not caching!"
+        fi
     fi
 }
 
@@ -113,8 +116,9 @@ process_benchmarks() {
     echo "### Running and processing selected benchmarks ###"
     if [ -z "$TESTS_CONFIG" ]; then
         echo "Setting tests to run via cli is not currently supported."
-        exit $STATUS_FAILED
+        exit $STATUS_ERROR
     else
+        rm "$BENCHMARK_ERROR_LOG" "$BENCHMARK_SLOW_LOG" 2> /dev/null
         # Ignore lines in the test config starting with #'s
         grep "^[^#]" "$TESTS_CONFIG" | while read -r testcase; do
             echo "# Running $testcase..."
@@ -124,16 +128,32 @@ process_benchmarks() {
             if [ "$?" -eq 0 ] && [ -s "$test_csv_output" ]; then
                 check_and_cache $testcase $test_csv_output
             else
+                # TODO consider capturing error for logging
                 echo "ERROR @ $test_case"
+                echo "-- $testcase: error $?" >> "$BENCHMARK_ERROR_LOG"
             fi
         done
     fi
 }
 
+process_results() {
+    if [ -s "$BENCHMARK_SLOW_LOG" ]; then
+        printf "\n### Tests performing over the allowed variance: ###\n"
+        cat "$BENCHMARK_SLOW_LOG"
+        echo ""
+    fi
+    if [ -s "$BENCHMARK_ERROR_LOG" ]; then
+        printf "\n### Tests that failed to run: ###\n"
+        cat "$BENCHMARK_ERROR_LOG"
+        echo ""
+    fi
+    [ ! -s "$BENCHMARK_SLOW_LOG" ] && [ ! -s "$BENCHMARK_ERROR_LOG" ]
+}
+
 cleanup() {
     echo "### Cleaning up compute-benchmark builds from prior runs ###"
     rm -rf $COMPUTE_BENCH_PATH
-    #rm -rf $PERF_RES_PATH
+    rm -rf $PERF_RES_PATH
     [ ! -z "$_exit_after_cleanup" ] && exit
 }
 
@@ -163,10 +183,11 @@ load_configs() {
 load_configs
 
 COMPUTE_BENCH_COMPILE_FLAGS=""
+CACHE_RESULTS="0"
 TIMESTAMP="$(date +"$TIMESTAMP_FORMAT")"
 
 # CLI overrides to configuration options
-while getopts "p:b:r:f:cC" opt; do
+while getopts "p:b:r:f:cCs" opt; do
     case $opt in
         p) COMPUTE_BENCH_PATH=$OPTARG ;;
         r) COMPUTE_BENCH_GIT_REPO=$OPTARG ;;
@@ -176,6 +197,7 @@ while getopts "p:b:r:f:cC" opt; do
         # performing cleanup
         c) _cleanup=1 ;;
         C) _cleanup=1 && _exit_after_cleanup=1 ;;
+        s) CACHE_RESULTS="1" ;;
         \?) usage ;;
     esac
 done
@@ -189,4 +211,5 @@ fi
 [ ! -d "$PERF_RES_PATH" ] && clone_perf_res
 [ ! -d "$COMPUTE_BENCH_PATH" ] && clone_compute_bench
 [ ! -d "$COMPUTE_BENCH_PATH/build" ] && build_compute_bench
-process_benchmarks
\ No newline at end of file
+process_benchmarks
+process_results
\ No newline at end of file
diff --git a/devops/scripts/benchmarking/common.py b/devops/scripts/benchmarking/common.py
index 56e5a2ffa166e..b30bf82301639 100644
--- a/devops/scripts/benchmarking/common.py
+++ b/devops/scripts/benchmarking/common.py
@@ -3,6 +3,7 @@
 import ast
 
 PERF_RES_PATH, metrics_variance, metrics_recorded = None, None, None
+BENCHMARK_SLOW_LOG, BENCHMARK_ERROR_LOG = None, None
 
 def sanitize(stat: str) -> float:
     # Get rid of %
@@ -22,9 +23,12 @@ def load_configs():
         raise Exception(f"Please provide path to a valid BENCHMARKING_ROOT.")
 
     global PERF_RES_PATH, metrics_variance, metrics_recorded
+    global BENCHMARK_ERROR_LOG, BENCHMARK_SLOW_LOG
     perf_res_re = re.compile(r'^PERF_RES_PATH=(.*)$', re.M)
     m_variance_re = re.compile(r'^METRICS_VARIANCE=(.*)$', re.M)
     m_recorded_re = re.compile(r'^METRICS_RECORDED=(.*)$', re.M)
+    b_slow_re = re.compile(r'^BENCHMARK_SLOW_LOG=(.*)$', re.M)
+    b_error_re = re.compile(r'^BENCHMARK_ERROR_LOG=(.*)$', re.M)
 
     with open(benchmarking_ci_conf_path, 'r') as configs_file:
         configs_str = configs_file.read()
@@ -41,6 +45,12 @@ def load_configs():
 
     for perf_res in perf_res_re.findall(configs_str):
         PERF_RES_PATH = str(perf_res[1:-1])
+
+    for b_slow_log in b_slow_re.findall(configs_str):
+        BENCHMARK_SLOW_LOG = str(b_slow_log[1:-1])
+
+    for b_error_log in b_error_re.findall(configs_str):
+        BENCHMARK_ERROR_LOG = str(b_error_log[1:-1])
 
 
 def valid_timestamp(timestamp: str) -> bool:
diff --git a/devops/scripts/benchmarking/compare.py b/devops/scripts/benchmarking/compare.py
index e2e9b12b0a8e4..c0565cfea1b65 100644
--- a/devops/scripts/benchmarking/compare.py
+++ b/devops/scripts/benchmarking/compare.py
@@ -18,7 +18,7 @@ def compare_to_median(test_name: str, test_csv_path: str):
         exit(-1)
 
     median = dict()
-    with open(median_path, mode='r') as median_csv:
+    with open(median_path, 'r') as median_csv:
         for stat in csv.DictReader(median_csv):
             median[stat["TestCase"]] = \
                 { metric: float(stat[metric]) for metric in common.metrics_variance }
@@ -26,7 +26,7 @@ def compare_to_median(test_name: str, test_csv_path: str):
     # TODO read status codes from a config file
     status = 0
     failure_counts = { metric: 0 for metric in common.metrics_variance }
-    with open(test_csv_path, mode='r') as sample_csv:
+    with open(test_csv_path, 'r') as sample_csv:
         for sample in csv.DictReader(sample_csv):
             # Ignore test cases we haven't profiled before
             if sample["TestCase"] not in median:
@@ -37,8 +37,13 @@ def compare_to_median(test_name: str, test_csv_path: str):
                 if common.sanitize(sample[metric]) > max_tolerated:
                     print("vvv FAILED vvv")
                     print(sample['TestCase'])
-                    print(f"{metric}: {metric} {common.sanitize(sample[metric])} -- Historic avg. {test_median[metric]} (max tolerance {threshold*100}% -- {max_tolerated})")
+                    print(f"{metric}: {common.sanitize(sample[metric])} -- Historic avg. {test_median[metric]} (max tolerance {threshold*100}%: {max_tolerated})")
                     print("^^^^^^^^^^^^^^")
+                    with open(common.BENCHMARK_SLOW_LOG, 'a') as slow_log:
+                        slow_log.write(
+                            f"-- {test_name}::{sample['TestCase']}\n"
+                            f"   {metric}: {common.sanitize(sample[metric])} -- Historic avg. {test_median[metric]} (max tol. {threshold*100}%: {max_tolerated})\n"
+                        )
                     status = 1
                     failure_counts[metric] += 1
     if status != 0:
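
Note (editor's addition, not part of the patch): the sketch below illustrates the kind of tolerance check compare.py applies to each sample before a test case is written to BENCHMARK_SLOW_LOG. It assumes max_tolerated is the historic median scaled by the allowed variance from METRICS_VARIANCE; the test-case name and all numbers are made up for illustration.

    # Hypothetical, standalone illustration of the tolerance check -- not repository code.
    # Assumption: max_tolerated = historic_median * (1 + allowed_variance), with the
    # allowed variance taken from METRICS_VARIANCE='{"Median": 0.15}' in benchmark-ci.conf.
    metrics_variance = {"Median": 0.15}
    historic_median = {"Median": 104.0}                       # e.g. aggregated by aggregate.py
    sample = {"TestCase": "example_case", "Median": "123.5"}  # freshly measured CSV row

    def sanitize(stat: str) -> float:
        # Strip any '%' sign and parse the remaining number (mirrors common.sanitize).
        return float(stat.replace("%", ""))

    for metric, threshold in metrics_variance.items():
        max_tolerated = historic_median[metric] * (1 + threshold)
        if sanitize(sample[metric]) > max_tolerated:
            # compare.py reports such a case and appends it to BENCHMARK_SLOW_LOG,
            # which benchmark.sh's process_results later prints and uses to fail the run.
            print(f"-- {sample['TestCase']}: {metric} {sanitize(sample[metric])} "
                  f"over tolerance ({threshold * 100}%: {max_tolerated})")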