From 940e3beff55e1077e0f8a10792a2ac5eb92da27b Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Thu, 5 Sep 2024 23:21:48 -0700 Subject: [PATCH] Complete redo of workflow, switch to compute-benchmarks --- .github/workflows/sycl-linux-run-tests.yml | 2 +- devops/scripts/benchmarking/aggregate.py | 70 ++++++++ devops/scripts/benchmarking/benchmark-ci.conf | 26 +++ devops/scripts/benchmarking/benchmark.sh | 170 ++++++++++++++++++ devops/scripts/benchmarking/common.py | 43 +++++ devops/scripts/benchmarking/compare.py | 43 +++++ .../scripts/benchmarking/enabled_tests.conf | 3 + devops/scripts/sycl-bench.sh | 105 ----------- 8 files changed, 356 insertions(+), 106 deletions(-) create mode 100644 devops/scripts/benchmarking/aggregate.py create mode 100644 devops/scripts/benchmarking/benchmark-ci.conf create mode 100755 devops/scripts/benchmarking/benchmark.sh create mode 100644 devops/scripts/benchmarking/common.py create mode 100644 devops/scripts/benchmarking/compare.py create mode 100644 devops/scripts/benchmarking/enabled_tests.conf delete mode 100755 devops/scripts/sycl-bench.sh diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml index 48200bfb2748c..089b6020d2577 100644 --- a/.github/workflows/sycl-linux-run-tests.yml +++ b/.github/workflows/sycl-linux-run-tests.yml @@ -367,7 +367,7 @@ jobs: - name: Run sycl-bench microbenchmarks id: run_benchmarks if: inputs.tests_selector == 'benchmark' - run: ./devops/scripts/sycl-bench.sh https://github.com/ianayl/sycl-bench + run: ./devops/scripts/benchmarking/benchmark.sh - name: Upload sycl-bench microbenchmark results if: inputs.tests_selector == 'benchmark' && steps.run_benchmarks.outcome == 'success' uses: actions/upload-artifact@v4 diff --git a/devops/scripts/benchmarking/aggregate.py b/devops/scripts/benchmarking/aggregate.py new file mode 100644 index 0000000000000..95fd21964d896 --- /dev/null +++ b/devops/scripts/benchmarking/aggregate.py @@ -0,0 +1,70 @@ +import csv +import sys +from pathlib import Path +import heapq + +import common + +class StreamingMedian: + + def __init__(self): + self.minheap_larger = [] + self.maxheap_smaller = [] + # Note: numbers on maxheap should be negative, as heapq + # is minheap by default + + def add(self, n: float): + if len(self.maxheap_smaller) == 0 or -self.maxheap_smaller[0] >= n: + heapq.heappush(self.maxheap_smaller, -n) + else: + heapq.heappush(self.minheap_larger, n) + + if len(self.maxheap_smaller) > len(self.minheap_larger) + 1: + heapq.heappush(self.minheap_larger, + -heapq.heappop(self.maxheap_smaller)) + elif len(self.maxheap_smaller) < len(self.minheap_larger): + heapq.heappush(self.maxheap_smaller, + -heapq.heappop(self.minheap_larger)) + + def get_median(self) -> float: + if len(self.maxheap_smaller) == len(self.minheap_larger): + return (-self.maxheap_smaller[0] + self.minheap_larger[0]) / 2.0 + else: + return -self.maxheap_smaller[0] + + +def aggregate_median(benchmark: str): + + def csv_samples() -> list[str]: + # TODO check that the path below is valid directory + with Path(f"{common.PERF_RES_PATH}/{benchmark}") as cache_dir: + # TODO check for time range; What time range do I want? 
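+        # Treat every regular file matching "<benchmark>-*.csv" in the cached
+        # results directory as one historical sample; each sample row feeds
+        # the streaming medians computed below.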
+ return filter(lambda f: f.is_file(), + cache_dir.glob(f"{benchmark}-*.csv")) + + # Calculate median of every desired metric: + aggregate_s = dict() + for sample_path in csv_samples(): + with open(sample_path, mode='r') as sample_file: + for s in csv.DictReader(sample_file): + if s["TestCase"] not in aggregate_s: + aggregate_s[s["TestCase"]] = \ + { metric: StreamingMedian() for metric in common.metrics_variance } + for metric in common.metrics_variance: + aggregate_s[s["TestCase"]][metric].add(common.sanitize(s[metric])) + + with open(f"{common.PERF_RES_PATH}/{benchmark}/{benchmark}-median.csv", 'w') as output_csv: + writer = csv.DictWriter(output_csv, + fieldnames=["TestCase", *common.metrics_variance.keys()]) + writer.writeheader() + for test_case in aggregate_s: + writer.writerow({ "TestCase": test_case } | + { metric: aggregate_s[test_case][metric].get_median() + for metric in common.metrics_variance }) + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print(f"Usage: {sys.argv[0]} ") + exit() + aggregate_median(sys.argv[1]) diff --git a/devops/scripts/benchmarking/benchmark-ci.conf b/devops/scripts/benchmarking/benchmark-ci.conf new file mode 100644 index 0000000000000..679b93604b9d0 --- /dev/null +++ b/devops/scripts/benchmarking/benchmark-ci.conf @@ -0,0 +1,26 @@ +# Git branch settings for llvm-ci-perf-results +PERF_RES_GIT_REPO="https://github.com/intel-sandbox/llvm-ci-perf-results" +PERF_RES_BRANCH="test-compute-bench" +# Path where llvm-ci-perf-results are cloned +PERF_RES_PATH="./llvm-ci-perf-res" + +# Git branch settings for compute-benchmarks +COMPUTE_BENCH_GIT_REPO="https://github.com/ianayl/compute-benchmarks" +COMPUTE_BENCH_BRANCH="update-sycl" + +# Path to compile and build compute-benchmarks +COMPUTE_BENCH_PATH="./compute-benchmarks" + +# Path to temporarily store compute-benchmark results +OUTPUT_PATH="." + +# Metrics to benchmark, and their allowed variance as a Python dictionary +METRICS_VARIANCE='{"Median": 0.5}' +#METRICS_VARIANCE='{"Median": 0.5, "StdDev": 4.0}' + +# Metrics to record using aggregate.py +METRICS_RECORDED='["Median", "StdDev"]' + +# Threshold to store benchmark files before benchmarking +AVERAGE_THRESHOLD=7 +# TODO reconsider this \ No newline at end of file diff --git a/devops/scripts/benchmarking/benchmark.sh b/devops/scripts/benchmarking/benchmark.sh new file mode 100755 index 0000000000000..66c75fcdd8d80 --- /dev/null +++ b/devops/scripts/benchmarking/benchmark.sh @@ -0,0 +1,170 @@ +#!/bin/sh + +# +# benchmark.sh: Benchmark dpcpp using compute-benchmarks +# + +# TODO fix +usage () { + >&2 echo "Usage: $0 [-B ] + -B Path to clone and build compute-benchmarks on + +This script builds and runs benchmarks from compute-benchmarks." + exit 1 +} + +clone_perf_res() { + echo "### Cloning llvm-ci-perf-res ($PERF_RES_GIT_REPO:$PERF_RES_BRANCH) ###" + mkdir -p "$(dirname $PERF_RES_PATH)" + git clone -b $PERF_RES_BRANCH $PERF_RES_GIT_REPO $PERF_RES_PATH + [ "$?" -ne 0 ] && exit $? +} + +clone_compute_bench() { + echo "### Cloning compute-benchmarks ($COMPUTE_BENCH_GIT_REPO:$COMPUTE_BENCH_BRANCH) ###" + mkdir -p "$(dirname $COMPUTE_BENCH_PATH)" + git clone -b $COMPUTE_BENCH_BRANCH \ + --recurse-submodules $COMPUTE_BENCH_GIT_REPO \ + $COMPUTE_BENCH_PATH + [ "$?" -ne 0 ] && exit $? +} + +build_compute_bench() { + echo "### Building compute-benchmarks ($COMPUTE_BENCH_GIT_REPO:$COMPUTE_BENCH_BRANCH) ###" + mkdir $COMPUTE_BENCH_PATH/build && cd $COMPUTE_BENCH_PATH/build && + cmake .. -DBUILD_SYCL=ON && cmake --build . + compute_bench_build_stat=$? 
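+    # Capture the cmake exit status before `cd -`, which would otherwise
+    # overwrite $? and silently mask a failed build.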
+ cd - + [ "$compute_bench_build_stat" -ne 0 ] && exit $compute_bench_build_stat +} + +print_bench_res() { + # Usage: print_bench_res + if [ ! -s $1 ]; then + printf "NO OUTPUT! (Status $2)\n" | tee -a $3 + return # Do not proceed if file is empty + fi + + get_csv_col_index $1 run-time-mean + tmp_run_time_mean_i=$tmp_csv_col_i + get_csv_col_index $1 run-time-median + tmp_run_time_median_i=$tmp_csv_col_i + get_csv_col_index $1 run-time-throughput + tmp_run_time_throughput_i=$tmp_csv_col_i + + # `sycl-bench` output seems to like inserting the header multiple times. + # Here we cache the header to make sure it prints only once: + tmp_header_title="$(cat $1 | head -n 1 | sed 's/^\# Benchmark name/benchmark/')" + tmp_result="$(cat $1 | grep '^[^\#]')" + + printf "%s\n%s" "$tmp_header_title" "$tmp_result" \ + | awk -F',' -v me="$tmp_run_time_mean_i" \ + -v md="$tmp_run_time_median_i" \ + -v th="$tmp_run_time_throughput_i" \ + '{printf "%-57s %-13s %-15s %-20s\n", $1, $me, $md, $th }' \ + | tee -a $3 # Print to summary file +} + +### +STATUS_SUCCESS=0 +STATUS_FAILED=1 +### + +samples_under_threshold () { + mkdir -p $1 + file_count="$(find $1 -maxdepth 1 -type f | wc -l )" + [ "$file_count" -lt "$AVERAGE_THRESHOLD" ] +} + +check_regression() { + if samples_under_threshold "$PERF_RES_PATH/$1"; then + echo "Not enough samples to construct an average, performance check skipped!" + return $STATUS_SUCCESS + fi + BENCHMARKING_ROOT="$BENCHMARKING_ROOT" python "$BENCHMARKING_ROOT/compare.py" "$1" "$2" + return $? + # return $STATUS_FAILED +} + +cache() { + mv "$2" "$PERF_RES_PATH/$1/" +} + +# Check for a regression, and cache if no regression found +check_and_cache() { + echo "Checking $testcase..." + if check_regression $1 $2; then + echo "Caching $testcase..." + cache $1 $2 + else + echo "Not caching!" + fi +} + +process_benchmarks() { + TIMESTAMP="$(date '+%Y%m%d_%H%M%S')" + mkdir -p "$PERF_RES_PATH" + + echo "### Running and processing selected benchmarks ###" + if [ -z "$TESTS_CONFIG" ]; then + echo "Setting tests to run via cli is not currently supported." + exit $STATUS_FAILED + else + # Ignore lines in the test config starting with #'s + grep "^[^#]" "$TESTS_CONFIG" | while read -r testcase; do + echo "# Running $testcase..." + test_csv_output="$OUTPUT_PATH/$testcase-$TIMESTAMP.csv" + $COMPUTE_BENCH_PATH/build/bin/$testcase --csv | tail +8 > "$test_csv_output" + # The tail +8 filters out initial debug prints not in csv format + if [ "$?" -eq 0 ] && [ -s "$test_csv_output" ]; then + check_and_cache $testcase $test_csv_output + else + echo "ERROR @ $test_case" + fi + done + fi +} + +cleanup() { + rm -r $COMPUTE_BENCH_PATH +} + +load_configs() { + # This script needs to know where the "BENCHMARKING_ROOT" directory is, + # containing all the configuration files and the compare script. + # + # If this is not provided, this function tries to guess where the files + # are based on how the script is called, and verifies that all necessary + # configs and scripts are reachable. + [ -z "$BENCHMARKING_ROOT" ] && BENCHMARKING_ROOT="$(dirname $0)" + + BENCHMARK_CI_CONFIG="$BENCHMARKING_ROOT/benchmark-ci.conf" + TESTS_CONFIG="$BENCHMARKING_ROOT/enabled_tests.conf" + COMPARE_PATH="$BENCHMARKING_ROOT/compare.py" + + for file in "$BENCHMARK_CI_CONFIG" "$TESTS_CONFIG" "$COMPARE_PATH"; do + if [ ! -f "$file" ]; then + echo "$(basename $file) not found, please provide path to BENCHMARKING_ROOT." + exit -1 + fi + done + + . 
$BENCHMARK_CI_CONFIG +} + +load_configs + +# CLI overrides to configuration options +while getopts "p:b:r:" opt; do + case $opt in + p) COMPUTE_BENCH_PATH=$OPTARG ;; + r) COMPUTE_BENCH_GIT_REPO=$OPTARG ;; + b) COMPUTE_BENCH_BRANCH=$OPTARG ;; + \?) usage ;; + esac +done + +[ ! -d "$PERF_RES_PATH" ] && clone_perf_res +[ ! -d "$COMPUTE_BENCH_PATH" ] && clone_compute_bench +[ ! -d "$COMPUTE_BENCH_PATH/build" ] && build_compute_bench +process_benchmarks \ No newline at end of file diff --git a/devops/scripts/benchmarking/common.py b/devops/scripts/benchmarking/common.py new file mode 100644 index 0000000000000..61272db6db618 --- /dev/null +++ b/devops/scripts/benchmarking/common.py @@ -0,0 +1,43 @@ +import os +import re +import ast + +PERF_RES_PATH, metrics_variance, metrics_recorded = None, None, None + +def sanitize(stat: str) -> float: + # Get rid of % + if stat[-1] == '%': + stat = stat[:-1] + return float(stat) + + +def load_configs(): + BENCHMARKING_ROOT = os.getenv("BENCHMARKING_ROOT") + if BENCHMARKING_ROOT is None: + # Try to predict where BENCHMARKING_ROOT is based on executable + BENCHMARKING_ROOT = os.path.dirname(os.path.abspath(__file__)) + + benchmarking_ci_conf_path = f"{BENCHMARKING_ROOT}/benchmark-ci.conf" + if not os.path.isfile(benchmarking_ci_conf_path): + raise Exception(f"Please provide path to a valid BENCHMARKING_ROOT.") + + global PERF_RES_PATH, metrics_variance, metrics_recorded + perf_res_re = re.compile(r'^PERF_RES_PATH=(.*)$', re.M) + m_variance_re = re.compile(r'^METRICS_VARIANCE=(.*)$', re.M) + m_recorded_re = re.compile(r'^METRICS_RECORDED=(.*)$', re.M) + + with open(benchmarking_ci_conf_path, 'r') as configs_file: + configs_str = configs_file.read() + + for m_variance in m_variance_re.findall(configs_str): + metrics_variance = ast.literal_eval(m_variance.strip()[1:-1]) + if not isinstance(metrics_variance, dict): + raise TypeError("Error in benchmark-ci.conf: METRICS_VARIANCE is not a python dict.") + + for m_recorded in m_recorded_re.findall(configs_str): + metrics_recorded = ast.literal_eval(m_recorded.strip()[1:-1]) + if not isinstance(metrics_recorded, list): + raise TypeError("Error in benchmark-ci.conf: METRICS_RECORDED is not a python list.") + + for perf_res in perf_res_re.findall(configs_str): + PERF_RES_PATH = str(perf_res[1:-1]) \ No newline at end of file diff --git a/devops/scripts/benchmarking/compare.py b/devops/scripts/benchmarking/compare.py new file mode 100644 index 0000000000000..9987938256330 --- /dev/null +++ b/devops/scripts/benchmarking/compare.py @@ -0,0 +1,43 @@ +import csv +import sys +from pathlib import Path + +import common + +# TODO compare_to(metric) instead? 
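+# compare_to_median(): check a fresh run of <test_name> against the cached
+# per-test-case medians in <PERF_RES_PATH>/<test_name>/<test_name>-median.csv.
+# A sample fails a metric when it exceeds median * (1 + allowed variance);
+# the returned status (0 = pass, 1 = regression) becomes this script's exit
+# code, which benchmark.sh uses to decide whether to cache the new results.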
+def compare_to_median(test_name: str, test_csv_path: str): + median = dict() + with open(f"{common.PERF_RES_PATH}/{test_name}/{test_name}-median.csv", mode='r') as median_csv: + for stat in csv.DictReader(median_csv): + median[stat["TestCase"]] = \ + { metric: float(stat[metric]) for metric in common.metrics_variance } + + # TODO read status codes from a config file + status = 0 + failure_counts = { metric: 0 for metric in common.metrics_variance } + with open(test_csv_path, mode='r') as sample_csv: + for sample in csv.DictReader(sample_csv): + # Ignore test cases we haven't profiled before + if sample["TestCase"] not in median: + continue + test_median = median[sample["TestCase"]] + for metric, threshold in common.metrics_variance.items(): + max_tolerated = test_median[metric] * (1 + threshold) + if common.sanitize(sample[metric]) > max_tolerated: + print("vvv FAILED vvv") + print(sample['TestCase']) + print(f"{metric}: {metric} {common.sanitize(sample[metric])} -- Historic avg. {test_median[metric]} (max tolerance {threshold*100}% -- {max_tolerated})") + print("^^^^^^^^^^^^^^") + status = 1 + failure_counts[metric] += 1 + if status != 0: + print(f"Failure counts: {failure_counts}") + return status + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print(f"Usage: {sys.argv[0]} ") + exit(-1) + common.load_configs() + exit(compare_to_median(sys.argv[1], sys.argv[2])) diff --git a/devops/scripts/benchmarking/enabled_tests.conf b/devops/scripts/benchmarking/enabled_tests.conf new file mode 100644 index 0000000000000..7aaec4919a416 --- /dev/null +++ b/devops/scripts/benchmarking/enabled_tests.conf @@ -0,0 +1,3 @@ +# Test cases to be enabled: +api_overhead_benchmark_sycl +memory_benchmark_sycl diff --git a/devops/scripts/sycl-bench.sh b/devops/scripts/sycl-bench.sh deleted file mode 100755 index 4b00a60f178aa..0000000000000 --- a/devops/scripts/sycl-bench.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/sh - -# sycl-bench.sh: Benchmark dpcpp using sycl-bench - -usage () { - >&2 echo "Usage: $0 [-B ] - -B Path to clone and build sycl-bench on - -This script builds and runs benchmarks from sycl-bench." - exit 1 -} - -clone() { - mkdir -p $SYCL_BENCH_PATH - git clone $SYCL_BENCH_GIT_REPO $SYCL_BENCH_PATH || return $? -} - -build() { - cd $SYCL_BENCH_PATH - cmake -DSYCL_IMPL=dpcpp -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=./bin -S . -B ./build && - cmake --build ./build || return $? - cd - -} - -get_csv_col_index() { - # Determine the index of a column in a CSV given its title - # Usage: get_csv_col_index - tmp_csv_col_i="$(cat "$1" | head -n 1 | grep -o "^.*$2," | grep -o ',' | wc -l)" -} - -print_bench_res() { - # Usage: print_bench_res - if [ ! -s $1 ]; then - printf "NO OUTPUT! (Status $2)\n" | tee -a $3 - return # Do not proceed if file is empty - fi - - get_csv_col_index $1 run-time-mean - tmp_run_time_mean_i=$tmp_csv_col_i - get_csv_col_index $1 run-time-median - tmp_run_time_median_i=$tmp_csv_col_i - get_csv_col_index $1 run-time-throughput - tmp_run_time_throughput_i=$tmp_csv_col_i - - # `sycl-bench` output seems to like inserting the header multiple times. 
- # Here we cache the header to make sure it prints only once: - tmp_header_title="$(cat $1 | head -n 1 | sed 's/^\# Benchmark name/benchmark/')" - tmp_result="$(cat $1 | grep '^[^\#]')" - - printf "%s\n%s" "$tmp_header_title" "$tmp_result" \ - | awk -F',' -v me="$tmp_run_time_mean_i" \ - -v md="$tmp_run_time_median_i" \ - -v th="$tmp_run_time_throughput_i" \ - '{printf "%-57s %-13s %-15s %-20s\n", $1, $me, $md, $th }' \ - | tee -a $3 # Print to summary file -} - -# run sycl bench step -run() { - TIMESTAMP="$(date '+%Y%m%d_%H%M%S')" - mkdir "$SYCL_BENCH_PATH/build/bench-$TIMESTAMP/" - tmp_summary_file="$SYCL_BENCH_PATH/build/bench-$TIMESTAMP/summary.txt" - - for file in $SYCL_BENCH_PATH/build/bin/*; do - # TODO -size should not be always 256, caution - tmp_bench_output="$SYCL_BENCH_PATH/build/bench-$TIMESTAMP/$(basename $file).csv" - tmp_bench_log="$SYCL_BENCH_PATH/build/bench-$TIMESTAMP/$(basename $file).log" - - tmp_err="0" - printf "\n### Results for $(basename $file) ###\n" | tee -a $tmp_summary_file - # The pipe here suppresses errors in a way that doesn't stop github actions: - $file --output=$tmp_bench_output --no-verification --size=256 2> "$tmp_bench_log" || tmp_err=$? - print_bench_res $tmp_bench_output $tmp_err $tmp_summary_file - # Remove log if nothing logged - [ ! -s "$tmp_bench_log" ] && rm "$tmp_bench_log" || cat "$tmp_bench_log" | tee -a $tmp_summary_file - done - - # Export timestamp for later use - [ -f "$GITHUB_OUTPUT" ] && echo TIMESTAMP=$TIMESTAMP >> $GITHUB_OUTPUT -} - -compress() { - tar -I gzip -cf "$SYCL_BENCH_PATH/build/bench-$TIMESTAMP.tar.gz" -C "$SYCL_BENCH_PATH/build/bench-$TIMESTAMP" . - if [ -f "$SYCL_BENCH_PATH/build/bench-$TIMESTAMP.tar.gz" ] && [ -f "$GITHUB_OUTPUT" ]; then - echo BENCHMARK_RESULTS="$SYCL_BENCH_PATH/build/bench-$TIMESTAMP.tar.gz" >> $GITHUB_OUTPUT - fi -} - -cleanup() { - rm -r $SYCL_BENCH_PATH -} - - -[ "$#" -lt "1" ] && usage - -SYCL_BENCH_GIT_REPO="$1"; shift -SYCL_BENCH_PATH="./sycl-bench" -while getopts "B:" opt; do - case $opt in - B) SYCL_BENCH_PATH=$OPTARG ;; - \?) usage ;; - esac -done - -clone && build && run && compress
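
A minimal sketch of how the new entry point could be exercised locally, outside CI. It assumes a SYCL-enabled compiler is already on PATH and that the command is run from the repository root; the -p/-r/-b flags are the getopts overrides defined in benchmark.sh, and exporting BENCHMARKING_ROOT is optional since the script falls back to its own directory:

    export BENCHMARKING_ROOT=./devops/scripts/benchmarking
    ./devops/scripts/benchmarking/benchmark.sh \
        -p ./compute-benchmarks \
        -r https://github.com/ianayl/compute-benchmarks \
        -b update-sycl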