Triton benchmarks #908

Workflow file for this run

.github/workflows/triton-benchmarks.yml at 161f980

	# Workflow display name, and a per-run title taken from the `run_name` input.
	name: Triton benchmarks
	run-name: ${{ inputs.run_name }}

	# Triggers: manual dispatch (with inputs), a daily schedule, and pull requests
	# against main that touch this workflow file or the benchmarks tree.
	on:
	  workflow_dispatch:
	    inputs:
	      runner_label:
	        description: Runner label, keep empty for default
	        type: string
	        default: ""
	      tag:
	        description: Tag for benchmark results
	        type: string
	        default: "test"
	      benchmarking_method:
	        description: The method used to obtain performance numbers
	        type: choice
	        options:
	          - PYTORCH_LEGACY_PROFILER_USING_IPEX
	          - ELAPSED_TIME
	          - UPSTREAM_PYTORCH_PROFILER
	        default: PYTORCH_LEGACY_PROFILER_USING_IPEX
	      run_name:
	        description: Run name
	        type: string
	        default: "Triton benchmarks"
	      # JSON array of skip keys; each benchmark step checks it with
	      # contains(fromJson(...), '<key>').
	      skip_benchmarks:
	        description: JSON list of benchmarks to skip
	        type: string
	        default: "[]"
	  schedule:
	    # Every day at 23:05 (GitHub evaluates cron schedules in UTC).
	    - cron: "5 23 * * *"
	  pull_request:
	    branches:
	      - main
	    paths:
	      - .github/workflows/triton-benchmarks.yml
	      - benchmarks/**

	# Grant the workflow token read-only access to all permission scopes.
	permissions: read-all

	# Workflow-level environment variables, available to every step.
	env:
	  PYTHON_VERSION: "3.10"
	  # Uses the dispatch input when present; otherwise the IPEX legacy profiler.
	  BENCHMARKING_METHOD: ${{ inputs.benchmarking_method || 'PYTORCH_LEGACY_PROFILER_USING_IPEX' }}
	  # '1' for every event other than workflow_dispatch, and for dispatch runs that
	  # select PYTORCH_LEGACY_PROFILER_USING_IPEX; '0' otherwise.
	  USE_IPEX: ${{ github.event_name != 'workflow_dispatch' && '1' || inputs.benchmarking_method == 'PYTORCH_LEGACY_PROFILER_USING_IPEX' && '1' || '0' }}
	  # A non-empty `tag` input wins; otherwise 'pr' for pull requests, 'ci' for
	  # scheduled runs, and 'test' as the final fallback.
	  TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && 'pr') || (github.event_name == 'schedule' && 'ci') || 'test' }}

	# One job: set up Python/PyTorch (optionally IPEX), build and install Triton,
	# run each benchmark, and upload the generated reports.
	jobs:
	  build:
	    name: Triton benchmarks
	    runs-on:
	      # Runner label from the dispatch input, or 'max1550' when none is given.
	      - ${{ inputs.runner_label || 'max1550' }}
	    timeout-minutes: 720
	    defaults:
	      run:
	        # Every `run` step sources the oneAPI environment before its script.
	        shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      - name: Load pip cache
	        id: pip-cache
	        uses: ./.github/actions/load
	        with:
	          path: $HOME/.cache/pip
	          # pip cache per commit id just to minimize network traffic
	          key: pip-$PYTHON_VERSION-$GITHUB_SHA

	      - name: Install Python
	        uses: actions/setup-python@v5
	        with:
	          python-version: ${{ env.PYTHON_VERSION }}

	      - name: Install Python build dependencies
	        run: |
	          pip install wheel cmake

	      # Exactly one of the two PyTorch setup steps runs, selected by USE_IPEX.
	      - name: Setup PyTorch with IPEX
	        if: ${{ env.USE_IPEX == '1' }}
	        uses: ./.github/actions/setup-pytorch
	        with:
	          repository: Stonepia/pytorch

	      - name: Setup PyTorch without IPEX
	        if: ${{ env.USE_IPEX == '0' }}
	        uses: ./.github/actions/setup-pytorch
	        with:
	          repository: pytorch/pytorch

	      - name: Setup IPEX
	        if: ${{ env.USE_IPEX == '1' }}
	        uses: ./.github/actions/setup-ipex

	      - name: Build Triton wheels
	        uses: ./.github/actions/setup-triton
	        with:
	          command: DEBUG=1 python setup.py bdist_wheel

	      - name: Install Triton
	        run: |
	          pip install python/dist/*.whl

	      - name: Install benchmark dependencies
	        run: |
	          pip install matplotlib pandas tabulate

	      # Exports REPORTS (absolute path) through GITHUB_ENV for all later steps.
	      - name: Create reports dir
	        run: |
	          mkdir reports
	          echo "REPORTS=$PWD/reports" >> $GITHUB_ENV

	      - name: Install benchmarks
	        id: install
	        run: |
	          cd benchmarks
	          python setup.py install

	      # Every benchmark step below shares the same guard: the install step
	      # succeeded, the run was not cancelled, and the step's key is not listed
	      # in the `skip_benchmarks` input. Because the guard uses `!cancelled()`
	      # rather than the implicit `success()`, a failure in one benchmark does
	      # not prevent the following benchmarks from running.
	      - name: Run Triton Softmax kernel benchmark
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          python fused_softmax.py --reports $REPORTS
	          source ../../scripts/capture-hw-details.sh
	          python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-triton-report.csv --benchmark softmax --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	          python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG

	      # The GEMM variants all write matmul-performance.csv, so each step renames
	      # its result before building reports.
	      - name: Run Triton GEMM kernel benchmark
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          python gemm_benchmark.py --reports $REPORTS
	          mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv

	          source ../../scripts/capture-hw-details.sh
	          python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	          python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG

	      - name: Run Triton GEMM kernel benchmark - default path
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_default') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          # Default path:
	          TRITON_INTEL_ADVANCED_PATH=0 \
	          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
	          IGC_VISAOptions=" -enableBCR -nolocalra" \
	          IGC_DisableLoopUnroll=1 \
	          python gemm_benchmark.py --reports $REPORTS
	          mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-default-path.csv

	          source ../../scripts/capture-hw-details.sh
	          TAG="${TAG}-dflt"
	          python ../../scripts/build_report.py $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

	      - name: Run Triton GEMM kernel benchmark - advanced path
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_advanced') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          # Advanced path:
	          TRITON_INTEL_ADVANCED_PATH=1 \
	          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
	          IGC_VISAOptions=" -enableBCR -nolocalra" \
	          IGC_DisableLoopUnroll=1 \
	          python gemm_benchmark.py --reports $REPORTS
	          mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-adv-path.csv

	          source ../../scripts/capture-hw-details.sh
	          TAG="${TAG}-adv"
	          python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

	      - name: Run Triton GEMM (A@B^t) kernel benchmark
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS
	          mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
	          source ../../scripts/capture-hw-details.sh

	          python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-triton-report.csv --benchmark gemm-bt --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	          python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG

	      - name: Run Triton GEMM (A^t@B) kernel benchmark
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS
	          mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
	          source ../../scripts/capture-hw-details.sh

	          python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-triton-report.csv --benchmark gemm-at --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	          python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG

	      - name: Run Triton GEMM (stream-k) kernel benchmark
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          python gemm_streamk_benchmark.py --reports $REPORTS
	          source ../../scripts/capture-hw-details.sh
	          python ../../scripts/build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

	      - name: Run Triton GEMM (split-k) kernel benchmark
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          python gemm_splitk_benchmark.py --reports $REPORTS
	          source ../../scripts/capture-hw-details.sh
	          python ../../scripts/build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

	      - name: Run Triton GEMM + PreOp (exp) kernel benchmark
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          python gemm_preop_exp_benchmark.py --reports $REPORTS
	          source ../../scripts/capture-hw-details.sh
	          python ../../scripts/build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

	      - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          python gemm_postop_gelu_benchmark.py --reports $REPORTS
	          source ../../scripts/capture-hw-details.sh
	          python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

	      - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark.py') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          python gemm_postop_addmatrix_benchmark.py --reports $REPORTS
	          source ../../scripts/capture-hw-details.sh
	          python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-addmatrix.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

	      - name: Run Triton FA kernel benchmark
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          python flash_attention_fwd_benchmark.py --reports $REPORTS

	          source ../../scripts/capture-hw-details.sh
	          python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
	          python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG

	      - name: Run Triton FA kernel benchmark - default path
	        # The guard must read `inputs.skip_benchmarks` (plural), the declared
	        # workflow input; an undeclared name would make the skip list always
	        # empty, so this step could never be skipped.
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py_default') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          TRITON_INTEL_ADVANCED_PATH=0 \
	          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
	          IGC_VISAOptions=" -enableBCR" \
	          python flash_attention_fwd_benchmark.py --reports $REPORTS

	          TAG="${TAG}-dflt"
	          source ../../scripts/capture-hw-details.sh
	          python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

	      - name: Run Triton FA kernel benchmark - advanced path
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py_advanced') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          TRITON_INTEL_ADVANCED_PATH=1 \
	          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
	          IGC_VISAOptions=" -enableBCR" \
	          python flash_attention_fwd_benchmark.py --reports $REPORTS

	          TAG="${TAG}-adv"
	          source ../../scripts/capture-hw-details.sh
	          python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

	      - name: Run Prefix Sums kernel benchmark
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
	        run: |
	          cd benchmarks/triton_kernels_benchmark
	          python prefix_sums.py --reports $REPORTS
	          source ../../scripts/capture-hw-details.sh
	          python ../../scripts/build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

	      - name: Run micro benchmark
	        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
	        run: |
	          cd benchmarks/micro_benchmarks
	          python run_benchmarks.py --reports $REPORTS

	      # Only saves when the load step reported a cache miss.
	      - name: Save pip cache
	        if: ${{ steps.pip-cache.outputs.status == 'miss' }}
	        uses: ./.github/actions/save
	        with:
	          path: ${{ steps.pip-cache.outputs.path }}
	          dest: ${{ steps.pip-cache.outputs.dest }}

	      # Uploads whatever reports exist, even if some benchmark steps failed.
	      - name: Upload benchmark reports
	        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
	        uses: actions/upload-artifact@v4
	        with:
	          name: benchmark-reports
	          path: reports

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Triton benchmarks #908

Workflow file

Triton benchmarks #908

Jobs

Run details

Workflow file for this run