From 3c21be026a2595c57db5d705babe877e2e7d6e19 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Fri, 22 Nov 2024 01:12:42 +0000
Subject: [PATCH 01/38] Add benchmark using sglang server, Add sgl server
 benchmark to workflow file, Restructure `app_tests/benchmark_tests`

---
 .github/workflows/ci-sglang-benchmark.yml     | 113 ++++++++++++++++--
 .../llm/sglang_benchmarks/__init__.py         |   5 +
 .../llm/{ => sglang_benchmarks}/conftest.py   |  18 ++-
 .../sglang_benchmark_test.py                  |  66 ++++++++++
 .../shortfin_benchmark_test.py}               |  21 ++--
 .../llm/{ => sglang_benchmarks}/utils.py      |  13 ++
 app_tests/integration_tests/llm/utils.py      |   2 +-
 7 files changed, 210 insertions(+), 28 deletions(-)
 create mode 100644 app_tests/benchmark_tests/llm/sglang_benchmarks/__init__.py
 rename app_tests/benchmark_tests/llm/{ => sglang_benchmarks}/conftest.py (77%)
 create mode 100644 app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
 rename app_tests/benchmark_tests/llm/{sglang_benchmark_test.py => sglang_benchmarks/shortfin_benchmark_test.py} (87%)
 rename app_tests/benchmark_tests/llm/{ => sglang_benchmarks}/utils.py (84%)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 504e7e5e3..c05d651fd 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -7,10 +7,14 @@
 name: SGLang Llama Benchmarking Tests
 
 on:
+  # TODO: Remove PR trigger after verification
+  pull_request:
   workflow_dispatch:
   schedule:
-    # Weekdays at 4:00 AM UTC = 9:00 PM PST.
-    - cron: "0 4 * * 1-5"
+    # Weekdays at 6:00 AM UTC = 11:00 PM PST.
+    # This is a pretty GPU intensive test, so want to avoid conflicting
+    # with other potentially scheduled tests.
+    - cron: "0 6 * * 1-5"
 
 concurrency:
   # A PR number if a pull request and otherwise the commit hash. This cancels
@@ -21,9 +25,9 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  sglang_bench_serve:
+  benchmark_shortfin:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
-    name: "SGLang Serving Benchmark Tests"
+    name: "SGLang Serving Benchmark With Shortfin"
     strategy:
       matrix:
         version: [3.11]
@@ -77,13 +81,98 @@ jobs:
       - name: Install SGLang
         run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
 
-      - name: Launch Shortfin Server
-        run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmark_test.py --log-cli-level=INFO --html=out/llm/sglang/index.html
+      - name: Run Shortfin Benchmark Tests
+        run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py --log-cli-level=INFO --html=out/llm/shortfin/index.html
 
-      - name: Deploy to GitHub Pages
-        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
+      # TODO: Uncomment after verification
+      # - name: Deploy to GitHub Pages
+      #   uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
+      #   with:
+      #     github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+      #     publish_dir: ./out/llm/sgl_benchmark/shortfin
+      #     destination_dir: ./llm/sgl_benchmark/shortfin
+      #     keep_files: true
+
+  benchmark_sglang:
+    if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
+    name: "SGLang Serving Benchmark With SGLang"
+    needs: benchmark_shortfin
+    strategy:
+      matrix:
+        version: [3.11]
+      fail-fast: false
+    runs-on: llama-mi300x-3
+    defaults:
+      run:
+        shell: bash
+    env:
+      PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
+    steps:
+      - name: Get Current Date
+        id: date
+        run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
+
+      - name: "Setting up Python"
+        id: setup_python
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
         with:
-          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
-          publish_dir: ./out/llm/sglang
-          destination_dir: ./llm/sglang
-          keep_files: true
+          python-version: ${{matrix.version}}
+
+      - name: Install SGLang
+        run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      # Instruction for SGLang image sourced from here:
+      #   https://sgl-project.github.io/start/install.html#method-3-using-docker
+      # We have to run in a docker container due to their vLLM dependency.
+      # From their pyproject.toml:
+      #   HIP (Heterogeneous-computing Interface for Portability) for AMD
+      #   => base docker rocm/vllm-dev:20241022, not from public vllm whl
+      #   srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
+      - name: Pull SGLang Image (Had issues with sglang:v0.3.5.post1-rocm620)
+        run: |
+          docker pull lmsysorg/sglang:v0.3.5.post1-rocm620
+
+      - name: Run SGLang Server
+        run: |
+          docker run -d --rm  \
+            --device=/dev/kfd \
+            --device=/dev/dri \
+            --ipc=host \
+            --shm-size 16G \
+            --group-add video \
+            --cap-add=SYS_PTRACE \
+            --security-opt seccomp=unconfined \
+            -v $HOME/dockerx:/dockerx \
+            -v /data:/data \
+            -p 30000:30000 \
+            -v ~/.cache/huggingface:/root/.cache/huggingface \
+            --env "HF_TOKEN={{ secrets.HF_TOKEN }}" \
+            lmsysorg/sglang:v0.3.5.post1-rocm620 \
+            python3 -m sglang.launch_server \
+            --model-path meta-llama/Llama-3.1-8b \
+            --host 0.0.0.0 \
+            --port 30000 \
+            --tp 1 \
+            --dtype float16
+
+      - name: Run SGLang Benchmark Tests
+        run: |
+          pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py --port 30000 --log-cli-level=INFO --html=out/llm/sglang/index.html
+
+      - name: Stop sglang-server
+        run: docker stop sglang-server || true # Stop container if it's running
+
+      - name: Cleanup SGLang Image
+        run: docker image rm lmsysorg/sglang:v0.3.5.post1-rocm620
+
+      # TODO: Uncomment after verifying
+      # - name: Deploy to GitHub Pages
+      #   uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
+      #   with:
+      #     github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+      #     publish_dir: ./out/llm/sgl_benchmark/sglang
+      #     destination_dir: ./llm/sgl_benchmark/sglang
+      #     keep_files: true
diff --git a/app_tests/benchmark_tests/llm/sglang_benchmarks/__init__.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/__init__.py
new file mode 100644
index 000000000..a85ba359d
--- /dev/null
+++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/__init__.py
@@ -0,0 +1,5 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/app_tests/benchmark_tests/llm/conftest.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py
similarity index 77%
rename from app_tests/benchmark_tests/llm/conftest.py
rename to app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py
index cc354b7eb..eceaec857 100644
--- a/app_tests/benchmark_tests/llm/conftest.py
+++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py
@@ -9,7 +9,9 @@
 import pytest
 import sys
 
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+)
 from integration_tests.llm.utils import compile_model, export_paged_llm_v1
 
 
@@ -44,3 +46,17 @@ def pre_process_model(request, tmp_path_factory):
     compile_model(mlir_path, vmfb_path, settings)
 
     return tmp_dir
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--port",
+        action="store",
+        default="30000",
+        help="Port that SGLang server is running on",
+    )
+
+
+@pytest.fixture(scope="module")
+def sglang_args(request):
+    return request.config.getoption("--port")
diff --git a/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
new file mode 100644
index 000000000..fe6b28cee
--- /dev/null
+++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import logging
+from pathlib import Path
+import pytest
+import time
+from unittest.mock import patch
+
+pytest.importorskip("sglang")
+from sglang import bench_serving
+
+from .utils import SGLangBenchmarkArgs, log_jsonl_result
+
+from integration_tests.llm.utils import wait_for_server
+
+logger = logging.getLogger(__name__)
+
+TOKENIZER_DIR = Path("/data/llama3.1/8b/")
+
+
+@pytest.mark.parametrize(
+    "request_rate",
+    [1, 2, 4, 8, 16, 32],
+)
+def test_sglang_benchmark(request_rate, sglang_args, tmp_path_factory):
+    tmp_dir = tmp_path_factory.mktemp("sglang_benchmark_test")
+    logger.info("Beginning SGLang benchmark test...")
+
+    port = sglang_args
+    base_url = f"http://localhost:{port}"
+
+    # Setting a high timeout gives enough time for downloading model artifacts
+    # and starting up server... Takes a little longer than shortfin.
+    wait_for_server(base_url, timeout=600)
+
+    benchmark_args = SGLangBenchmarkArgs(
+        backend="sglang",
+        num_prompt=10,
+        base_url=f"http://localhost:{port}",
+        tokenizer=TOKENIZER_DIR,
+        request_rate=request_rate,
+    )
+    output_file = (
+        tmp_dir
+        / f"{benchmark_args.backend}_{benchmark_args.num_prompt}_{benchmark_args.request_rate}.jsonl"
+    )
+    benchmark_args.output_file = output_file
+
+    logger.info("Running SGLang Benchmark with the following args:")
+    logger.info(benchmark_args)
+
+    try:
+        start = time.time()
+        with patch.object(bench_serving, "print", side_effect=logger.info):
+            bench_serving.run_benchmark(
+                benchmark_args.as_namespace(),
+            )
+        logger.info(f"Benchmark run completed in {str(time.time() - start)} seconds")
+        logger.info("======== RESULTS ========")
+        log_jsonl_result(benchmark_args.output_file)
+    except Exception as e:
+        logger.error(e)
diff --git a/app_tests/benchmark_tests/llm/sglang_benchmark_test.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py
similarity index 87%
rename from app_tests/benchmark_tests/llm/sglang_benchmark_test.py
rename to app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py
index 0de775795..e9750fa5a 100644
--- a/app_tests/benchmark_tests/llm/sglang_benchmark_test.py
+++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py
@@ -4,7 +4,6 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import json
 import logging
 import multiprocessing
 import os
@@ -16,14 +15,17 @@
 pytest.importorskip("sglang")
 from sglang import bench_serving
 
-from utils import SGLangBenchmarkArgs
+from app_tests.benchmark_tests.llm.sglang_benchmarks.utils import (
+    SGLangBenchmarkArgs,
+    log_jsonl_result,
+)
 
 from integration_tests.llm.utils import (
     find_available_port,
     start_llm_server,
 )
 
-logger = logging.getLogger("__name__")
+logger = logging.getLogger(__name__)
 
 device_settings = {
     "device_flags": [
@@ -38,15 +40,6 @@
 TOKENIZER_DIR = Path("/data/llama3.1/8b/")
 
 
-def log_jsonl_result(file_path):
-    with open(file_path, "r") as file:
-        json_string = file.readline().strip()
-
-    json_data = json.loads(json_string)
-    for key, val in json_data.items():
-        logger.info(f"{key.upper()}: {val}")
-
-
 @pytest.mark.parametrize(
     "request_rate",
     [1, 2, 4, 8, 16, 32],
@@ -64,7 +57,7 @@ def log_jsonl_result(file_path):
     ],
     indirect=True,
 )
-def test_sglang_benchmark_server(request_rate, pre_process_model):
+def test_shortfin_benchmark(request_rate, pre_process_model):
     # TODO: Remove when multi-device is fixed
     os.environ["ROCR_VISIBLE_DEVICES"] = "1"
 
@@ -116,7 +109,7 @@ def test_sglang_benchmark_server(request_rate, pre_process_model):
         logger.info("======== RESULTS ========")
         log_jsonl_result(benchmark_args.output_file)
     except Exception as e:
-        logger.info(e)
+        logger.error(e)
 
     server_process.terminate()
     server_process.wait()
diff --git a/app_tests/benchmark_tests/llm/utils.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/utils.py
similarity index 84%
rename from app_tests/benchmark_tests/llm/utils.py
rename to app_tests/benchmark_tests/llm/sglang_benchmarks/utils.py
index 55b01da04..47cea4d76 100644
--- a/app_tests/benchmark_tests/llm/utils.py
+++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/utils.py
@@ -6,8 +6,12 @@
 
 from argparse import Namespace
 from dataclasses import dataclass
+import json
+import logging
 from pathlib import Path
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class SGLangBenchmarkArgs:
@@ -54,3 +58,12 @@ def __repr__(self):
             f"Tokenizer: {self.tokenizer}\n"
             f"Request Rate: {self.request_rate}"
         )
+
+
+def log_jsonl_result(file_path):
+    with open(file_path, "r") as file:
+        json_string = file.readline().strip()
+
+    json_data = json.loads(json_string)
+    for key, val in json_data.items():
+        logger.info(f"{key.upper()}: {val}")
diff --git a/app_tests/integration_tests/llm/utils.py b/app_tests/integration_tests/llm/utils.py
index 05712039e..80b5b3c09 100644
--- a/app_tests/integration_tests/llm/utils.py
+++ b/app_tests/integration_tests/llm/utils.py
@@ -15,7 +15,7 @@
 import requests
 from transformers import AutoTokenizer
 
-logger = logging.getLogger("__name__")
+logger = logging.getLogger(__name__)
 
 
 class AccuracyValidationException(RuntimeError):

From 31398a5ce6f0355e5e40abd74e2d046aa8f71b87 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Fri, 22 Nov 2024 14:56:26 +0000
Subject: [PATCH 02/38] Fix import path in `shortfin_benchmark_test`,
 Temporarily comment out shortfin job to verify sglang benchmark job

---
 .github/workflows/ci-sglang-benchmark.yml     | 120 +++++++++---------
 .../shortfin_benchmark_test.py                |   2 +-
 2 files changed, 62 insertions(+), 60 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index c05d651fd..5d2125952 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -25,64 +25,65 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  benchmark_shortfin:
-    if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
-    name: "SGLang Serving Benchmark With Shortfin"
-    strategy:
-      matrix:
-        version: [3.11]
-      fail-fast: false
-    runs-on: llama-mi300x-3
-    defaults:
-      run:
-        shell: bash
-    env:
-      PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
-    steps:
-      - name: Get Current Date
-        id: date
-        run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
-
-      - name: "Setting up Python"
-        id: setup_python
-        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-        with:
-          python-version: ${{matrix.version}}
-
-      - name: "Checkout Code"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Cache Pip Packages
-        uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
-        id: cache-pip
-        with:
-          path: ${{ env.PIP_CACHE_DIR }}
-          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
-
-      - name: Install pip deps
-        run: |
-          python -m pip install --no-compile --upgrade pip
-          # Note: We install in three steps in order to satisfy requirements
-          # from non default locations first. Installing the PyTorch CPU
-          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
-          pip install --no-compile -r pytorch-cpu-requirements.txt
-          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
-            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
-          pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
-
-          # Try with the latest nightly releases, not what iree-turbine pins.
-          # We could also pin to a known working or stable version.
-          # This should eventually stabilize. Do the best we can for now.
-          pip install -f https://iree.dev/pip-release-links.html --upgrade \
-            iree-base-compiler==3.0.0rc20241118 \
-            iree-base-runtime==3.0.0rc20241118 \
-            "numpy<2.0"
-
-      - name: Install SGLang
-        run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
-
-      - name: Run Shortfin Benchmark Tests
-        run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py --log-cli-level=INFO --html=out/llm/shortfin/index.html
+  # TODO: Uncomment after verification
+  # benchmark_shortfin:
+  #   if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
+  #   name: "SGLang Serving Benchmark With Shortfin"
+  #   strategy:
+  #     matrix:
+  #       version: [3.11]
+  #     fail-fast: false
+  #   runs-on: llama-mi300x-3
+  #   defaults:
+  #     run:
+  #       shell: bash
+  #   env:
+  #     PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
+  #   steps:
+  #     - name: Get Current Date
+  #       id: date
+  #       run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
+
+  #     - name: "Setting up Python"
+  #       id: setup_python
+  #       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+  #       with:
+  #         python-version: ${{matrix.version}}
+
+  #     - name: "Checkout Code"
+  #       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+  #     - name: Cache Pip Packages
+  #       uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
+  #       id: cache-pip
+  #       with:
+  #         path: ${{ env.PIP_CACHE_DIR }}
+  #         key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
+
+  #     - name: Install pip deps
+  #       run: |
+  #         python -m pip install --no-compile --upgrade pip
+  #         # Note: We install in three steps in order to satisfy requirements
+  #         # from non default locations first. Installing the PyTorch CPU
+  #         # wheels saves multiple minutes and a lot of bandwidth on runner setup.
+  #         pip install --no-compile -r pytorch-cpu-requirements.txt
+  #         pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
+  #           -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
+  #         pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
+
+  #         # Try with the latest nightly releases, not what iree-turbine pins.
+  #         # We could also pin to a known working or stable version.
+  #         # This should eventually stabilize. Do the best we can for now.
+  #         pip install -f https://iree.dev/pip-release-links.html --upgrade \
+  #           iree-base-compiler==3.0.0rc20241118 \
+  #           iree-base-runtime==3.0.0rc20241118 \
+  #           "numpy<2.0"
+
+  #     - name: Install SGLang
+  #       run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
+
+  #     - name: Run Shortfin Benchmark Tests
+  #       run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py --log-cli-level=INFO --html=out/llm/shortfin/index.html
 
       # TODO: Uncomment after verification
       # - name: Deploy to GitHub Pages
@@ -96,7 +97,8 @@ jobs:
   benchmark_sglang:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
     name: "SGLang Serving Benchmark With SGLang"
-    needs: benchmark_shortfin
+    # TODO: Uncomment after verifying
+    # needs: benchmark_shortfin
     strategy:
       matrix:
         version: [3.11]
diff --git a/app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py
index e9750fa5a..0c49642c8 100644
--- a/app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py
+++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py
@@ -15,7 +15,7 @@
 pytest.importorskip("sglang")
 from sglang import bench_serving
 
-from app_tests.benchmark_tests.llm.sglang_benchmarks.utils import (
+from .utils import (
     SGLangBenchmarkArgs,
     log_jsonl_result,
 )

From fc7828484b4c0db1b1244cb4189144faa237c9d3 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Fri, 22 Nov 2024 22:15:24 +0000
Subject: [PATCH 03/38] Change `ci-sglang-benchmark/integration` to use
 `mi300x-4`, Update benchmark tests to download model on demand

---
 .github/workflows/ci-sglang-benchmark.yml     |  4 ++--
 .../workflows/ci-sglang-integration-tests.yml |  2 +-
 .../llm/sglang_benchmarks/conftest.py         | 12 ++++++++--
 .../sglang_benchmark_test.py                  | 16 +++++++------
 .../shortfin_benchmark_test.py                | 23 ++++++++++---------
 5 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 5d2125952..265e9edae 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -33,7 +33,7 @@ jobs:
   #     matrix:
   #       version: [3.11]
   #     fail-fast: false
-  #   runs-on: llama-mi300x-3
+  #   runs-on: llama-mi300x-4
   #   defaults:
   #     run:
   #       shell: bash
@@ -103,7 +103,7 @@ jobs:
       matrix:
         version: [3.11]
       fail-fast: false
-    runs-on: llama-mi300x-3
+    runs-on: llama-mi300x-4
     defaults:
       run:
         shell: bash
diff --git a/.github/workflows/ci-sglang-integration-tests.yml b/.github/workflows/ci-sglang-integration-tests.yml
index 1c382617d..7d51da0eb 100644
--- a/.github/workflows/ci-sglang-integration-tests.yml
+++ b/.github/workflows/ci-sglang-integration-tests.yml
@@ -29,7 +29,7 @@ jobs:
       matrix:
         version: [3.11]
       fail-fast: false
-    runs-on: llama-mi300x-3
+    runs-on: llama-mi300x-4
     defaults:
       run:
         shell: bash
diff --git a/app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py
index eceaec857..95d628bf1 100644
--- a/app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py
+++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py
@@ -12,14 +12,19 @@
 sys.path.append(
     os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
 )
-from integration_tests.llm.utils import compile_model, export_paged_llm_v1
+from integration_tests.llm.utils import (
+    compile_model,
+    export_paged_llm_v1,
+    download_with_hf_datasets,
+)
 
 
 @pytest.fixture(scope="module")
 def pre_process_model(request, tmp_path_factory):
     tmp_dir = tmp_path_factory.mktemp("sglang_benchmark_test")
 
-    model_path = request.param["model_path"]
+    model_name = request.param["model_name"]
+    model_param_file_name = request.param["model_param_file_name"]
     settings = request.param["settings"]
     batch_sizes = request.param["batch_sizes"]
 
@@ -27,6 +32,9 @@ def pre_process_model(request, tmp_path_factory):
     config_path = tmp_dir / "config.json"
     vmfb_path = tmp_dir / "model.vmfb"
 
+    model_path = tmp_dir / model_param_file_name
+    download_with_hf_datasets(tmp_dir, model_name)
+
     export_paged_llm_v1(mlir_path, config_path, model_path, batch_sizes)
 
     config = {
diff --git a/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
index fe6b28cee..43f13abb2 100644
--- a/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
+++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
@@ -15,19 +15,21 @@
 
 from .utils import SGLangBenchmarkArgs, log_jsonl_result
 
-from integration_tests.llm.utils import wait_for_server
+from integration_tests.llm.utils import wait_for_server, download_with_hf_datasets
 
 logger = logging.getLogger(__name__)
 
-TOKENIZER_DIR = Path("/data/llama3.1/8b/")
-
 
 @pytest.mark.parametrize(
-    "request_rate",
-    [1, 2, 4, 8, 16, 32],
+    "request_rate,model_name",
+    [(req_rate, "llama3_8b_f16") for req_rate in [1, 2, 4, 8, 16, 32]],
 )
-def test_sglang_benchmark(request_rate, sglang_args, tmp_path_factory):
+def test_sglang_benchmark(request_rate, model_name, sglang_args, tmp_path_factory):
     tmp_dir = tmp_path_factory.mktemp("sglang_benchmark_test")
+
+    # Download tokenizer for llama3_8b_f16
+    download_with_hf_datasets(tmp_dir, model_name)
+
     logger.info("Beginning SGLang benchmark test...")
 
     port = sglang_args
@@ -41,7 +43,7 @@ def test_sglang_benchmark(request_rate, sglang_args, tmp_path_factory):
         backend="sglang",
         num_prompt=10,
         base_url=f"http://localhost:{port}",
-        tokenizer=TOKENIZER_DIR,
+        tokenizer=tmp_dir,
         request_rate=request_rate,
     )
     output_file = (
diff --git a/app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py
index 0c49642c8..33c21b104 100644
--- a/app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py
+++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py
@@ -35,21 +35,21 @@
     "device": "hip",
 }
 
-# TODO: Download on demand instead of assuming files exist at this path
-MODEL_PATH = Path("/data/llama3.1/8b/llama8b_f16.irpa")
-TOKENIZER_DIR = Path("/data/llama3.1/8b/")
-
 
 @pytest.mark.parametrize(
-    "request_rate",
-    [1, 2, 4, 8, 16, 32],
+    "request_rate,model_param_file_name",
+    [
+        (req_rate, "meta-llama-3.1-8b-instruct.f16.gguf")
+        for req_rate in [1, 2, 4, 8, 16, 32]
+    ],
 )
 @pytest.mark.parametrize(
     "pre_process_model",
     [
         (
             {
-                "model_path": MODEL_PATH,
+                "model_name": "llama3_8B_fp16",
+                "model_param_file_name": "meta-llama-3.1-8b-instruct.f16.gguf",
                 "settings": device_settings,
                 "batch_sizes": [1, 4],
             }
@@ -57,7 +57,7 @@
     ],
     indirect=True,
 )
-def test_shortfin_benchmark(request_rate, pre_process_model):
+def test_shortfin_benchmark(request_rate, model_param_file_name, pre_process_model):
     # TODO: Remove when multi-device is fixed
     os.environ["ROCR_VISIBLE_DEVICES"] = "1"
 
@@ -65,7 +65,8 @@ def test_shortfin_benchmark(request_rate, pre_process_model):
 
     config_path = tmp_dir / "config.json"
     vmfb_path = tmp_dir / "model.vmfb"
-    tokenizer_path = TOKENIZER_DIR / "tokenizer.json"
+    tokenizer_path = tmp_dir / "tokenizer.json"
+    model_path = tmp_dir / model_param_file_name
 
     # Start shortfin llm server
     port = find_available_port()
@@ -74,7 +75,7 @@ def test_shortfin_benchmark(request_rate, pre_process_model):
         tokenizer_path,
         config_path,
         vmfb_path,
-        MODEL_PATH,
+        model_path,
         device_settings,
         timeout=30,
     )
@@ -84,7 +85,7 @@ def test_shortfin_benchmark(request_rate, pre_process_model):
         backend="shortfin",
         num_prompt=10,
         base_url=f"http://localhost:{port}",
-        tokenizer=TOKENIZER_DIR,
+        tokenizer=tmp_dir,
         request_rate=request_rate,
     )
     output_file = (

From 0909e8f1f4893711a964fb903f5ad71f14947978 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Fri, 22 Nov 2024 22:21:44 +0000
Subject: [PATCH 04/38] Fix github runner label

---
 .github/workflows/ci-sglang-benchmark.yml         | 6 +++---
 .github/workflows/ci-sglang-integration-tests.yml | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 265e9edae..a115702fa 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -14,7 +14,7 @@ on:
     # Weekdays at 6:00 AM UTC = 11:00 PM PST.
     # This is a pretty GPU intensive test, so want to avoid conflicting
     # with other potentially scheduled tests.
-    - cron: "0 6 * * 1-5"
+    - cron: "0 4 * * 1-5"
 
 concurrency:
   # A PR number if a pull request and otherwise the commit hash. This cancels
@@ -33,7 +33,7 @@ jobs:
   #     matrix:
   #       version: [3.11]
   #     fail-fast: false
-  #   runs-on: llama-mi300x-4
+  #   runs-on: mi300x-4
   #   defaults:
   #     run:
   #       shell: bash
@@ -103,7 +103,7 @@ jobs:
       matrix:
         version: [3.11]
       fail-fast: false
-    runs-on: llama-mi300x-4
+    runs-on: mi300x-4
     defaults:
       run:
         shell: bash
diff --git a/.github/workflows/ci-sglang-integration-tests.yml b/.github/workflows/ci-sglang-integration-tests.yml
index 7d51da0eb..c61756d78 100644
--- a/.github/workflows/ci-sglang-integration-tests.yml
+++ b/.github/workflows/ci-sglang-integration-tests.yml
@@ -29,7 +29,7 @@ jobs:
       matrix:
         version: [3.11]
       fail-fast: false
-    runs-on: llama-mi300x-4
+    runs-on: mi300x-4
     defaults:
       run:
         shell: bash

From d7cc53925c8c59e9fcf0b1f5acc027a403b6b0aa Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Fri, 22 Nov 2024 22:39:19 +0000
Subject: [PATCH 05/38] Add installation steps, since test does require some
 functionality from shortfin/sharktank

---
 .github/workflows/ci-sglang-benchmark.yml | 29 +++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index a115702fa..975c4c127 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -120,6 +120,35 @@ jobs:
         with:
           python-version: ${{matrix.version}}
 
+      - name: "Checkout Code"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Cache Pip Packages
+        uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
+        id: cache-pip
+        with:
+          path: ${{ env.PIP_CACHE_DIR }}
+          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
+
+      - name: Install pip deps
+        run: |
+          python -m pip install --no-compile --upgrade pip
+          # Note: We install in three steps in order to satisfy requirements
+          # from non default locations first. Installing the PyTorch CPU
+          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
+          pip install --no-compile -r pytorch-cpu-requirements.txt
+          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
+            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
+          pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
+
+          # Try with the latest nightly releases, not what iree-turbine pins.
+          # We could also pin to a known working or stable version.
+          # This should eventually stabilize. Do the best we can for now.
+          pip install -f https://iree.dev/pip-release-links.html --upgrade \
+            iree-base-compiler==3.0.0rc20241118 \
+            iree-base-runtime==3.0.0rc20241118 \
+            "numpy<2.0"
+
       - name: Install SGLang
         run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
 

From cf16e541a7969b7c08c223c91f64929b959eb555 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Fri, 22 Nov 2024 22:51:34 +0000
Subject: [PATCH 06/38] Fix typo in model names

---
 .github/workflows/ci-sglang-benchmark.yml                     | 2 +-
 .../llm/sglang_benchmarks/sglang_benchmark_test.py            | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 975c4c127..ad511f0be 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -183,7 +183,7 @@ jobs:
             --env "HF_TOKEN={{ secrets.HF_TOKEN }}" \
             lmsysorg/sglang:v0.3.5.post1-rocm620 \
             python3 -m sglang.launch_server \
-            --model-path meta-llama/Llama-3.1-8b \
+            --model-path meta-llama/Llama-3.1-8B-Instruct \
             --host 0.0.0.0 \
             --port 30000 \
             --tp 1 \
diff --git a/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
index 43f13abb2..b4006c4e6 100644
--- a/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
+++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
@@ -22,12 +22,12 @@
 
 @pytest.mark.parametrize(
     "request_rate,model_name",
-    [(req_rate, "llama3_8b_f16") for req_rate in [1, 2, 4, 8, 16, 32]],
+    [(req_rate, "llama3_8B_fp16") for req_rate in [1, 2, 4, 8, 16, 32]],
 )
 def test_sglang_benchmark(request_rate, model_name, sglang_args, tmp_path_factory):
     tmp_dir = tmp_path_factory.mktemp("sglang_benchmark_test")
 
-    # Download tokenizer for llama3_8b_f16
+    # Download tokenizer for llama3_8B_fp16
     download_with_hf_datasets(tmp_dir, model_name)
 
     logger.info("Beginning SGLang benchmark test...")

From 86058b8f7190b1d18e4b3a172013ec9fd5d2949c Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Fri, 22 Nov 2024 23:23:36 +0000
Subject: [PATCH 07/38] Add container name, Add disable-cuda-graph option to
 allow server to properly run

---
 .github/workflows/ci-sglang-benchmark.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index ad511f0be..8ed2eb631 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -169,6 +169,7 @@ jobs:
       - name: Run SGLang Server
         run: |
           docker run -d --rm  \
+            --name=sglang-server \
             --device=/dev/kfd \
             --device=/dev/dri \
             --ipc=host \
@@ -187,7 +188,8 @@ jobs:
             --host 0.0.0.0 \
             --port 30000 \
             --tp 1 \
-            --dtype float16
+            --dtype float16 \
+            --disable-cuda-graph
 
       - name: Run SGLang Benchmark Tests
         run: |

From acbedb0eba6d677b65b289077dc4adda2c5aa32e Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Fri, 22 Nov 2024 23:30:31 +0000
Subject: [PATCH 08/38] Temporarily remove `--rm` to try and obtain container
 logs after failure

---
 .github/workflows/ci-sglang-benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 8ed2eb631..a9f4877e3 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -168,7 +168,7 @@ jobs:
 
       - name: Run SGLang Server
         run: |
-          docker run -d --rm  \
+          docker run -d  \
             --name=sglang-server \
             --device=/dev/kfd \
             --device=/dev/dri \

From 34c8410bb5f926e04ae73994bad8db975881b063 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Fri, 22 Nov 2024 23:54:35 +0000
Subject: [PATCH 09/38] Remove quotes around HF_TOKEN

---
 .github/workflows/ci-sglang-benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index a9f4877e3..0310fd14e 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -181,7 +181,7 @@ jobs:
             -v /data:/data \
             -p 30000:30000 \
             -v ~/.cache/huggingface:/root/.cache/huggingface \
-            --env "HF_TOKEN={{ secrets.HF_TOKEN }}" \
+            --env HF_TOKEN={{ secrets.HF_TOKEN }} \
             lmsysorg/sglang:v0.3.5.post1-rocm620 \
             python3 -m sglang.launch_server \
             --model-path meta-llama/Llama-3.1-8B-Instruct \

From 0d5574d519df20e879b0b85967ad335d9a781882 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Sat, 23 Nov 2024 00:28:55 +0000
Subject: [PATCH 10/38] Try using env var for HF_TOKEN

---
 .github/workflows/ci-sglang-benchmark.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 0310fd14e..fa51b655b 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -110,6 +110,9 @@ jobs:
     env:
       PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
     steps:
+      - name: Set HF_TOKEN
+        run: echo "HF_TOKEN=${{ secrets.HF_TOKEN }}" >> $GITHUB_ENV
+
       - name: Get Current Date
         id: date
         run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
@@ -181,7 +184,7 @@ jobs:
             -v /data:/data \
             -p 30000:30000 \
             -v ~/.cache/huggingface:/root/.cache/huggingface \
-            --env HF_TOKEN={{ secrets.HF_TOKEN }} \
+            --env HF_TOKEN=$HF_TOKEN \
             lmsysorg/sglang:v0.3.5.post1-rocm620 \
             python3 -m sglang.launch_server \
             --model-path meta-llama/Llama-3.1-8B-Instruct \

From c9f4d338abf9bc0c655590b5256ad15a92579642 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Mon, 25 Nov 2024 13:02:48 +0000
Subject: [PATCH 11/38] Move secrets.HF_TOKEN back to command

---
 .github/workflows/ci-sglang-benchmark.yml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index fa51b655b..47eabc01d 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -110,9 +110,6 @@ jobs:
     env:
       PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
     steps:
-      - name: Set HF_TOKEN
-        run: echo "HF_TOKEN=${{ secrets.HF_TOKEN }}" >> $GITHUB_ENV
-
       - name: Get Current Date
         id: date
         run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
@@ -184,7 +181,7 @@ jobs:
             -v /data:/data \
             -p 30000:30000 \
             -v ~/.cache/huggingface:/root/.cache/huggingface \
-            --env HF_TOKEN=$HF_TOKEN \
+            --env HF_TOKEN=${{ secrets.HF_TOKEN }} \
             lmsysorg/sglang:v0.3.5.post1-rocm620 \
             python3 -m sglang.launch_server \
             --model-path meta-llama/Llama-3.1-8B-Instruct \

From 4fa094cbc43c76589fe053ab015c1096ccf43f8d Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Mon, 25 Nov 2024 13:54:19 +0000
Subject: [PATCH 12/38] Add temporary command to see if HF_TOKEN is being set
 properly

---
 .github/workflows/ci-sglang-benchmark.yml | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 47eabc01d..2dc55e61f 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -162,10 +162,21 @@ jobs:
       #   HIP (Heterogeneous-computing Interface for Portability) for AMD
       #   => base docker rocm/vllm-dev:20241022, not from public vllm whl
       #   srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
-      - name: Pull SGLang Image (Had issues with sglang:v0.3.5.post1-rocm620)
+      - name: Pull SGLang Image (Had issues with sglang:v0.3.5.post2-rocm620)
         run: |
           docker pull lmsysorg/sglang:v0.3.5.post1-rocm620
 
+      - name: Check HF_TOKEN
+        run: |
+          if [ -z "$HF_TOKEN" ]; then
+            echo "Error: HF_TOKEN is not set or empty."
+            exit 1
+          else
+            echo "HF_TOKEN is set"
+          fi
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+
       - name: Run SGLang Server
         run: |
           docker run -d  \

From c33ef750995ac60bfe7cd5ed1a23767f7b3d1628 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Mon, 25 Nov 2024 14:53:16 +0000
Subject: [PATCH 13/38] Add back command to rm container once stopped

---
 .github/workflows/ci-sglang-benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 2dc55e61f..4aa36897f 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -179,7 +179,7 @@ jobs:
 
       - name: Run SGLang Server
         run: |
-          docker run -d  \
+          docker run --rm -d  \
             --name=sglang-server \
             --device=/dev/kfd \
             --device=/dev/dri \

From fea26559dc1e0a6cf128a907efa3d46c2f113ce5 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Mon, 25 Nov 2024 15:25:16 +0000
Subject: [PATCH 14/38] Allow for full e2e verification

---
 .github/workflows/ci-sglang-benchmark.yml | 135 ++++++++++------------
 1 file changed, 60 insertions(+), 75 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 4aa36897f..cd266e43c 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -11,9 +11,7 @@ on:
   pull_request:
   workflow_dispatch:
   schedule:
-    # Weekdays at 6:00 AM UTC = 11:00 PM PST.
-    # This is a pretty GPU intensive test, so want to avoid conflicting
-    # with other potentially scheduled tests.
+    # Weekdays at 4:00 AM UTC = 9:00 PM PST.
     - cron: "0 4 * * 1-5"
 
 concurrency:
@@ -25,65 +23,64 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  # TODO: Uncomment after verification
-  # benchmark_shortfin:
-  #   if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
-  #   name: "SGLang Serving Benchmark With Shortfin"
-  #   strategy:
-  #     matrix:
-  #       version: [3.11]
-  #     fail-fast: false
-  #   runs-on: mi300x-4
-  #   defaults:
-  #     run:
-  #       shell: bash
-  #   env:
-  #     PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
-  #   steps:
-  #     - name: Get Current Date
-  #       id: date
-  #       run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
-
-  #     - name: "Setting up Python"
-  #       id: setup_python
-  #       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-  #       with:
-  #         python-version: ${{matrix.version}}
-
-  #     - name: "Checkout Code"
-  #       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-  #     - name: Cache Pip Packages
-  #       uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
-  #       id: cache-pip
-  #       with:
-  #         path: ${{ env.PIP_CACHE_DIR }}
-  #         key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
-
-  #     - name: Install pip deps
-  #       run: |
-  #         python -m pip install --no-compile --upgrade pip
-  #         # Note: We install in three steps in order to satisfy requirements
-  #         # from non default locations first. Installing the PyTorch CPU
-  #         # wheels saves multiple minutes and a lot of bandwidth on runner setup.
-  #         pip install --no-compile -r pytorch-cpu-requirements.txt
-  #         pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
-  #           -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
-  #         pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
-
-  #         # Try with the latest nightly releases, not what iree-turbine pins.
-  #         # We could also pin to a known working or stable version.
-  #         # This should eventually stabilize. Do the best we can for now.
-  #         pip install -f https://iree.dev/pip-release-links.html --upgrade \
-  #           iree-base-compiler==3.0.0rc20241118 \
-  #           iree-base-runtime==3.0.0rc20241118 \
-  #           "numpy<2.0"
-
-  #     - name: Install SGLang
-  #       run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
-
-  #     - name: Run Shortfin Benchmark Tests
-  #       run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py --log-cli-level=INFO --html=out/llm/shortfin/index.html
+  benchmark_shortfin:
+    if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
+    name: "SGLang Serving Benchmark With Shortfin"
+    strategy:
+      matrix:
+        version: [3.11]
+      fail-fast: false
+    runs-on: mi300x-4
+    defaults:
+      run:
+        shell: bash
+    env:
+      PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
+    steps:
+      - name: Get Current Date
+        id: date
+        run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
+
+      - name: "Setting up Python"
+        id: setup_python
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: ${{matrix.version}}
+
+      - name: "Checkout Code"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Cache Pip Packages
+        uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
+        id: cache-pip
+        with:
+          path: ${{ env.PIP_CACHE_DIR }}
+          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
+
+      - name: Install pip deps
+        run: |
+          python -m pip install --no-compile --upgrade pip
+          # Note: We install in three steps in order to satisfy requirements
+          # from non default locations first. Installing the PyTorch CPU
+          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
+          pip install --no-compile -r pytorch-cpu-requirements.txt
+          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
+            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
+          pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
+
+          # Try with the latest nightly releases, not what iree-turbine pins.
+          # We could also pin to a known working or stable version.
+          # This should eventually stabilize. Do the best we can for now.
+          pip install -f https://iree.dev/pip-release-links.html --upgrade \
+            iree-base-compiler==3.0.0rc20241118 \
+            iree-base-runtime==3.0.0rc20241118 \
+            "numpy<2.0"
+
+      - name: Install SGLang
+        run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
+
+      - name: Run Shortfin Benchmark Tests
+        run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py --log-cli-level=INFO --html=out/llm/shortfin/index.html
 
       # TODO: Uncomment after verification
       # - name: Deploy to GitHub Pages
@@ -97,8 +94,7 @@ jobs:
   benchmark_sglang:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
     name: "SGLang Serving Benchmark With SGLang"
-    # TODO: Uncomment after verifying
-    # needs: benchmark_shortfin
+    needs: benchmark_shortfin
     strategy:
       matrix:
         version: [3.11]
@@ -166,17 +162,6 @@ jobs:
         run: |
           docker pull lmsysorg/sglang:v0.3.5.post1-rocm620
 
-      - name: Check HF_TOKEN
-        run: |
-          if [ -z "$HF_TOKEN" ]; then
-            echo "Error: HF_TOKEN is not set or empty."
-            exit 1
-          else
-            echo "HF_TOKEN is set"
-          fi
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-
       - name: Run SGLang Server
         run: |
           docker run --rm -d  \

From 3641445e7e4e2f50caf6a0cb8644b5f75c520e9f Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Mon, 25 Nov 2024 16:47:51 +0000
Subject: [PATCH 15/38] Update hash for pip cache in benchmark and integration
 tests

---
 .github/workflows/ci-sglang-benchmark.yml         | 4 ++--
 .github/workflows/ci-sglang-integration-tests.yml | 2 +-
 .github/workflows/ci-shark-ai.yml                 | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index cd266e43c..f03936699 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -55,7 +55,7 @@ jobs:
         id: cache-pip
         with:
           path: ${{ env.PIP_CACHE_DIR }}
-          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
+          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }}
 
       - name: Install pip deps
         run: |
@@ -124,7 +124,7 @@ jobs:
         id: cache-pip
         with:
           path: ${{ env.PIP_CACHE_DIR }}
-          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
+          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }}
 
       - name: Install pip deps
         run: |
diff --git a/.github/workflows/ci-sglang-integration-tests.yml b/.github/workflows/ci-sglang-integration-tests.yml
index c61756d78..20b829918 100644
--- a/.github/workflows/ci-sglang-integration-tests.yml
+++ b/.github/workflows/ci-sglang-integration-tests.yml
@@ -54,7 +54,7 @@ jobs:
         id: cache-pip
         with:
           path: ${{ env.PIP_CACHE_DIR }}
-          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
+          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }}
 
       - name: Install pip deps
         run: |
diff --git a/.github/workflows/ci-shark-ai.yml b/.github/workflows/ci-shark-ai.yml
index bf8007e65..fc85a76a7 100644
--- a/.github/workflows/ci-shark-ai.yml
+++ b/.github/workflows/ci-shark-ai.yml
@@ -49,7 +49,7 @@ jobs:
         id: cache-pip
         with:
           path: ${{ env.PIP_CACHE_DIR }}
-          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
+          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }}
 
       - name: Install pip deps
         run: |

From d82d9dffe1f69cdf1d063a95f4acbf3663407a69 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Mon, 25 Nov 2024 21:38:54 +0000
Subject: [PATCH 16/38] Remove version pinning for `iree-base-compiler` and
 `iree-base-runtime`

---
 .github/workflows/ci-sglang-benchmark.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index f03936699..5b173baa5 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -72,8 +72,8 @@ jobs:
           # We could also pin to a known working or stable version.
           # This should eventually stabilize. Do the best we can for now.
           pip install -f https://iree.dev/pip-release-links.html --upgrade \
-            iree-base-compiler==3.0.0rc20241118 \
-            iree-base-runtime==3.0.0rc20241118 \
+            iree-base-compiler \
+            iree-base-runtime \
             "numpy<2.0"
 
       - name: Install SGLang
@@ -141,8 +141,8 @@ jobs:
           # We could also pin to a known working or stable version.
           # This should eventually stabilize. Do the best we can for now.
           pip install -f https://iree.dev/pip-release-links.html --upgrade \
-            iree-base-compiler==3.0.0rc20241118 \
-            iree-base-runtime==3.0.0rc20241118 \
+            iree-base-compiler \
+            iree-base-runtime \
             "numpy<2.0"
 
       - name: Install SGLang

From e8432819a1c6b808ffff6e6fd5a7359ad61228a2 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Tue, 26 Nov 2024 11:42:02 +0000
Subject: [PATCH 17/38] Add `--pre` to iree installations in SGLang tests

---
 .github/workflows/ci-sglang-benchmark.yml         | 4 ++--
 .github/workflows/ci-sglang-integration-tests.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 5b173baa5..555e880eb 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -71,7 +71,7 @@ jobs:
           # Try with the latest nightly releases, not what iree-turbine pins.
           # We could also pin to a known working or stable version.
           # This should eventually stabilize. Do the best we can for now.
-          pip install -f https://iree.dev/pip-release-links.html --upgrade \
+          pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
             iree-base-compiler \
             iree-base-runtime \
             "numpy<2.0"
@@ -140,7 +140,7 @@ jobs:
           # Try with the latest nightly releases, not what iree-turbine pins.
           # We could also pin to a known working or stable version.
           # This should eventually stabilize. Do the best we can for now.
-          pip install -f https://iree.dev/pip-release-links.html --upgrade \
+          pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
             iree-base-compiler \
             iree-base-runtime \
             "numpy<2.0"
diff --git a/.github/workflows/ci-sglang-integration-tests.yml b/.github/workflows/ci-sglang-integration-tests.yml
index 20b829918..8ff74a094 100644
--- a/.github/workflows/ci-sglang-integration-tests.yml
+++ b/.github/workflows/ci-sglang-integration-tests.yml
@@ -69,7 +69,7 @@ jobs:
 
           # Use newest possible releases to be able to track commits that may
           # cause errors.
-          pip install -f https://iree.dev/pip-release-links.html --upgrade \
+          pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
             iree-base-compiler \
             iree-base-runtime \
             "numpy<2.0"

From 01da13c507cb6635fcea7b2028cc3d01b4f82816 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Mon, 2 Dec 2024 19:47:05 +0000
Subject: [PATCH 18/38] Slightly lower threshold in integration tests to accept
 answers that differ but are still valid

---
 app_tests/integration_tests/llm/sglang/sglang_frontend_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py b/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
index efab14ea7..72b3d4052 100644
--- a/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
+++ b/app_tests/integration_tests/llm/sglang/sglang_frontend_test.py
@@ -29,7 +29,7 @@
     "device": "hip",
 }
 
-ACCEPTED_THRESHOLD = 0.8
+ACCEPTED_THRESHOLD = 0.7
 
 
 def compute_similarity(model: SentenceTransformer, sentence_1: str, sentence_2: str):

From ed37ef1a9c49580e96bacbc472b765b9ca11ac69 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Mon, 2 Dec 2024 20:57:21 +0000
Subject: [PATCH 19/38] Fix `publish_dir` in `Deploy to GitHub Pages` step

---
 .github/workflows/ci-sglang-benchmark.yml | 32 ++++++++++-------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 555e880eb..435ba8a2f 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -7,8 +7,6 @@
 name: SGLang Llama Benchmarking Tests
 
 on:
-  # TODO: Remove PR trigger after verification
-  pull_request:
   workflow_dispatch:
   schedule:
     # Weekdays at 4:00 AM UTC = 9:00 PM PST.
@@ -82,14 +80,13 @@ jobs:
       - name: Run Shortfin Benchmark Tests
         run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py --log-cli-level=INFO --html=out/llm/shortfin/index.html
 
-      # TODO: Uncomment after verification
-      # - name: Deploy to GitHub Pages
-      #   uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
-      #   with:
-      #     github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
-      #     publish_dir: ./out/llm/sgl_benchmark/shortfin
-      #     destination_dir: ./llm/sgl_benchmark/shortfin
-      #     keep_files: true
+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
+        with:
+          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+          publish_dir: ./out/llm/shortfin
+          destination_dir: ./llm/sgl_benchmark/shortfin
+          keep_files: true
 
   benchmark_sglang:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
@@ -197,11 +194,10 @@ jobs:
       - name: Cleanup SGLang Image
         run: docker image rm lmsysorg/sglang:v0.3.5.post1-rocm620
 
-      # TODO: Uncomment after verifying
-      # - name: Deploy to GitHub Pages
-      #   uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
-      #   with:
-      #     github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
-      #     publish_dir: ./out/llm/sgl_benchmark/sglang
-      #     destination_dir: ./llm/sgl_benchmark/sglang
-      #     keep_files: true
+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
+        with:
+          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+          publish_dir: ./out/llm/sglang
+          destination_dir: ./llm/sgl_benchmark/sglang
+          keep_files: true

From d29c7bb94d7eb919a970bd3bc597a172b956d9df Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Mon, 2 Dec 2024 22:55:38 +0000
Subject: [PATCH 20/38] Remove unneeded deps for SGLang benchmark, Get rid of
 unneeded `Get Current Date` step, Use `matrix.version` for Pip Cache, Leave
 docker image cached on runner, Temporarily enable PR trigger and disable
 shortfin half for CI validation

---
 .github/workflows/ci-sglang-benchmark.yml | 177 ++++++++++------------
 1 file changed, 81 insertions(+), 96 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 435ba8a2f..7a2627e63 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -7,6 +7,8 @@
 name: SGLang Llama Benchmarking Tests
 
 on:
+  # TODO: Temporarily setting PR trigger for CI validation
+  pull_request:
   workflow_dispatch:
   schedule:
     # Weekdays at 4:00 AM UTC = 9:00 PM PST.
@@ -21,72 +23,73 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  benchmark_shortfin:
-    if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
-    name: "SGLang Serving Benchmark With Shortfin"
-    strategy:
-      matrix:
-        version: [3.11]
-      fail-fast: false
-    runs-on: mi300x-4
-    defaults:
-      run:
-        shell: bash
-    env:
-      PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
-    steps:
-      - name: Get Current Date
-        id: date
-        run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
-
-      - name: "Setting up Python"
-        id: setup_python
-        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-        with:
-          python-version: ${{matrix.version}}
-
-      - name: "Checkout Code"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Cache Pip Packages
-        uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
-        id: cache-pip
-        with:
-          path: ${{ env.PIP_CACHE_DIR }}
-          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }}
-
-      - name: Install pip deps
-        run: |
-          python -m pip install --no-compile --upgrade pip
-          # Note: We install in three steps in order to satisfy requirements
-          # from non default locations first. Installing the PyTorch CPU
-          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
-          pip install --no-compile -r pytorch-cpu-requirements.txt
-          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
-            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
-          pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
-
-          # Try with the latest nightly releases, not what iree-turbine pins.
-          # We could also pin to a known working or stable version.
-          # This should eventually stabilize. Do the best we can for now.
-          pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
-            iree-base-compiler \
-            iree-base-runtime \
-            "numpy<2.0"
-
-      - name: Install SGLang
-        run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
-
-      - name: Run Shortfin Benchmark Tests
-        run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py --log-cli-level=INFO --html=out/llm/shortfin/index.html
-
-      - name: Deploy to GitHub Pages
-        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
-        with:
-          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
-          publish_dir: ./out/llm/shortfin
-          destination_dir: ./llm/sgl_benchmark/shortfin
-          keep_files: true
+  # TODO: Temporarily disabling this half for CI validation
+  # benchmark_shortfin:
+  #   if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
+  #   name: "SGLang Serving Benchmark With Shortfin"
+  #   strategy:
+  #     matrix:
+  #       version: [3.11]
+  #     fail-fast: false
+  #   runs-on: mi300x-4
+  #   defaults:
+  #     run:
+  #       shell: bash
+  #   env:
+  #     PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
+  #   steps:
+  #     - name: Get Current Date
+  #       id: date
+  #       run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
+
+  #     - name: "Setting up Python"
+  #       id: setup_python
+  #       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+  #       with:
+  #         python-version: ${{matrix.version}}
+
+  #     - name: "Checkout Code"
+  #       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+  #     - name: Cache Pip Packages
+  #       uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
+  #       id: cache-pip
+  #       with:
+  #         path: ${{ env.PIP_CACHE_DIR }}
+  #         key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }}
+
+  #     - name: Install pip deps
+  #       run: |
+  #         python -m pip install --no-compile --upgrade pip
+  #         # Note: We install in three steps in order to satisfy requirements
+  #         # from non default locations first. Installing the PyTorch CPU
+  #         # wheels saves multiple minutes and a lot of bandwidth on runner setup.
+  #         pip install --no-compile -r pytorch-cpu-requirements.txt
+  #         pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
+  #           -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
+  #         pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
+
+  #         # Try with the latest nightly releases, not what iree-turbine pins.
+  #         # We could also pin to a known working or stable version.
+  #         # This should eventually stabilize. Do the best we can for now.
+  #         pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
+  #           iree-base-compiler \
+  #           iree-base-runtime \
+  #           "numpy<2.0"
+
+  #     - name: Install SGLang
+  #       run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
+
+  #     - name: Run Shortfin Benchmark Tests
+  #       run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py --log-cli-level=INFO --html=out/llm/shortfin/index.html
+
+  #     - name: Deploy to GitHub Pages
+  #       uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
+  #       with:
+  #         github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+  #         publish_dir: ./out/llm/shortfin
+  #         destination_dir: ./llm/sgl_benchmark/shortfin
+  #         keep_files: true
 
   benchmark_sglang:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
@@ -103,10 +106,6 @@ jobs:
     env:
       PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
     steps:
-      - name: Get Current Date
-        id: date
-        run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
-
       - name: "Setting up Python"
         id: setup_python
         uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
@@ -121,26 +120,14 @@ jobs:
         id: cache-pip
         with:
           path: ${{ env.PIP_CACHE_DIR }}
-          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }}
+          key: pip-${{ matrix.version }}-${{ hashFiles('*requirements*.txt','sharktank/requirements*.txt') }}
 
       - name: Install pip deps
         run: |
           python -m pip install --no-compile --upgrade pip
-          # Note: We install in three steps in order to satisfy requirements
-          # from non default locations first. Installing the PyTorch CPU
-          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
-          pip install --no-compile -r pytorch-cpu-requirements.txt
-          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
-            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
-          pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
-
-          # Try with the latest nightly releases, not what iree-turbine pins.
-          # We could also pin to a known working or stable version.
-          # This should eventually stabilize. Do the best we can for now.
-          pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
-            iree-base-compiler \
-            iree-base-runtime \
-            "numpy<2.0"
+          # Note: Only sharktank is required to use `hf_datasets` script
+          # for downloading model weights.
+          pip install --no-compile -r requirements.txt -e sharktank/
 
       - name: Install SGLang
         run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
@@ -191,13 +178,11 @@ jobs:
       - name: Stop sglang-server
         run: docker stop sglang-server || true # Stop container if it's running
 
-      - name: Cleanup SGLang Image
-        run: docker image rm lmsysorg/sglang:v0.3.5.post1-rocm620
-
-      - name: Deploy to GitHub Pages
-        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
-        with:
-          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
-          publish_dir: ./out/llm/sglang
-          destination_dir: ./llm/sgl_benchmark/sglang
-          keep_files: true
+      # TODO: Temporarily disabling for CI Validation
+      # - name: Deploy to GitHub Pages
+      #   uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
+      #   with:
+      #     github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+      #     publish_dir: ./out/llm/sglang
+      #     destination_dir: ./llm/sgl_benchmark/sglang
+      #     keep_files: true

From 4529e097c02b9709da79d3bdf074d621a68e43c9 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Mon, 2 Dec 2024 23:01:02 +0000
Subject: [PATCH 21/38] Comment out `needs` line for CI validation

---
 .github/workflows/ci-sglang-benchmark.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 7a2627e63..ade855825 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -94,7 +94,8 @@ jobs:
   benchmark_sglang:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
     name: "SGLang Serving Benchmark With SGLang"
-    needs: benchmark_shortfin
+    # TODO: Temporarily commenting out for CI validation
+    # needs: benchmark_shortfin
     strategy:
       matrix:
         version: [3.11]

From 09e0fb6d91e0b09ab5b59e06e9f9dc0399176f3a Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Mon, 2 Dec 2024 23:07:04 +0000
Subject: [PATCH 22/38] Remove temporary disablements, Add back step to clean
 up docker image

---
 .github/workflows/ci-sglang-benchmark.yml | 157 +++++++++++-----------
 1 file changed, 78 insertions(+), 79 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index ade855825..b0bca9bb0 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -7,8 +7,6 @@
 name: SGLang Llama Benchmarking Tests
 
 on:
-  # TODO: Temporarily setting PR trigger for CI validation
-  pull_request:
   workflow_dispatch:
   schedule:
     # Weekdays at 4:00 AM UTC = 9:00 PM PST.
@@ -23,79 +21,77 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  # TODO: Temporarily disabling this half for CI validation
-  # benchmark_shortfin:
-  #   if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
-  #   name: "SGLang Serving Benchmark With Shortfin"
-  #   strategy:
-  #     matrix:
-  #       version: [3.11]
-  #     fail-fast: false
-  #   runs-on: mi300x-4
-  #   defaults:
-  #     run:
-  #       shell: bash
-  #   env:
-  #     PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
-  #   steps:
-  #     - name: Get Current Date
-  #       id: date
-  #       run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
-
-  #     - name: "Setting up Python"
-  #       id: setup_python
-  #       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-  #       with:
-  #         python-version: ${{matrix.version}}
-
-  #     - name: "Checkout Code"
-  #       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-  #     - name: Cache Pip Packages
-  #       uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
-  #       id: cache-pip
-  #       with:
-  #         path: ${{ env.PIP_CACHE_DIR }}
-  #         key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }}
-
-  #     - name: Install pip deps
-  #       run: |
-  #         python -m pip install --no-compile --upgrade pip
-  #         # Note: We install in three steps in order to satisfy requirements
-  #         # from non default locations first. Installing the PyTorch CPU
-  #         # wheels saves multiple minutes and a lot of bandwidth on runner setup.
-  #         pip install --no-compile -r pytorch-cpu-requirements.txt
-  #         pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
-  #           -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
-  #         pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
-
-  #         # Try with the latest nightly releases, not what iree-turbine pins.
-  #         # We could also pin to a known working or stable version.
-  #         # This should eventually stabilize. Do the best we can for now.
-  #         pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
-  #           iree-base-compiler \
-  #           iree-base-runtime \
-  #           "numpy<2.0"
-
-  #     - name: Install SGLang
-  #       run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
-
-  #     - name: Run Shortfin Benchmark Tests
-  #       run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py --log-cli-level=INFO --html=out/llm/shortfin/index.html
-
-  #     - name: Deploy to GitHub Pages
-  #       uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
-  #       with:
-  #         github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
-  #         publish_dir: ./out/llm/shortfin
-  #         destination_dir: ./llm/sgl_benchmark/shortfin
-  #         keep_files: true
+  benchmark_shortfin:
+    if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
+    name: "SGLang Serving Benchmark With Shortfin"
+    strategy:
+      matrix:
+        version: [3.11]
+      fail-fast: false
+    runs-on: mi300x-4
+    defaults:
+      run:
+        shell: bash
+    env:
+      PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
+    steps:
+      - name: Get Current Date
+        id: date
+        run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
+
+      - name: "Setting up Python"
+        id: setup_python
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: ${{matrix.version}}
+
+      - name: "Checkout Code"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Cache Pip Packages
+        uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
+        id: cache-pip
+        with:
+          path: ${{ env.PIP_CACHE_DIR }}
+          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }}
+
+      - name: Install pip deps
+        run: |
+          python -m pip install --no-compile --upgrade pip
+          # Note: We install in three steps in order to satisfy requirements
+          # from non default locations first. Installing the PyTorch CPU
+          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
+          pip install --no-compile -r pytorch-cpu-requirements.txt
+          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
+            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
+          pip install --no-compile -r requirements.txt -e sharktank/ shortfin/
+
+          # Try with the latest nightly releases, not what iree-turbine pins.
+          # We could also pin to a known working or stable version.
+          # This should eventually stabilize. Do the best we can for now.
+          pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
+            iree-base-compiler \
+            iree-base-runtime \
+            "numpy<2.0"
+
+      - name: Install SGLang
+        run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
+
+      - name: Run Shortfin Benchmark Tests
+        run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py --log-cli-level=INFO --html=out/llm/shortfin/index.html
+
+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
+        with:
+          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+          publish_dir: ./out/llm/shortfin
+          destination_dir: ./llm/sgl_benchmark/shortfin
+          keep_files: true
 
   benchmark_sglang:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
     name: "SGLang Serving Benchmark With SGLang"
-    # TODO: Temporarily commenting out for CI validation
-    # needs: benchmark_shortfin
+    needs: benchmark_shortfin
     strategy:
       matrix:
         version: [3.11]
@@ -179,11 +175,14 @@ jobs:
       - name: Stop sglang-server
         run: docker stop sglang-server || true # Stop container if it's running
 
-      # TODO: Temporarily disabling for CI Validation
-      # - name: Deploy to GitHub Pages
-      #   uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
-      #   with:
-      #     github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
-      #     publish_dir: ./out/llm/sglang
-      #     destination_dir: ./llm/sgl_benchmark/sglang
-      #     keep_files: true
+      # Deleting image after run due to large disk space requirement (83 GB)
+      - name: Cleanup SGLang Image
+        run: docker image rm lmsysorg/sglang:v0.3.5.post1-rocm620
+
+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
+        with:
+          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+          publish_dir: ./out/llm/sglang
+          destination_dir: ./llm/sgl_benchmark/sglang
+          keep_files: true

From 422729ffe6a5e02a503aa1b7ba4e01d6cab6704e Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Mon, 2 Dec 2024 23:34:09 +0000
Subject: [PATCH 23/38] Remove `Get Current Date` step in shortfin benchmark
 job

---
 .github/workflows/ci-sglang-benchmark.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index b0bca9bb0..08a28992d 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -35,10 +35,6 @@ jobs:
     env:
       PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
     steps:
-      - name: Get Current Date
-        id: date
-        run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
-
       - name: "Setting up Python"
         id: setup_python
         uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0

From 969b608b26988ec79d3204b945fd955541a0f081 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Mon, 2 Dec 2024 23:48:59 +0000
Subject: [PATCH 24/38] Add `README` description to top of CI file

---
 .github/workflows/ci-sglang-benchmark.yml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 08a28992d..0fc111a8f 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -4,6 +4,18 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+# =================================== README ===================================
+# The `benchmark_sglang` job in this CI depends mostly on code outside of the
+# `shark-ai` repo itself. Including it here lets us maintain an
+# apples-to-apples comparison between shortfin and SGLang performance in a
+# centralized location as we put more effort into shortfin LLM performance,
+# and WHILE WE WORK TOWARDS A BETTER ALTERNATIVE.
+
+# We should not generally repeat this pattern, and should never repeat it
+# outside of specifically benchmarking shortfin apps against external
+# projects as part of an organized and clearly defined effort.
+# ==============================================================================
+
 name: SGLang Llama Benchmarking Tests
 
 on:

From f67e399aeed6ef2d627c99a1208c8425e931bc43 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Tue, 3 Dec 2024 15:42:30 +0000
Subject: [PATCH 25/38] Add job to merge html reports from both benchmark jobs
 and upload to gh-pages

---
 .github/workflows/ci-sglang-benchmark.yml | 61 +++++++++++++++++++----
 1 file changed, 50 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 0fc111a8f..e2574d533 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -19,6 +19,8 @@
 name: SGLang Llama Benchmarking Tests
 
 on:
+  # TODO: Temporary trigger to validate workflow
+  pull_request:
   workflow_dispatch:
   schedule:
     # Weekdays at 4:00 AM UTC = 9:00 PM PST.
@@ -61,7 +63,7 @@ jobs:
         id: cache-pip
         with:
           path: ${{ env.PIP_CACHE_DIR }}
-          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }}
+          key: pip-${{ matrix.version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }}
 
       - name: Install pip deps
         run: |
@@ -86,15 +88,13 @@ jobs:
         run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
 
       - name: Run Shortfin Benchmark Tests
-        run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py --log-cli-level=INFO --html=out/llm/shortfin/index.html
+        run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/shortfin_benchmark_test.py --log-cli-level=INFO --html=shortfin_index.html --self-contained-html
 
-      - name: Deploy to GitHub Pages
-        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
+      - name: Upload pytest report
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
         with:
-          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
-          publish_dir: ./out/llm/shortfin
-          destination_dir: ./llm/sgl_benchmark/shortfin
-          keep_files: true
+          name: sglang_reports
+          path: shortfin_index.html
 
   benchmark_sglang:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
@@ -178,7 +178,7 @@ jobs:
 
       - name: Run SGLang Benchmark Tests
         run: |
-          pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py --port 30000 --log-cli-level=INFO --html=out/llm/sglang/index.html
+          pytest -v app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py --port 30000 --log-cli-level=INFO --html=sglang_index.html --self-contained-html
 
       - name: Stop sglang-server
         run: docker stop sglang-server || true # Stop container if it's running
@@ -187,10 +187,49 @@ jobs:
       - name: Cleanup SGLang Image
         run: docker image rm lmsysorg/sglang:v0.3.5.post1-rocm620
 
+      - name: Upload pytest report
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        with:
+          name: sglang_reports
+          path: sglang_index.html
+
+  merge_and_upload_reports:
+    if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
+    name: "Merge and upload benchmark reports"
+    needs: [benchmark_shortfin, benchmark_sglang]
+    strategy:
+      matrix:
+        version: [3.11]
+      fail-fast: false
+    runs-on: ubuntu-24.04
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: "Setting up Python"
+        id: setup_python
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        with:
+          python-version: ${{matrix.version}}
+
+      - name: Install pytest-html-merger
+        run: pip install pytest-html-merger
+      - name: Download reports
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16
+        with:
+          name: slgang_reports
+          path: reports
+
+      - name: Create merged report directory
+        run: mkdir merged_reports
+
+      - name: Merge html reports
+        run: pytest_html_merger -i reports -o merged_reports/index.html
+
       - name: Deploy to GitHub Pages
         uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
         with:
           github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
-          publish_dir: ./out/llm/sglang
-          destination_dir: ./llm/sgl_benchmark/sglang
+          publish_dir: merged_reports
+          destination_dir: ./llm/sglang
           keep_files: true

From 6909edcf38eaddea3eea13ac4fce383d120c9931 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Tue, 3 Dec 2024 15:55:58 +0000
Subject: [PATCH 26/38] Fix upload/download paths

---
 .github/workflows/ci-sglang-benchmark.yml | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index e2574d533..2594c3aad 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -93,7 +93,7 @@ jobs:
       - name: Upload pytest report
         uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
         with:
-          name: sglang_reports
+          name: shortfin_benchmark
           path: shortfin_index.html
 
   benchmark_sglang:
@@ -190,7 +190,7 @@ jobs:
       - name: Upload pytest report
         uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
         with:
-          name: sglang_reports
+          name: sglang_benchmark
           path: sglang_index.html
 
   merge_and_upload_reports:
@@ -214,17 +214,20 @@ jobs:
 
       - name: Install pytest-html-merger
         run: pip install pytest-html-merger
+
       - name: Download reports
         uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16
         with:
-          name: slgang_reports
+          name: |
+            shortfin_benchmark
+            sglang_benchmark
           path: reports
 
       - name: Create merged report directory
         run: mkdir merged_reports
 
       - name: Merge html reports
-        run: pytest_html_merger -i reports -o merged_reports/index.html
+        run: pytest_html_merger -i reports/*/*.html -o merged_reports/index.html
 
       - name: Deploy to GitHub Pages
         uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0

From aa35176fd3c51a0178e246a096b7fa767d3eb0e5 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Tue, 3 Dec 2024 16:11:28 +0000
Subject: [PATCH 27/38] Split download into two steps

---
 .github/workflows/ci-sglang-benchmark.yml | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 2594c3aad..491a76557 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -215,13 +215,17 @@ jobs:
       - name: Install pytest-html-merger
         run: pip install pytest-html-merger
 
-      - name: Download reports
+      - name: Download shortfin report
         uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16
         with:
-          name: |
-            shortfin_benchmark
-            sglang_benchmark
-          path: reports
+          name: shortfin_benchmark
+          path: reports/shortfin
+
+      - name: Download sglang report
+        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16
+        with:
+          name: sglang_benchmark
+          path: reports/sglang
 
       - name: Create merged report directory
         run: mkdir merged_reports

From 9578acc7cb87bf5c76e718db9737a38d7ba31964 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Tue, 3 Dec 2024 16:34:23 +0000
Subject: [PATCH 28/38] Ensure all html files are in same dir

---
 .github/workflows/ci-sglang-benchmark.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 491a76557..629e309ff 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -219,19 +219,19 @@ jobs:
         uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16
         with:
           name: shortfin_benchmark
-          path: reports/shortfin
+          path: reports
 
       - name: Download sglang report
         uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16
         with:
           name: sglang_benchmark
-          path: reports/sglang
+          path: reports
 
       - name: Create merged report directory
         run: mkdir merged_reports
 
       - name: Merge html reports
-        run: pytest_html_merger -i reports/*/*.html -o merged_reports/index.html
+        run: pytest_html_merger -i reports -o merged_reports/index.html
 
       - name: Deploy to GitHub Pages
         uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0

From b9b9ea518014c1ee62f8df5a28e1844de431ea74 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Tue, 3 Dec 2024 16:58:59 +0000
Subject: [PATCH 29/38] Remove PR trigger

---
 .github/workflows/ci-sglang-benchmark.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 629e309ff..8c32415af 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -19,8 +19,6 @@
 name: SGLang Llama Benchmarking Tests
 
 on:
-  # TODO: Temporary trigger to validate workflow
-  pull_request:
   workflow_dispatch:
   schedule:
     # Weekdays at 4:00 AM UTC = 9:00 PM PST.

From 526194f68ac1b8a5188492b512a5d831f3933e79 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Tue, 3 Dec 2024 19:56:20 +0000
Subject: [PATCH 30/38] Remove `sharktank` installation from SGLang benchmark,
 Always use python3.11 for merging reports, Make merging reports one step,
 Temporarily enable PR trigger for validation

---
 .github/workflows/ci-sglang-benchmark.yml | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 8c32415af..8930a441d 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -19,6 +19,8 @@
 name: SGLang Llama Benchmarking Tests
 
 on:
+  # TODO: Remove after validating in CI
+  pull_request:
   workflow_dispatch:
   schedule:
     # Weekdays at 4:00 AM UTC = 9:00 PM PST.
@@ -95,7 +97,6 @@ jobs:
           path: shortfin_index.html
 
   benchmark_sglang:
-    if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
     name: "SGLang Serving Benchmark With SGLang"
     needs: benchmark_shortfin
     strategy:
@@ -128,9 +129,6 @@ jobs:
       - name: Install pip deps
         run: |
           python -m pip install --no-compile --upgrade pip
-          # Note: Only sharktank is required to use `hf_datasets` script
-          # for downloading model weights.
-          pip install --no-compile -r requirements.txt -e sharktank/
 
       - name: Install SGLang
         run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
@@ -192,13 +190,8 @@ jobs:
           path: sglang_index.html
 
   merge_and_upload_reports:
-    if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
     name: "Merge and upload benchmark reports"
     needs: [benchmark_shortfin, benchmark_sglang]
-    strategy:
-      matrix:
-        version: [3.11]
-      fail-fast: false
     runs-on: ubuntu-24.04
     defaults:
       run:
@@ -208,7 +201,7 @@ jobs:
         id: setup_python
         uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
         with:
-          python-version: ${{matrix.version}}
+          python-version: 3.11
 
       - name: Install pytest-html-merger
         run: pip install pytest-html-merger
@@ -225,11 +218,10 @@ jobs:
           name: sglang_benchmark
           path: reports
 
-      - name: Create merged report directory
-        run: mkdir merged_reports
-
       - name: Merge html reports
-        run: pytest_html_merger -i reports -o merged_reports/index.html
+        run: |
+          mkdir merged_reports
+          pytest_html_merger -i reports -o merged_reports/index.html
 
       - name: Deploy to GitHub Pages
         uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0

From 57babdf922dd8ffb66e4ff0f1afa2a078759ccc8 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Tue, 3 Dec 2024 20:04:30 +0000
Subject: [PATCH 31/38] Use hf to download tokenizer in `sglang_benchmark_test`

---
 .../llm/sglang_benchmarks/sglang_benchmark_test.py     | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
index a678f92c1..675f9ef54 100644
--- a/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
+++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py
@@ -14,20 +14,20 @@
 
 from .utils import SGLangBenchmarkArgs, log_jsonl_result
 
-from integration_tests.llm.utils import wait_for_server, download_with_hf_datasets
+from integration_tests.llm.utils import download_tokenizer, wait_for_server
 
 logger = logging.getLogger(__name__)
 
 
 @pytest.mark.parametrize(
-    "request_rate,model_name",
-    [(req_rate, "llama3_8B_fp16") for req_rate in [1, 2, 4, 8, 16, 32]],
+    "request_rate,tokenizer_id",
+    [(req_rate, "NousResearch/Meta-Llama-3-8B") for req_rate in [1, 2, 4, 8, 16, 32]],
 )
-def test_sglang_benchmark(request_rate, model_name, sglang_args, tmp_path_factory):
+def test_sglang_benchmark(request_rate, tokenizer_id, sglang_args, tmp_path_factory):
     tmp_dir = tmp_path_factory.mktemp("sglang_benchmark_test")
 
     # Download tokenizer for llama3_8B_fp16
-    download_with_hf_datasets(tmp_dir, model_name)
+    download_tokenizer(tmp_dir, tokenizer_id)
 
     logger.info("Beginning SGLang benchmark test...")
 

From 8f7f0fbc55e4833f5e53d7ae985090378e7f4463 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Tue, 3 Dec 2024 21:25:56 +0000
Subject: [PATCH 32/38] Small cleanup of sglang ci deps section

---
 .github/workflows/ci-sglang-benchmark.yml | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 8930a441d..d5fd2ba12 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -124,14 +124,12 @@ jobs:
         id: cache-pip
         with:
           path: ${{ env.PIP_CACHE_DIR }}
-          key: pip-${{ matrix.version }}-${{ hashFiles('*requirements*.txt','sharktank/requirements*.txt') }}
+          key: pip-${{ matrix.version }}
 
-      - name: Install pip deps
+      - name: Install SGLang
         run: |
           python -m pip install --no-compile --upgrade pip
-
-      - name: Install SGLang
-        run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
+          pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python"
 
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3

From 305c4b0f811b186593d9d31d9a30749f396467fb Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Tue, 3 Dec 2024 21:31:33 +0000
Subject: [PATCH 33/38] Make shortfin/sglang benchmarks still run
 sequentially, but not depend on each other's success

---
 .github/workflows/ci-sglang-benchmark.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index d5fd2ba12..1063291e4 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -99,6 +99,7 @@ jobs:
   benchmark_sglang:
     name: "SGLang Serving Benchmark With SGLang"
     needs: benchmark_shortfin
+    if: always()
     strategy:
       matrix:
         version: [3.11]
@@ -209,12 +210,14 @@ jobs:
         with:
           name: shortfin_benchmark
           path: reports
+        continue-on-error: true
 
       - name: Download sglang report
         uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16
         with:
           name: sglang_benchmark
           path: reports
+        continue-on-error: true
 
       - name: Merge html reports
         run: |

From d78ab73d1679850c6b9a0725861225e90c366612 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Tue, 3 Dec 2024 22:19:07 +0000
Subject: [PATCH 34/38] Remove dep on shortfin benchmark in sgl benchmark, Make
 `merge_and_upload_reports` run conditionally on either succeeding

---
 .github/workflows/ci-sglang-benchmark.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 1063291e4..5ee9f08a2 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -98,8 +98,6 @@ jobs:
 
   benchmark_sglang:
     name: "SGLang Serving Benchmark With SGLang"
-    needs: benchmark_shortfin
-    if: always()
     strategy:
       matrix:
         version: [3.11]
@@ -190,7 +188,7 @@ jobs:
 
   merge_and_upload_reports:
     name: "Merge and upload benchmark reports"
-    needs: [benchmark_shortfin, benchmark_sglang]
+    if: success() || needs.benchmark_shortfin.result == 'success' || needs.benchmark_sglang.result == 'success'
     runs-on: ubuntu-24.04
     defaults:
       run:

From 29a82216fbf1a74aaee1ce2687907b9b2d65609f Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Tue, 3 Dec 2024 22:22:16 +0000
Subject: [PATCH 35/38] Make sure `merge_and_upload_reports` waits for prior
 jobs to finish

---
 .github/workflows/ci-sglang-benchmark.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 5ee9f08a2..0b8359adc 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -188,7 +188,8 @@ jobs:
 
   merge_and_upload_reports:
     name: "Merge and upload benchmark reports"
-    if: success() || needs.benchmark_shortfin.result == 'success' || needs.benchmark_sglang.result == 'success'
+    needs: [benchmark_shortfin, benchmark_sglang]
+    if: needs.benchmark_shortfin.result == 'success' || needs.benchmark_sglang.result == 'success'
     runs-on: ubuntu-24.04
     defaults:
       run:

From 7efecb81ca4530cb815ba13e4ac6fcbae2c56f1c Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Tue, 3 Dec 2024 22:27:27 +0000
Subject: [PATCH 36/38] Move code checkout to first step in `benchmark_sglang`

---
 .github/workflows/ci-sglang-benchmark.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 0b8359adc..daea21890 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -109,15 +109,15 @@ jobs:
     env:
       PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
     steps:
+      - name: "Checkout Code"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
       - name: "Setting up Python"
         id: setup_python
         uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
         with:
           python-version: ${{matrix.version}}
 
-      - name: "Checkout Code"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
       - name: Cache Pip Packages
         uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
         id: cache-pip

From 1e905736f65a96f8f18a49f3518a663e3c48b7f9 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Tue, 3 Dec 2024 23:01:40 +0000
Subject: [PATCH 37/38] Remove PR trigger

---
 .github/workflows/ci-sglang-benchmark.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index 6d1978244..7afec9336 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -19,8 +19,6 @@
 name: SGLang Llama Benchmarking Tests
 
 on:
-  # TODO: Remove after validating in CI
-  pull_request:
   workflow_dispatch:
   schedule:
     # Weekdays at 4:00 AM UTC = 9:00 PM PST.

From b1ec485416d1990ce86080d972e8af2daaf51cd2 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Wed, 4 Dec 2024 16:20:15 +0000
Subject: [PATCH 38/38] Repin `iree-base-compiler` and `iree-base-runtime` due
 to `abort` issue

---
 .github/workflows/ci-sglang-benchmark.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml
index e301ed2fd..f8f19402a 100644
--- a/.github/workflows/ci-sglang-benchmark.yml
+++ b/.github/workflows/ci-sglang-benchmark.yml
@@ -77,8 +77,8 @@ jobs:
           # We could also pin to a known working or stable version.
           # This should eventually stabilize. Do the best we can for now.
           pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
-            iree-base-compiler \
-            iree-base-runtime \
+            iree-base-compiler==3.0.0rc20241118 \
+            iree-base-runtime==3.0.0rc20241118 \
             "numpy<2.0"
 
       - name: Install SGLang
@@ -106,8 +106,7 @@ jobs:
     env:
       PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
     steps:
-      - name: "Checkout Code"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 
       - name: "Setting up Python"
         id: setup_python