From 7a5ee6867531a3438ea695239de6a542f8938227 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Fri, 8 Nov 2024 10:45:13 -0600 Subject: [PATCH 01/12] Update tests with compile flags and tp flags, try nightly iree Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 2 +- .github/workflows/ci-llama-quick-tests.yaml | 2 +- sharktank/sharktank/utils/export_artifacts.py | 19 +++++++++++++++---- .../models/llama/benchmark_amdgpu_test.py | 4 +--- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index 6a3b764b8..b96d0610a 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -68,7 +68,7 @@ jobs: pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" - # Test with pinned nightly releases, not what iree-turbine uses. + # Test with nightly releases, not what iree-turbine uses. pip install -f https://iree.dev/pip-release-links.html --upgrade \ iree-base-compiler==3.0.0rc20241118 \ iree-base-runtime==3.0.0rc20241118 diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 6c381b658..276903fc9 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -69,7 +69,7 @@ jobs: pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" - # Test with pinned nightly releases, not what iree-turbine uses. + # Test with nightly releases, not what iree-turbine uses. pip install -f https://iree.dev/pip-release-links.html --upgrade \ iree-base-compiler==3.0.0rc20241118 \ iree-base-runtime==3.0.0rc20241118 diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index 9deade56c..bb21ad941 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -123,15 +123,15 @@ def wrapper(*args, **kwargs): def shard_irpa_file( self, *, - gguf_file: str, + irpa_file: str, output_irpa: str, ): shard_irpa_args = [ "python3", "-m", "sharktank.examples.sharding.shard_llm_dataset", - "--gguf-file", - gguf_file, + "--irpa-file", + irpa_file, "--output-irpa-file", output_irpa, "--tensor-parallelism-size", @@ -202,6 +202,11 @@ def compile_to_vmfb( f"{mlir_path}", f"--iree-hip-target={self.iree_hip_target}", f"--iree-hal-target-backends={self.iree_hal_target_backends}", + "--iree-dispatch-creation-enable-aggressive-fusion=true", + "--iree-global-opt-propagate-transposes=true", + "--iree-opt-aggressively-propagate-transposes=true", + "--iree-opt-data-tiling=false", + "--iree-preprocessing-pass-pipeline=\"builtin.module\\(util.func\\(iree-preprocessing-generalize-linalg-matmul-experimental\\)\\)\"", f"-o={vmfb_path}", ] if self.tensor_parallelism_size > 1: @@ -209,12 +214,17 @@ def compile_to_vmfb( f"--iree-hal-target-device=hip[{i}]" for i in range(self.tensor_parallelism_size) ] + tp_flags = [ + "--iree-hal-force-indirect-command-buffers=true", + "--iree-stream-resource-memory-model=discrete", + "--iree-hip-legacy-sync=false", + ] compile_args += iree_hal_target_devices + compile_args += tp_flags if hal_dump_path: compile_args += [ f"--iree-hal-dump-executable-files-to={hal_dump_path}/files" ] - cmd = subprocess.list2cmdline(compile_args) 
logging.getLogger().info(f"Launching compile command:\n" f"cd {cwd} && {cmd}") @@ -241,6 +251,7 @@ def iree_benchmark_vmfb( compile_cmd: Command used to compile the program, for inclusion in error messages. Raises Exception if running fails for some reason. """ + import pdb; pdb.set_trace() benchmark_args = [] if self.tensor_parallelism_size > 1: base_irpa_path, _ = os.path.splitext(irpa_path) diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py index f70607832..a520c6fcf 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py @@ -57,7 +57,6 @@ def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it self.artifacts_dir = Path("/data/llama-3.1/weights/8b") - self.gguf_path = self.artifacts_dir / "fp16/llama3.1_8b_fp16.gguf" self.irpa_path = self.artifacts_dir / "fp16/llama3.1_8b_fp16.irpa" self.irpa_path_fp8 = self.artifacts_dir / "f8/llama8b_fp8.irpa" self.tensor_parallelism_size = 1 @@ -173,6 +172,7 @@ def testBenchmark8B_f16_Decomposed(self): cwd=self.repo_root, ) + @skipif_run_8b_llama @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark8B_f16_Non_Decomposed(self): output_file_name = self.dir_path_8b / "f16_torch" @@ -303,7 +303,6 @@ def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it self.artifacts_dir = Path("/data/llama-3.1/weights/70b") - self.gguf_path = self.artifacts_dir / "fp16/llama3.1_70b_f16.gguf" self.irpa_path = self.artifacts_dir / "fp16/llama3.1_70b_f16.irpa" self.irpa_path_fp8 = self.artifacts_dir / "f8/llama70b_fp8.irpa" self.tensor_parallelism_size = 8 @@ -576,7 +575,6 @@ def setUp(self): # TODO: add numpy files to Azure and download from it self.artifacts_dir = Path("/data/llama-3.1/weights/405b") self.irpa_path = self.artifacts_dir / "fp16/llama3.1_405b_fp16.irpa" - self.gguf_path = self.artifacts_dir / "fp16/llama3_405b_f16.gguf" self.irpa_path_fp8 = self.artifacts_dir / "f8/llama405b_fp8.irpa" self.tensor_parallelism_size = 8 self.dir_path_405b = self.dir_path / "llama-405b" From b13f85557ed2ef2c386d08d11b24fc1fe09b3224 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Fri, 8 Nov 2024 11:05:52 -0600 Subject: [PATCH 02/12] Fix formatting, remove pdb Signed-off-by: aviator19941 --- sharktank/sharktank/utils/export_artifacts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index bb21ad941..7d23e7293 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -206,7 +206,7 @@ def compile_to_vmfb( "--iree-global-opt-propagate-transposes=true", "--iree-opt-aggressively-propagate-transposes=true", "--iree-opt-data-tiling=false", - "--iree-preprocessing-pass-pipeline=\"builtin.module\\(util.func\\(iree-preprocessing-generalize-linalg-matmul-experimental\\)\\)\"", + '--iree-preprocessing-pass-pipeline="builtin.module\\(util.func\\(iree-preprocessing-generalize-linalg-matmul-experimental\\)\\)"', f"-o={vmfb_path}", ] if self.tensor_parallelism_size > 1: @@ -251,7 +251,6 @@ def iree_benchmark_vmfb( compile_cmd: Command used to compile the program, for inclusion in error messages. Raises Exception if running fails for some reason. 
""" - import pdb; pdb.set_trace() benchmark_args = [] if self.tensor_parallelism_size > 1: base_irpa_path, _ = os.path.splitext(irpa_path) From 2ce4d635d1400b0edae8e1b6871796ea62bb5983 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Fri, 8 Nov 2024 15:16:50 -0600 Subject: [PATCH 03/12] Update CI comments and args in tests Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 1 + .github/workflows/ci-llama-quick-tests.yaml | 1 + sharktank/sharktank/utils/export_artifacts.py | 15 ++++-------- .../models/llama/benchmark_amdgpu_test.py | 24 +++++++++++++++++++ 4 files changed, 30 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index b96d0610a..72dc6a968 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -68,6 +68,7 @@ jobs: pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" + # Test with nightly releases, not what iree-turbine uses. pip install -f https://iree.dev/pip-release-links.html --upgrade \ iree-base-compiler==3.0.0rc20241118 \ diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 276903fc9..4b7a47ff8 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -69,6 +69,7 @@ jobs: pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" + # Test with nightly releases, not what iree-turbine uses. pip install -f https://iree.dev/pip-release-links.html --upgrade \ iree-base-compiler==3.0.0rc20241118 \ diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index 7d23e7293..49a942c66 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -195,6 +195,7 @@ def compile_to_vmfb( vmfb_path, cwd, hal_dump_path: Optional[Path] = None, + args: Optional[List[str]] = None, ): # TODO: Control flag to enable multiple backends compile_args = [ @@ -202,11 +203,6 @@ def compile_to_vmfb( f"{mlir_path}", f"--iree-hip-target={self.iree_hip_target}", f"--iree-hal-target-backends={self.iree_hal_target_backends}", - "--iree-dispatch-creation-enable-aggressive-fusion=true", - "--iree-global-opt-propagate-transposes=true", - "--iree-opt-aggressively-propagate-transposes=true", - "--iree-opt-data-tiling=false", - '--iree-preprocessing-pass-pipeline="builtin.module\\(util.func\\(iree-preprocessing-generalize-linalg-matmul-experimental\\)\\)"', f"-o={vmfb_path}", ] if self.tensor_parallelism_size > 1: @@ -214,17 +210,14 @@ def compile_to_vmfb( f"--iree-hal-target-device=hip[{i}]" for i in range(self.tensor_parallelism_size) ] - tp_flags = [ - "--iree-hal-force-indirect-command-buffers=true", - "--iree-stream-resource-memory-model=discrete", - "--iree-hip-legacy-sync=false", - ] compile_args += iree_hal_target_devices - compile_args += tp_flags if hal_dump_path: compile_args += [ f"--iree-hal-dump-executable-files-to={hal_dump_path}/files" ] + # Append optional arguments if provided + if args: + compile_args += args cmd = subprocess.list2cmdline(compile_args) logging.getLogger().info(f"Launching compile command:\n" f"cd {cwd} && {cmd}") diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py 
b/sharktank/tests/models/llama/benchmark_amdgpu_test.py index a520c6fcf..e4e85defa 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py @@ -49,6 +49,13 @@ def setUpClass(cls): def setUp(self): self.hip_device_id = os.getenv("HIP_DEVICE_ID", default="0") + self.compile_args = [ + "--iree-dispatch-creation-enable-aggressive-fusion=true", + "--iree-global-opt-propagate-transposes=true", + "--iree-opt-aggressively-propagate-transposes=true", + "--iree-opt-data-tiling=false", + '--iree-preprocessing-pass-pipeline="builtin.module\\(util.func\\(iree-preprocessing-generalize-linalg-matmul-experimental\\)\\)"', + ] @is_mi300x @@ -154,6 +161,7 @@ def testBenchmark8B_f16_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama8b_f16_decomposed_artifacts.iree_benchmark_vmfb( @@ -195,6 +203,7 @@ def testBenchmark8B_f16_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( @@ -236,6 +245,7 @@ def testBenchmark8B_fp8_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama8b_fp8_decomposed_artifacts.iree_benchmark_vmfb( @@ -277,6 +287,7 @@ def testBenchmark8B_fp8_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( @@ -379,6 +390,11 @@ def setUp(self): f"--input=@{self.decode_args_fp8}/cache_state_f16.npy", "--benchmark_repetitions=3", ] + self.compile_args += [ + "--iree-hal-force-indirect-command-buffers=true", + "--iree-stream-resource-memory-model=discrete", + "--iree-hip-legacy-sync=false", + ] @pytest.mark.xfail( reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException @@ -409,6 +425,7 @@ def testBenchmark70B_f16_TP8_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama70b_f16_decomposed_artifacts.iree_benchmark_vmfb( @@ -454,6 +471,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( @@ -501,6 +519,7 @@ def testBenchmark70B_fp8_TP8_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama70b_fp8_decomposed_artifacts.iree_benchmark_vmfb( @@ -548,6 +567,7 @@ def testBenchmark70B_fp8_TP8_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama70b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( @@ -680,6 +700,7 @@ def testBenchmark405B_f16_TP8_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama405b_f16_decomposed_artifacts.iree_benchmark_vmfb( @@ -725,6 +746,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill 
self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( @@ -772,6 +794,7 @@ def testBenchmark405B_fp8_TP8_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama405b_fp8_decomposed_artifacts.iree_benchmark_vmfb( @@ -819,6 +842,7 @@ def testBenchmark405B_fp8_TP8_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama405b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( From f47401d61f291b2d16b861561d56e328e5258c93 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Mon, 11 Nov 2024 11:49:49 -0600 Subject: [PATCH 04/12] Add non-decomposed 8b f16 prefill only test nightly Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 5 +-- .github/workflows/ci-llama-quick-tests.yaml | 4 +-- .../sharktank/examples/export_paged_llm_v1.py | 18 +++++++++- sharktank/sharktank/utils/export_artifacts.py | 3 ++ .../models/llama/benchmark_amdgpu_test.py | 34 +++++++++++++++++++ 5 files changed, 59 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index 72dc6a968..cdcbe2410 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -71,8 +71,9 @@ jobs: # Test with nightly releases, not what iree-turbine uses. pip install -f https://iree.dev/pip-release-links.html --upgrade \ - iree-base-compiler==3.0.0rc20241118 \ - iree-base-runtime==3.0.0rc20241118 + iree-base-compiler==2.9.0rc20241108 \ + iree-base-runtime==2.9.0rc20241108 \ + "numpy<2.0" - name: Run llama tests run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-all-llama --iree-hip-target=gfx942 --html=out/index.html diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 4b7a47ff8..127077655 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -72,8 +72,8 @@ jobs: # Test with nightly releases, not what iree-turbine uses. 
pip install -f https://iree.dev/pip-release-links.html --upgrade \ - iree-base-compiler==3.0.0rc20241118 \ - iree-base-runtime==3.0.0rc20241118 + iree-base-compiler==2.9.0rc20241108 \ + iree-base-runtime==2.9.0rc20241108 \ - name: Run llama 8b tests run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-8b-llama diff --git a/sharktank/sharktank/examples/export_paged_llm_v1.py b/sharktank/sharktank/examples/export_paged_llm_v1.py index a740f0bff..d51c1de52 100644 --- a/sharktank/sharktank/examples/export_paged_llm_v1.py +++ b/sharktank/sharktank/examples/export_paged_llm_v1.py @@ -54,8 +54,23 @@ def main(): help="Enables strictness during export", action="store_true", ) +<<<<<<< HEAD cli.add_quantization_options(parser) cli.add_model_options(parser) +======= + parser.add_argument( + "--attention-kernel", + type=str, + default="decomposed", + choices=["decomposed", "torch"], + ) + parser.add_argument( + "--skip-decode", + help="Enables prefill only, skips decode", + action="store_true", + ) + +>>>>>>> c745549 (Add non-decomposed 8b f16 prefill only test nightly) args = cli.parse(parser) dataset_type = cli.get_input_data_files(args) dataset_type = "irpa" if "irpa" in dataset_type else "gguf" @@ -312,7 +327,8 @@ def _( bsizes = [] for bs in args.bs: generate_batch_prefill(bs) - generate_batch_decode(bs) + if not args.skip_decode: + generate_batch_decode(bs) bsizes.append(bs) config = generate_params_json(hp, bsizes, bsizes) print("GENERATED!") diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index 49a942c66..bd33e1a62 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -160,6 +160,7 @@ def export_to_mlir( *, mlir_path: str, json_path: str, + skip_decode: Optional[bool] = None, ): export_args = [ "python3", @@ -170,6 +171,8 @@ def export_to_mlir( f"--output-config={json_path}", f"--bs={str(self.batch_size)}", ] + if skip_decode: + export_args.append("--skip-decode") if self.attention_kernel in ["decomposed", "torch"]: export_args.append("--attention-kernel") export_args.append(self.attention_kernel) diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py index e4e85defa..fa5a7eff9 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py @@ -180,6 +180,40 @@ def testBenchmark8B_f16_Decomposed(self): cwd=self.repo_root, ) + @skipif_run_8b_llama + def testBenchmark8B_f16_Non_Decomposed_Prefill(self): + output_file_name = self.dir_path_8b / "f16_torch_prefill" + output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name + ) + output_json = self.llama8b_f16_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama8b_f16_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + self.llama8b_f16_torch_sdpa_artifacts.attention_kernel = "torch" + export_return_code = self.llama8b_f16_torch_sdpa_artifacts.export_to_mlir( + mlir_path=output_mlir, + json_path=output_json, + skip_decode=True, + ) + self.llama8b_f16_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, + cwd=self.repo_root, + args=self.compile_args, + ) + # benchmark prefill + self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( + 
hip_device_id=self.hip_device_id, + vmfb_name=output_vmfb, + irpa_path=self.irpa_path, + args=self.iree_run_prefill_args, + cwd=self.repo_root, + ) + @skipif_run_8b_llama @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark8B_f16_Non_Decomposed(self): From c3522aa6fb227d7ff4efeece3f981ee38d88d7ae Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Tue, 12 Nov 2024 16:26:51 -0600 Subject: [PATCH 05/12] Update preprocess pass compile flag Signed-off-by: aviator19941 --- sharktank/tests/models/llama/benchmark_amdgpu_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py index fa5a7eff9..4f63661f6 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py @@ -54,7 +54,7 @@ def setUp(self): "--iree-global-opt-propagate-transposes=true", "--iree-opt-aggressively-propagate-transposes=true", "--iree-opt-data-tiling=false", - '--iree-preprocessing-pass-pipeline="builtin.module\\(util.func\\(iree-preprocessing-generalize-linalg-matmul-experimental\\)\\)"', + "--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))'", ] From 7695df17f6df5432787058819b56c517fb836c23 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Tue, 12 Nov 2024 16:28:15 -0600 Subject: [PATCH 06/12] Update iree nightly, add prefill torch sdpa test Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 4 ++-- .github/workflows/ci-llama-quick-tests.yaml | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index cdcbe2410..6e0b1e706 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -71,8 +71,8 @@ jobs: # Test with nightly releases, not what iree-turbine uses. pip install -f https://iree.dev/pip-release-links.html --upgrade \ - iree-base-compiler==2.9.0rc20241108 \ - iree-base-runtime==2.9.0rc20241108 \ + iree-base-compiler==2.9.1rc20241110 \ + iree-base-runtime==2.9.1rc20241110 \ "numpy<2.0" - name: Run llama tests diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 127077655..4d5fb2c68 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -72,8 +72,9 @@ jobs: # Test with nightly releases, not what iree-turbine uses. 
pip install -f https://iree.dev/pip-release-links.html --upgrade \ - iree-base-compiler==2.9.0rc20241108 \ - iree-base-runtime==2.9.0rc20241108 \ + iree-base-compiler==2.9.1rc20241110 \ + iree-base-runtime==2.9.1rc20241110 \ + "numpy<2.0" - name: Run llama 8b tests run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-8b-llama From c6df3dfcff37d28fc03dd797a896243fbd17dd1d Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Wed, 13 Nov 2024 19:57:54 -0600 Subject: [PATCH 07/12] Get nightly iree, keep iree-turbine numpy Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 9 ++++----- .github/workflows/ci-llama-quick-tests.yaml | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index 6e0b1e706..c8e0ac876 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -64,16 +64,15 @@ jobs: pip install --no-compile -r pytorch-cpu-requirements.txt pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - # Install latest iree-tubrine. + # Install latest iree-turbine. pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" # Test with nightly releases, not what iree-turbine uses. - pip install -f https://iree.dev/pip-release-links.html --upgrade \ - iree-base-compiler==2.9.1rc20241110 \ - iree-base-runtime==2.9.1rc20241110 \ - "numpy<2.0" + pip install --upgrade --pre --no-cache-dir -f https://iree.dev/pip-release-links.html \ + iree-base-compiler \ + iree-base-runtime \ - name: Run llama tests run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-all-llama --iree-hip-target=gfx942 --html=out/index.html diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 4d5fb2c68..3e2921c4c 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -65,16 +65,15 @@ jobs: pip install --no-compile -r pytorch-cpu-requirements.txt pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - # Install latest iree-tubrine. + # Install latest iree-turbine. pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" # Test with nightly releases, not what iree-turbine uses. 
- pip install -f https://iree.dev/pip-release-links.html --upgrade \ - iree-base-compiler==2.9.1rc20241110 \ - iree-base-runtime==2.9.1rc20241110 \ - "numpy<2.0" + pip install --upgrade --pre --no-cache-dir -f https://iree.dev/pip-release-links.html \ + iree-base-compiler \ + iree-base-runtime \ - name: Run llama 8b tests run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-8b-llama From d8872e2858f777e08f94d7267e23e39d24eb5ba0 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Fri, 15 Nov 2024 18:55:28 -0600 Subject: [PATCH 08/12] Update iree packages to nightly, fix name of test, create README Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 4 ++-- .github/workflows/ci-llama-quick-tests.yaml | 6 +++--- sharktank/conftest.py | 10 +++++----- sharktank/tests/models/llama/README.md | 14 ++++++++++++++ .../tests/models/llama/benchmark_amdgpu_test.py | 14 +++++++------- 5 files changed, 31 insertions(+), 17 deletions(-) create mode 100644 sharktank/tests/models/llama/README.md diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index c8e0ac876..0b4e42eaa 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -70,12 +70,12 @@ jobs: # Test with nightly releases, not what iree-turbine uses. - pip install --upgrade --pre --no-cache-dir -f https://iree.dev/pip-release-links.html \ + pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \ iree-base-compiler \ iree-base-runtime \ - name: Run llama tests - run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-all-llama --iree-hip-target=gfx942 --html=out/index.html + run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --html=out/index.html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 3e2921c4c..568c4d295 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -71,12 +71,12 @@ jobs: # Test with nightly releases, not what iree-turbine uses. 
-          pip install --upgrade --pre --no-cache-dir -f https://iree.dev/pip-release-links.html \
+          pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
             iree-base-compiler \
             iree-base-runtime \
-      - name: Run llama 8b tests
-        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-8b-llama
+      - name: Run llama 8b f16 decomposed test
+        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-quick-llama-test
       - name: Upload llama executable files
         uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
diff --git a/sharktank/conftest.py b/sharktank/conftest.py
index ed09a1fd1..ca12c3d2c 100644
--- a/sharktank/conftest.py
+++ b/sharktank/conftest.py
@@ -73,17 +73,17 @@ def pytest_addoption(parser):
     )

     parser.addoption(
-        "--run-8b-llama",
+        "--run-quick-llama-test",
         action="store_true",
-        dest="run-8b-llama",
+        dest="--run-quick-llama-test",
         default=False,
-        help="Enable llama 8b benchmarking tests",
+        help="Enable llama 8b f16 decomposed benchmarking test",
     )

     parser.addoption(
-        "--run-all-llama",
+        "--run-nightly-llama-tests",
         action="store_true",
-        dest="run-all-llama",
+        dest="run-nightly-llama-tests",
         default=False,
         help="Enable all llama benchmarking tests",
     )
diff --git a/sharktank/tests/models/llama/README.md b/sharktank/tests/models/llama/README.md
new file mode 100644
index 000000000..6adf38588
--- /dev/null
+++ b/sharktank/tests/models/llama/README.md
@@ -0,0 +1,14 @@
+# How to run Llama 3.1 Benchmarking Tests
+In order to run the Llama 3.1 8B F16 Decomposed test:
+```
+pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s \
+  --run-quick-llama-test --iree-hip-target=gfx942
+```
+
+In order to filter by test, use the -k option.
If you
+want to only run the Llama 3.1 70B F16 Decomposed test:
+```
+pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s \
+  --run-nightly-llama-tests --iree-hip-target=gfx942 \
+  -k 'testBenchmark70B_f16_TP8_Decomposed'
+```
diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
index 4f63661f6..2b8377424 100644
--- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py
+++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -21,9 +21,9 @@
 )

 is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
-skipif_run_8b_llama = pytest.mark.skipif(
-    'config.getoption("run-8b-llama") and not config.getoption("run-all-llama")',
-    reason="Skipping largs tests when --run-8b is set.",
+skipif_run_quick_llama_test = pytest.mark.skipif(
+    'config.getoption("run-quick-llama-test") and not config.getoption("run-nightly-llama-tests")',
+    reason="Skipping large tests when --run-quick-llama-test is set.",
 )
@@ -180,7 +180,7 @@ def testBenchmark8B_f16_Decomposed(self):
             cwd=self.repo_root,
         )

-    @skipif_run_8b_llama
+    @skipif_run_quick_llama_test
     def testBenchmark8B_f16_Non_Decomposed_Prefill(self):
         output_file_name = self.dir_path_8b / "f16_torch_prefill"
         output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file(
@@ -214,7 +214,7 @@ def testBenchmark8B_f16_Non_Decomposed_Prefill(self):
             cwd=self.repo_root,
         )

-    @skipif_run_8b_llama
+    @skipif_run_quick_llama_test
     @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
     def testBenchmark8B_f16_Non_Decomposed(self):
         output_file_name = self.dir_path_8b / "f16_torch"
@@ -342,7 +342,7 @@ def testBenchmark8B_fp8_Non_Decomposed(self):

 @is_mi300x
-@skipif_run_8b_llama
+@skipif_run_quick_llama_test
 class BenchmarkLlama3_1_70B(BaseBenchmarkTest):
     def setUp(self):
         super().setUp()
@@ -622,7 +622,7 @@ def testBenchmark70B_fp8_TP8_Non_Decomposed(self):

 @is_mi300x
-@skipif_run_8b_llama
+@skipif_run_quick_llama_test
 class BenchmarkLlama3_1_405B(BaseBenchmarkTest):
     def setUp(self):
         super().setUp()
From 933d4a6a644da99c3e451df7011857f9133631ad Mon Sep 17 00:00:00 2001
From: aviator19941
Date: Fri, 15 Nov 2024 19:05:55 -0600
Subject: [PATCH 09/12] Fix pytest option dest and missed merge conflict

Signed-off-by: aviator19941
---
 sharktank/conftest.py | 2 +-
 sharktank/sharktank/examples/export_paged_llm_v1.py | 7 ++-----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/sharktank/conftest.py b/sharktank/conftest.py
index ca12c3d2c..b2383055f 100644
--- a/sharktank/conftest.py
+++ b/sharktank/conftest.py
@@ -75,7 +75,7 @@ def pytest_addoption(parser):
     parser.addoption(
         "--run-quick-llama-test",
         action="store_true",
-        dest="--run-quick-llama-test",
+        dest="run-quick-llama-test",
         default=False,
         help="Enable llama 8b f16 decomposed benchmarking test",
     )
diff --git a/sharktank/sharktank/examples/export_paged_llm_v1.py b/sharktank/sharktank/examples/export_paged_llm_v1.py
index d51c1de52..d5975035a 100644
--- a/sharktank/sharktank/examples/export_paged_llm_v1.py
+++ b/sharktank/sharktank/examples/export_paged_llm_v1.py
@@ -54,10 +54,6 @@ def main():
         help="Enables strictness during export",
         action="store_true",
     )
-<<<<<<< HEAD
-    cli.add_quantization_options(parser)
-    cli.add_model_options(parser)
-=======
     parser.add_argument(
         "--attention-kernel",
         type=str,
@@ -70,7 +66,8 @@ def main():
         action="store_true",
     )

->>>>>>> c745549 (Add non-decomposed 8b f16 prefill only test nightly)
+    cli.add_quantization_options(parser)
+
cli.add_model_options(parser) args = cli.parse(parser) dataset_type = cli.get_input_data_files(args) dataset_type = "irpa" if "irpa" in dataset_type else "gguf" From 09c5220f8d3a9a9257314b4a011e3df867a4cd7c Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Fri, 15 Nov 2024 19:08:58 -0600 Subject: [PATCH 10/12] Move --skip-decode to model cli helper Signed-off-by: aviator19941 --- sharktank/sharktank/examples/export_paged_llm_v1.py | 11 ----------- sharktank/sharktank/utils/cli.py | 5 +++++ 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/sharktank/sharktank/examples/export_paged_llm_v1.py b/sharktank/sharktank/examples/export_paged_llm_v1.py index d5975035a..791bce87c 100644 --- a/sharktank/sharktank/examples/export_paged_llm_v1.py +++ b/sharktank/sharktank/examples/export_paged_llm_v1.py @@ -54,17 +54,6 @@ def main(): help="Enables strictness during export", action="store_true", ) - parser.add_argument( - "--attention-kernel", - type=str, - default="decomposed", - choices=["decomposed", "torch"], - ) - parser.add_argument( - "--skip-decode", - help="Enables prefill only, skips decode", - action="store_true", - ) cli.add_quantization_options(parser) cli.add_model_options(parser) diff --git a/sharktank/sharktank/utils/cli.py b/sharktank/sharktank/utils/cli.py index 84ee741bf..bc0b3b0b6 100644 --- a/sharktank/sharktank/utils/cli.py +++ b/sharktank/sharktank/utils/cli.py @@ -69,6 +69,11 @@ def add_model_options(parser: argparse.ArgumentParser): default="decomposed", choices=["decomposed", "torch"], ) + parser.add_argument( + "--skip-decode", + help="Enables prefill only, skips decode", + action="store_true", + ) def add_quantization_options(parser: argparse.ArgumentParser): From ecf4026368fd3c080c284d8e32dc051f8adde1b1 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Tue, 19 Nov 2024 10:43:40 -0600 Subject: [PATCH 11/12] Update tests, TODO: decomposed failing with OUT OF RANGE command buffer validation Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 2 +- .github/workflows/ci-llama-quick-tests.yaml | 2 +- .../models/llama/benchmark_amdgpu_test.py | 73 +++++++++++++------ 3 files changed, 51 insertions(+), 26 deletions(-) diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index 0b4e42eaa..649e3d173 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -72,7 +72,7 @@ jobs: # Test with nightly releases, not what iree-turbine uses. pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \ iree-base-compiler \ - iree-base-runtime \ + iree-base-runtime - name: Run llama tests run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --html=out/index.html diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 568c4d295..a8c315ec8 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -73,7 +73,7 @@ jobs: # Test with nightly releases, not what iree-turbine uses. 
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \ iree-base-compiler \ - iree-base-runtime \ + iree-base-runtime - name: Run llama 8b f16 decomposed test run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-quick-llama-test diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py index 2b8377424..c06d94f6a 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py @@ -55,6 +55,9 @@ def setUp(self): "--iree-opt-aggressively-propagate-transposes=true", "--iree-opt-data-tiling=false", "--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))'", + "--iree-hal-force-indirect-command-buffers=true", + "--iree-stream-resource-memory-model=discrete", + "--iree-hip-legacy-sync=false", ] @@ -63,9 +66,9 @@ class BenchmarkLlama3_1_8B(BaseBenchmarkTest): def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it - self.artifacts_dir = Path("/data/llama-3.1/weights/8b") + self.artifacts_dir = Path("/data/llama3.1/weights/8b") self.irpa_path = self.artifacts_dir / "fp16/llama3.1_8b_fp16.irpa" - self.irpa_path_fp8 = self.artifacts_dir / "f8/llama8b_fp8.irpa" + self.irpa_path_fp8 = self.artifacts_dir / "f8/llama3.1_8b_fp8.irpa" self.tensor_parallelism_size = 1 self.dir_path_8b = self.dir_path / "llama-8b" self.temp_dir_8b = Path(self.dir_path_8b) @@ -103,6 +106,9 @@ def setUp(self): tensor_parallelism_size=self.tensor_parallelism_size, ) self.prefill_args_f16 = self.artifacts_dir / "prefill_args" + self.prefill_args_bs4_128_in_tokens_f16 = ( + self.artifacts_dir / "prefill_args_bs4_128" + ) self.decode_args_f16 = self.artifacts_dir / "decode_args" self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8" self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8" @@ -114,6 +120,14 @@ def setUp(self): f"--input=@{self.prefill_args_f16}/cache_state_f16.npy", "--benchmark_repetitions=3", ] + self.iree_run_prefill_nondecomposed_args_fp16 = [ + "--function=prefill_bs4", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/random_tokens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_lens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_block_ids.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/cs_f16.npy", + "--benchmark_repetitions=3", + ] self.iree_run_decode_args = [ "--function=decode_bs4", f"--input=@{self.decode_args_f16}/tokens.npy", @@ -181,6 +195,7 @@ def testBenchmark8B_f16_Decomposed(self): ) @skipif_run_quick_llama_test + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark8B_f16_Non_Decomposed_Prefill(self): output_file_name = self.dir_path_8b / "f16_torch_prefill" output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file( @@ -210,7 +225,7 @@ def testBenchmark8B_f16_Non_Decomposed_Prefill(self): hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, - args=self.iree_run_prefill_args, + args=self.iree_run_prefill_nondecomposed_args_fp16, cwd=self.repo_root, ) @@ -256,9 +271,7 @@ def testBenchmark8B_f16_Non_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=ExportMlirException - ) + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def 
testBenchmark8B_fp8_Decomposed(self): output_file_name = self.dir_path_8b / "fp8_decomposed" output_mlir = self.llama8b_fp8_decomposed_artifacts.create_file( @@ -298,9 +311,7 @@ def testBenchmark8B_fp8_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Compile failure", strict=True, raises=ExportMlirException - ) + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark8B_fp8_Non_Decomposed(self): output_file_name = self.dir_path_8b / "fp8_torch" output_mlir = self.llama8b_fp8_torch_sdpa_artifacts.create_file( @@ -347,7 +358,7 @@ class BenchmarkLlama3_1_70B(BaseBenchmarkTest): def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it - self.artifacts_dir = Path("/data/llama-3.1/weights/70b") + self.artifacts_dir = Path("/data/llama3.1/weights/70b") self.irpa_path = self.artifacts_dir / "fp16/llama3.1_70b_f16.irpa" self.irpa_path_fp8 = self.artifacts_dir / "f8/llama70b_fp8.irpa" self.tensor_parallelism_size = 8 @@ -387,6 +398,9 @@ def setUp(self): tensor_parallelism_size=self.tensor_parallelism_size, ) self.prefill_args_f16 = self.artifacts_dir / "prefill_args" + self.prefill_args_bs4_128_in_tokens_f16 = ( + self.artifacts_dir / "prefill_args_bs4_128" + ) self.decode_args_f16 = self.artifacts_dir / "decode_args" self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8" self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8" @@ -398,6 +412,14 @@ def setUp(self): f"--input=@{self.prefill_args_f16}/cache_state_f16.npy", "--benchmark_repetitions=3", ] + self.iree_run_prefill_nondecomposed_args_fp16 = [ + "--function=prefill_bs4", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/random_tokens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_lens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_block_ids.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/cs_f16.npy", + "--benchmark_repetitions=3", + ] self.iree_run_decode_args = [ "--function=decode_bs4", f"--input=@{self.decode_args_f16}/tokens.npy", @@ -524,9 +546,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=ExportMlirException - ) + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark70B_fp8_TP8_Decomposed(self): output_file_name = self.dir_path_70b / "fp8_decomposed" output_mlir = self.llama70b_fp8_decomposed_artifacts.create_file( @@ -572,9 +592,7 @@ def testBenchmark70B_fp8_TP8_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=ExportMlirException - ) + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark70B_fp8_TP8_Non_Decomposed(self): output_file_name = self.dir_path_70b / "fp8_torch" output_mlir = self.llama70b_fp8_torch_sdpa_artifacts.create_file( @@ -627,9 +645,9 @@ class BenchmarkLlama3_1_405B(BaseBenchmarkTest): def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it - self.artifacts_dir = Path("/data/llama-3.1/weights/405b") + self.artifacts_dir = Path("/data/llama3.1/weights/405b") self.irpa_path = self.artifacts_dir / "fp16/llama3.1_405b_fp16.irpa" - self.irpa_path_fp8 = self.artifacts_dir / "f8/llama405b_fp8.irpa" + self.irpa_path_fp8 = self.artifacts_dir / "f8/llama3.1_405b_fp8.irpa" self.tensor_parallelism_size = 8 self.dir_path_405b = self.dir_path / "llama-405b" 
self.temp_dir_405b = Path(self.dir_path_405b) @@ -667,6 +685,9 @@ def setUp(self): tensor_parallelism_size=self.tensor_parallelism_size, ) self.prefill_args_f16 = self.artifacts_dir / "prefill_args" + self.prefill_args_bs4_128_in_tokens_f16 = ( + self.artifacts_dir / "prefill_args_bs4_128" + ) self.decode_args_f16 = self.artifacts_dir / "decode_args" self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8" self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8" @@ -678,6 +699,14 @@ def setUp(self): f"--input=@{self.prefill_args_f16}/cache_state_f16.npy", "--benchmark_repetitions=3", ] + self.iree_run_prefill_nondecomposed_args_fp16 = [ + "--function=prefill_bs4", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/random_tokens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_lens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_block_ids.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/cs_f16.npy", + "--benchmark_repetitions=3", + ] self.iree_run_decode_args = [ "--function=decode_bs4", f"--input=@{self.decode_args_f16}/tokens.npy", @@ -799,9 +828,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=ExportMlirException - ) + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark405B_fp8_TP8_Decomposed(self): output_file_name = self.dir_path_405b / "fp8_decomposed" output_mlir = self.llama405b_fp8_decomposed_artifacts.create_file( @@ -847,9 +874,7 @@ def testBenchmark405B_fp8_TP8_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=ExportMlirException - ) + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark405B_fp8_TP8_Non_Decomposed(self): output_file_name = self.dir_path_405b / "fp8_torch" output_mlir = self.llama405b_fp8_torch_sdpa_artifacts.create_file( From 350ff4747f0a1af1ba91f2c7057efc51e19a7a91 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Tue, 19 Nov 2024 19:24:26 -0600 Subject: [PATCH 12/12] Update flags and github pages publish_dir and destination_dir Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 4 +++- .../tests/models/llama/benchmark_amdgpu_test.py | 12 ++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index 649e3d173..db658f801 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -81,7 +81,9 @@ jobs: uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 with: github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }} - publish_dir: ./out + publish_dir: ./out/llm/llama/benchmarks + destination_dir: ./llm/llama/benchmarks + keep_files: true - name: Upload llama executable files uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py index c06d94f6a..125a0cfdc 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py @@ -55,9 +55,11 @@ def setUp(self): "--iree-opt-aggressively-propagate-transposes=true", "--iree-opt-data-tiling=false", 
"--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))'", - "--iree-hal-force-indirect-command-buffers=true", "--iree-stream-resource-memory-model=discrete", "--iree-hip-legacy-sync=false", + "--iree-hal-indirect-command-buffers=true", + "--iree-hal-memoization=true", + "--iree-opt-strip-assertions", ] @@ -446,11 +448,6 @@ def setUp(self): f"--input=@{self.decode_args_fp8}/cache_state_f16.npy", "--benchmark_repetitions=3", ] - self.compile_args += [ - "--iree-hal-force-indirect-command-buffers=true", - "--iree-stream-resource-memory-model=discrete", - "--iree-hip-legacy-sync=false", - ] @pytest.mark.xfail( reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException @@ -512,6 +509,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self): output_vmfb = self.llama70b_f16_torch_sdpa_artifacts.create_file( suffix=".vmfb", prefix=output_file_name ) + self.llama70b_f16_torch_sdpa_artifacts.attention_kernel = "torch" output_shard_file_name = ( self.artifacts_dir / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" @@ -794,6 +792,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self): output_vmfb = self.llama405b_f16_torch_sdpa_artifacts.create_file( suffix=".vmfb", prefix=output_file_name ) + self.llama405b_f16_torch_sdpa_artifacts.attention_kernel = "torch" output_shard_file_name = ( self.artifacts_dir / f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" @@ -803,6 +802,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self): export_return_code = self.llama405b_f16_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, json_path=output_json, + skip_decode=True, ) self.llama405b_f16_torch_sdpa_artifacts.compile_to_vmfb( mlir_path=str(output_mlir),