diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml
index 649e3d173..db658f801 100644
--- a/.github/workflows/ci-llama-large-tests.yaml
+++ b/.github/workflows/ci-llama-large-tests.yaml
@@ -81,7 +81,9 @@ jobs:
         uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
         with:
           github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
-          publish_dir: ./out
+          publish_dir: ./out/llm/llama/benchmarks
+          destination_dir: ./llm/llama/benchmarks
+          keep_files: true

       - name: Upload llama executable files
         uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
index c06d94f6a..125a0cfdc 100644
--- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py
+++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -55,9 +55,11 @@ def setUp(self):
             "--iree-opt-aggressively-propagate-transposes=true",
             "--iree-opt-data-tiling=false",
             "--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))'",
-            "--iree-hal-force-indirect-command-buffers=true",
             "--iree-stream-resource-memory-model=discrete",
             "--iree-hip-legacy-sync=false",
+            "--iree-hal-indirect-command-buffers=true",
+            "--iree-hal-memoization=true",
+            "--iree-opt-strip-assertions",
         ]

@@ -446,11 +448,6 @@ def setUp(self):
             f"--input=@{self.decode_args_fp8}/cache_state_f16.npy",
             "--benchmark_repetitions=3",
         ]
-        self.compile_args += [
-            "--iree-hal-force-indirect-command-buffers=true",
-            "--iree-stream-resource-memory-model=discrete",
-            "--iree-hip-legacy-sync=false",
-        ]

     @pytest.mark.xfail(
         reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
@@ -512,6 +509,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self):
         output_vmfb = self.llama70b_f16_torch_sdpa_artifacts.create_file(
             suffix=".vmfb", prefix=output_file_name
         )
+        self.llama70b_f16_torch_sdpa_artifacts.attention_kernel = "torch"
         output_shard_file_name = (
             self.artifacts_dir
             / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
@@ -794,6 +792,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self):
         output_vmfb = self.llama405b_f16_torch_sdpa_artifacts.create_file(
             suffix=".vmfb", prefix=output_file_name
         )
+        self.llama405b_f16_torch_sdpa_artifacts.attention_kernel = "torch"
         output_shard_file_name = (
             self.artifacts_dir
             / f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
@@ -803,6 +802,7 @@
         export_return_code = self.llama405b_f16_torch_sdpa_artifacts.export_to_mlir(
             mlir_path=output_mlir,
             json_path=output_json,
+            skip_decode=True,
         )
         self.llama405b_f16_torch_sdpa_artifacts.compile_to_vmfb(
             mlir_path=str(output_mlir),