diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml
index 649e3d173..db658f801 100644
--- a/.github/workflows/ci-llama-large-tests.yaml
+++ b/.github/workflows/ci-llama-large-tests.yaml
@@ -81,7 +81,9 @@ jobs:
         uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
         with:
           github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
-          publish_dir: ./out
+          publish_dir: ./out/llm/llama/benchmarks
+          destination_dir: ./llm/llama/benchmarks
+          keep_files: true

       - name: Upload llama executable files
         uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
index c06d94f6a..125a0cfdc 100644
--- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py
+++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -55,9 +55,11 @@ def setUp(self):
             "--iree-opt-aggressively-propagate-transposes=true",
             "--iree-opt-data-tiling=false",
             "--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))'",
-            "--iree-hal-force-indirect-command-buffers=true",
             "--iree-stream-resource-memory-model=discrete",
             "--iree-hip-legacy-sync=false",
+            "--iree-hal-indirect-command-buffers=true",
+            "--iree-hal-memoization=true",
+            "--iree-opt-strip-assertions",
         ]

@@ -446,11 +448,6 @@ def setUp(self):
             f"--input=@{self.decode_args_fp8}/cache_state_f16.npy",
             "--benchmark_repetitions=3",
         ]
-        self.compile_args += [
-            "--iree-hal-force-indirect-command-buffers=true",
-            "--iree-stream-resource-memory-model=discrete",
-            "--iree-hip-legacy-sync=false",
-        ]

     @pytest.mark.xfail(
         reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
@@ -512,6 +509,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self):
         output_vmfb = self.llama70b_f16_torch_sdpa_artifacts.create_file(
             suffix=".vmfb", prefix=output_file_name
         )
+        self.llama70b_f16_torch_sdpa_artifacts.attention_kernel = "torch"
         output_shard_file_name = (
             self.artifacts_dir
             / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
@@ -794,6 +792,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self):
         output_vmfb = self.llama405b_f16_torch_sdpa_artifacts.create_file(
             suffix=".vmfb", prefix=output_file_name
         )
+        self.llama405b_f16_torch_sdpa_artifacts.attention_kernel = "torch"
         output_shard_file_name = (
             self.artifacts_dir
             / f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
@@ -803,6 +802,7 @@
         export_return_code = self.llama405b_f16_torch_sdpa_artifacts.export_to_mlir(
             mlir_path=output_mlir,
             json_path=output_json,
+            skip_decode=True,
         )
         self.llama405b_f16_torch_sdpa_artifacts.compile_to_vmfb(
             mlir_path=str(output_mlir),