diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml
index 6a3b764b8..db658f801 100644
--- a/.github/workflows/ci-llama-large-tests.yaml
+++ b/.github/workflows/ci-llama-large-tests.yaml
@@ -64,23 +64,26 @@ jobs:
           pip install --no-compile -r pytorch-cpu-requirements.txt
           pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/

-          # Install latest iree-tubrine.
+          # Install latest iree-turbine.
           pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
             -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"

-          # Test with pinned nightly releases, not what iree-turbine uses.
-          pip install -f https://iree.dev/pip-release-links.html --upgrade \
-            iree-base-compiler==3.0.0rc20241118 \
-            iree-base-runtime==3.0.0rc20241118
+
+          # Test with nightly releases, not what iree-turbine uses.
+          pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
+            iree-base-compiler \
+            iree-base-runtime

       - name: Run llama tests
-        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-all-llama --iree-hip-target=gfx942 --html=out/index.html
+        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --html=out/index.html

       - name: Deploy to GitHub Pages
         uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
         with:
           github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
-          publish_dir: ./out
+          publish_dir: ./out/llm/llama/benchmarks
+          destination_dir: ./llm/llama/benchmarks
+          keep_files: true

       - name: Upload llama executable files
         uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml
index 6c381b658..a8c315ec8 100644
--- a/.github/workflows/ci-llama-quick-tests.yaml
+++ b/.github/workflows/ci-llama-quick-tests.yaml
@@ -65,17 +65,18 @@ jobs:
           pip install --no-compile -r pytorch-cpu-requirements.txt
           pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/

-          # Install latest iree-tubrine.
+          # Install latest iree-turbine.
           pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
             -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"

-          # Test with pinned nightly releases, not what iree-turbine uses.
-          pip install -f https://iree.dev/pip-release-links.html --upgrade \
-            iree-base-compiler==3.0.0rc20241118 \
-            iree-base-runtime==3.0.0rc20241118
-      - name: Run llama 8b tests
-        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-8b-llama
+          # Test with nightly releases, not what iree-turbine uses.
+          pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
+            iree-base-compiler \
+            iree-base-runtime
+
+      - name: Run llama 8b f16 decomposed test
+        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-quick-llama-test

       - name: Upload llama executable files
         uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
diff --git a/sharktank/conftest.py b/sharktank/conftest.py
index ed09a1fd1..b2383055f 100644
--- a/sharktank/conftest.py
+++ b/sharktank/conftest.py
@@ -73,17 +73,17 @@ def pytest_addoption(parser):
     )

     parser.addoption(
-        "--run-8b-llama",
+        "--run-quick-llama-test",
         action="store_true",
-        dest="run-8b-llama",
+        dest="run-quick-llama-test",
         default=False,
-        help="Enable llama 8b benchmarking tests",
+        help="Enable llama 8b f16 decomposed benchmarking test",
     )

     parser.addoption(
-        "--run-all-llama",
+        "--run-nightly-llama-tests",
         action="store_true",
-        dest="run-all-llama",
+        dest="run-nightly-llama-tests",
         default=False,
         help="Enable all llama benchmarking tests",
     )
diff --git a/sharktank/sharktank/examples/export_paged_llm_v1.py b/sharktank/sharktank/examples/export_paged_llm_v1.py
index a740f0bff..791bce87c 100644
--- a/sharktank/sharktank/examples/export_paged_llm_v1.py
+++ b/sharktank/sharktank/examples/export_paged_llm_v1.py
@@ -54,6 +54,7 @@ def main():
         help="Enables strictness during export",
         action="store_true",
     )
+    cli.add_quantization_options(parser)
     cli.add_model_options(parser)
     args = cli.parse(parser)

@@ -312,7 +313,8 @@ def _(
     bsizes = []
     for bs in args.bs:
         generate_batch_prefill(bs)
-        generate_batch_decode(bs)
+        if not args.skip_decode:
+            generate_batch_decode(bs)
         bsizes.append(bs)
     config = generate_params_json(hp, bsizes, bsizes)
     print("GENERATED!")
diff --git a/sharktank/sharktank/utils/cli.py b/sharktank/sharktank/utils/cli.py
index 84ee741bf..bc0b3b0b6 100644
--- a/sharktank/sharktank/utils/cli.py
+++ b/sharktank/sharktank/utils/cli.py
@@ -69,6 +69,11 @@ def add_model_options(parser: argparse.ArgumentParser):
         default="decomposed",
         choices=["decomposed", "torch"],
     )
+    parser.add_argument(
+        "--skip-decode",
+        help="Enables prefill only, skips decode",
+        action="store_true",
+    )


 def add_quantization_options(parser: argparse.ArgumentParser):
diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py
index 9deade56c..bd33e1a62 100644
--- a/sharktank/sharktank/utils/export_artifacts.py
+++ b/sharktank/sharktank/utils/export_artifacts.py
@@ -123,15 +123,15 @@ def wrapper(*args, **kwargs):
     def shard_irpa_file(
         self,
         *,
-        gguf_file: str,
+        irpa_file: str,
         output_irpa: str,
     ):
         shard_irpa_args = [
             "python3",
             "-m",
             "sharktank.examples.sharding.shard_llm_dataset",
-            "--gguf-file",
-            gguf_file,
+            "--irpa-file",
+            irpa_file,
             "--output-irpa-file",
             output_irpa,
             "--tensor-parallelism-size",
@@ -160,6 +160,7 @@ def export_to_mlir(
         *,
         mlir_path: str,
         json_path: str,
+        skip_decode: Optional[bool] = None,
     ):
         export_args = [
             "python3",
             "-m",
             "sharktank.examples.export_paged_llm_v1",
             f"--output-config={json_path}",
             f"--bs={str(self.batch_size)}",
         ]
@@ -170,6 +171,8 @@ def export_to_mlir(
+        if skip_decode:
+            export_args.append("--skip-decode")
         if self.attention_kernel in ["decomposed", "torch"]:
             export_args.append("--attention-kernel")
             export_args.append(self.attention_kernel)
@@ -195,6 +198,7 @@ def compile_to_vmfb(
         vmfb_path,
         cwd,
         hal_dump_path: Optional[Path] = None,
+        args: Optional[List[str]] = None,
     ):
         # TODO: Control flag to enable multiple backends
         compile_args = [
@@ -214,7 +218,9 @@ def compile_to_vmfb(
             compile_args += [
                 f"--iree-hal-dump-executable-files-to={hal_dump_path}/files"
             ]
-
+        # Append optional arguments if provided
+        if args:
+            compile_args += args
         cmd = subprocess.list2cmdline(compile_args)

         logging.getLogger().info(f"Launching compile command:\n" f"cd {cwd} && {cmd}")
diff --git a/sharktank/tests/models/llama/README.md b/sharktank/tests/models/llama/README.md
new file mode 100644
index 000000000..6adf38588
--- /dev/null
+++ b/sharktank/tests/models/llama/README.md
@@ -0,0 +1,14 @@
+# How to run Llama 3.1 Benchmarking Tests
+To run the Llama 3.1 8B F16 Decomposed test:
+```
+pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s \
+  --run-quick-llama-test --iree-hip-target=gfx942
+```
+
+To filter by test, use the -k option. For example, to run
+only the Llama 3.1 70B F16 Decomposed test:
+```
+pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s \
+  --run-nightly-llama-tests --iree-hip-target=gfx942 \
+  -k 'testBenchmark70B_f16_TP8_Decomposed'
+```
diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
index f70607832..125a0cfdc 100644
--- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py
+++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -21,9 +21,9 @@
 )

 is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
-skipif_run_8b_llama = pytest.mark.skipif(
-    'config.getoption("run-8b-llama") and not config.getoption("run-all-llama")',
-    reason="Skipping largs tests when --run-8b is set.",
+skipif_run_quick_llama_test = pytest.mark.skipif(
+    'config.getoption("run-quick-llama-test") and not config.getoption("run-nightly-llama-tests")',
+    reason="Skipping large tests when --run-quick-llama-test is set.",
 )


@@ -49,6 +49,18 @@ def setUpClass(cls):

     def setUp(self):
         self.hip_device_id = os.getenv("HIP_DEVICE_ID", default="0")
+        self.compile_args = [
+            "--iree-dispatch-creation-enable-aggressive-fusion=true",
+            "--iree-global-opt-propagate-transposes=true",
+            "--iree-opt-aggressively-propagate-transposes=true",
+            "--iree-opt-data-tiling=false",
+            "--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))'",
+            "--iree-stream-resource-memory-model=discrete",
+            "--iree-hip-legacy-sync=false",
+            "--iree-hal-indirect-command-buffers=true",
+            "--iree-hal-memoization=true",
+            "--iree-opt-strip-assertions",
+        ]


 @is_mi300x
@@ -56,10 +68,9 @@ class BenchmarkLlama3_1_8B(BaseBenchmarkTest):
     def setUp(self):
         super().setUp()
         # TODO: add numpy files to Azure and download from it
-        self.artifacts_dir = Path("/data/llama-3.1/weights/8b")
-        self.gguf_path = self.artifacts_dir / "fp16/llama3.1_8b_fp16.gguf"
+        self.artifacts_dir = Path("/data/llama3.1/weights/8b")
         self.irpa_path = self.artifacts_dir / "fp16/llama3.1_8b_fp16.irpa"
-        self.irpa_path_fp8 = self.artifacts_dir / "f8/llama8b_fp8.irpa"
+        self.irpa_path_fp8 = self.artifacts_dir / "f8/llama3.1_8b_fp8.irpa"
         self.tensor_parallelism_size = 1
         self.dir_path_8b = self.dir_path / "llama-8b"
         self.temp_dir_8b = Path(self.dir_path_8b)
@@ -97,6 +108,9 @@ def setUp(self):
             tensor_parallelism_size=self.tensor_parallelism_size,
         )
         self.prefill_args_f16 = self.artifacts_dir / "prefill_args"
+        self.prefill_args_bs4_128_in_tokens_f16 = (
+            self.artifacts_dir / "prefill_args_bs4_128"
+        )
         self.decode_args_f16 = self.artifacts_dir / "decode_args"
         self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8"
         self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8"
/ "decode_args_fp8" @@ -108,6 +122,14 @@ def setUp(self): f"--input=@{self.prefill_args_f16}/cache_state_f16.npy", "--benchmark_repetitions=3", ] + self.iree_run_prefill_nondecomposed_args_fp16 = [ + "--function=prefill_bs4", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/random_tokens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_lens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_block_ids.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/cs_f16.npy", + "--benchmark_repetitions=3", + ] self.iree_run_decode_args = [ "--function=decode_bs4", f"--input=@{self.decode_args_f16}/tokens.npy", @@ -155,6 +177,7 @@ def testBenchmark8B_f16_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama8b_f16_decomposed_artifacts.iree_benchmark_vmfb( @@ -173,6 +196,42 @@ def testBenchmark8B_f16_Decomposed(self): cwd=self.repo_root, ) + @skipif_run_quick_llama_test + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) + def testBenchmark8B_f16_Non_Decomposed_Prefill(self): + output_file_name = self.dir_path_8b / "f16_torch_prefill" + output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name + ) + output_json = self.llama8b_f16_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama8b_f16_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + self.llama8b_f16_torch_sdpa_artifacts.attention_kernel = "torch" + export_return_code = self.llama8b_f16_torch_sdpa_artifacts.export_to_mlir( + mlir_path=output_mlir, + json_path=output_json, + skip_decode=True, + ) + self.llama8b_f16_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, + cwd=self.repo_root, + args=self.compile_args, + ) + # benchmark prefill + self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( + hip_device_id=self.hip_device_id, + vmfb_name=output_vmfb, + irpa_path=self.irpa_path, + args=self.iree_run_prefill_nondecomposed_args_fp16, + cwd=self.repo_root, + ) + + @skipif_run_quick_llama_test @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark8B_f16_Non_Decomposed(self): output_file_name = self.dir_path_8b / "f16_torch" @@ -195,6 +254,7 @@ def testBenchmark8B_f16_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( @@ -213,9 +273,7 @@ def testBenchmark8B_f16_Non_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=ExportMlirException - ) + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark8B_fp8_Decomposed(self): output_file_name = self.dir_path_8b / "fp8_decomposed" output_mlir = self.llama8b_fp8_decomposed_artifacts.create_file( @@ -236,6 +294,7 @@ def testBenchmark8B_fp8_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama8b_fp8_decomposed_artifacts.iree_benchmark_vmfb( @@ -254,9 +313,7 @@ def testBenchmark8B_fp8_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Compile failure", strict=True, raises=ExportMlirException - ) + 
@pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark8B_fp8_Non_Decomposed(self): output_file_name = self.dir_path_8b / "fp8_torch" output_mlir = self.llama8b_fp8_torch_sdpa_artifacts.create_file( @@ -277,6 +334,7 @@ def testBenchmark8B_fp8_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( @@ -297,13 +355,12 @@ def testBenchmark8B_fp8_Non_Decomposed(self): @is_mi300x -@skipif_run_8b_llama +@skipif_run_quick_llama_test class BenchmarkLlama3_1_70B(BaseBenchmarkTest): def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it - self.artifacts_dir = Path("/data/llama-3.1/weights/70b") - self.gguf_path = self.artifacts_dir / "fp16/llama3.1_70b_f16.gguf" + self.artifacts_dir = Path("/data/llama3.1/weights/70b") self.irpa_path = self.artifacts_dir / "fp16/llama3.1_70b_f16.irpa" self.irpa_path_fp8 = self.artifacts_dir / "f8/llama70b_fp8.irpa" self.tensor_parallelism_size = 8 @@ -343,6 +400,9 @@ def setUp(self): tensor_parallelism_size=self.tensor_parallelism_size, ) self.prefill_args_f16 = self.artifacts_dir / "prefill_args" + self.prefill_args_bs4_128_in_tokens_f16 = ( + self.artifacts_dir / "prefill_args_bs4_128" + ) self.decode_args_f16 = self.artifacts_dir / "decode_args" self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8" self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8" @@ -354,6 +414,14 @@ def setUp(self): f"--input=@{self.prefill_args_f16}/cache_state_f16.npy", "--benchmark_repetitions=3", ] + self.iree_run_prefill_nondecomposed_args_fp16 = [ + "--function=prefill_bs4", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/random_tokens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_lens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_block_ids.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/cs_f16.npy", + "--benchmark_repetitions=3", + ] self.iree_run_decode_args = [ "--function=decode_bs4", f"--input=@{self.decode_args_f16}/tokens.npy", @@ -410,6 +478,7 @@ def testBenchmark70B_f16_TP8_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama70b_f16_decomposed_artifacts.iree_benchmark_vmfb( @@ -440,6 +509,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self): output_vmfb = self.llama70b_f16_torch_sdpa_artifacts.create_file( suffix=".vmfb", prefix=output_file_name ) + self.llama70b_f16_torch_sdpa_artifacts.attention_kernel = "torch" output_shard_file_name = ( self.artifacts_dir / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" @@ -455,6 +525,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( @@ -473,9 +544,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=ExportMlirException - ) + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark70B_fp8_TP8_Decomposed(self): output_file_name = self.dir_path_70b / "fp8_decomposed" output_mlir = self.llama70b_fp8_decomposed_artifacts.create_file( @@ -502,6 +571,7 @@ def 
             vmfb_path=output_vmfb,
             hal_dump_path=output_file_name,
             cwd=self.repo_root,
+            args=self.compile_args,
         )
         # benchmark prefill
         self.llama70b_fp8_decomposed_artifacts.iree_benchmark_vmfb(
@@ -520,9 +590,7 @@ def testBenchmark70B_fp8_TP8_Decomposed(self):
             cwd=self.repo_root,
         )

-    @pytest.mark.xfail(
-        reason="Test not yet implemented", strict=True, raises=ExportMlirException
-    )
+    @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
     def testBenchmark70B_fp8_TP8_Non_Decomposed(self):
         output_file_name = self.dir_path_70b / "fp8_torch"
         output_mlir = self.llama70b_fp8_torch_sdpa_artifacts.create_file(
@@ -549,6 +617,7 @@ def testBenchmark70B_fp8_TP8_Non_Decomposed(self):
             vmfb_path=output_vmfb,
             hal_dump_path=output_file_name,
             cwd=self.repo_root,
+            args=self.compile_args,
         )
         # benchmark prefill
         self.llama70b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
@@ -569,15 +638,14 @@ def testBenchmark70B_fp8_TP8_Non_Decomposed(self):


 @is_mi300x
-@skipif_run_8b_llama
+@skipif_run_quick_llama_test
 class BenchmarkLlama3_1_405B(BaseBenchmarkTest):
     def setUp(self):
         super().setUp()
         # TODO: add numpy files to Azure and download from it
-        self.artifacts_dir = Path("/data/llama-3.1/weights/405b")
+        self.artifacts_dir = Path("/data/llama3.1/weights/405b")
         self.irpa_path = self.artifacts_dir / "fp16/llama3.1_405b_fp16.irpa"
-        self.gguf_path = self.artifacts_dir / "fp16/llama3_405b_f16.gguf"
-        self.irpa_path_fp8 = self.artifacts_dir / "f8/llama405b_fp8.irpa"
+        self.irpa_path_fp8 = self.artifacts_dir / "f8/llama3.1_405b_fp8.irpa"
         self.tensor_parallelism_size = 8
         self.dir_path_405b = self.dir_path / "llama-405b"
         self.temp_dir_405b = Path(self.dir_path_405b)
@@ -615,6 +683,9 @@ def setUp(self):
             tensor_parallelism_size=self.tensor_parallelism_size,
         )
         self.prefill_args_f16 = self.artifacts_dir / "prefill_args"
+        self.prefill_args_bs4_128_in_tokens_f16 = (
+            self.artifacts_dir / "prefill_args_bs4_128"
+        )
         self.decode_args_f16 = self.artifacts_dir / "decode_args"
         self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8"
         self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8"
@@ -626,6 +697,14 @@ def setUp(self):
             f"--input=@{self.prefill_args_f16}/cache_state_f16.npy",
             "--benchmark_repetitions=3",
         ]
+        self.iree_run_prefill_nondecomposed_args_fp16 = [
+            "--function=prefill_bs4",
+            f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/random_tokens.npy",
+            f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_lens.npy",
+            f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_block_ids.npy",
+            f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/cs_f16.npy",
+            "--benchmark_repetitions=3",
+        ]
         self.iree_run_decode_args = [
             "--function=decode_bs4",
             f"--input=@{self.decode_args_f16}/tokens.npy",
@@ -682,6 +761,7 @@ def testBenchmark405B_f16_TP8_Decomposed(self):
             vmfb_path=output_vmfb,
             hal_dump_path=output_file_name,
             cwd=self.repo_root,
+            args=self.compile_args,
         )
         # benchmark prefill
         self.llama405b_f16_decomposed_artifacts.iree_benchmark_vmfb(
@@ -712,6 +792,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self):
         output_vmfb = self.llama405b_f16_torch_sdpa_artifacts.create_file(
             suffix=".vmfb", prefix=output_file_name
         )
+        self.llama405b_f16_torch_sdpa_artifacts.attention_kernel = "torch"
         output_shard_file_name = (
             self.artifacts_dir
             / f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
         )
@@ -721,12 +802,14 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self):
         export_return_code = self.llama405b_f16_torch_sdpa_artifacts.export_to_mlir(
             mlir_path=output_mlir,
             json_path=output_json,
+            skip_decode=True,
         )
         self.llama405b_f16_torch_sdpa_artifacts.compile_to_vmfb(
             mlir_path=str(output_mlir),
             vmfb_path=output_vmfb,
             hal_dump_path=output_file_name,
             cwd=self.repo_root,
+            args=self.compile_args,
         )
         # benchmark prefill
         self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
@@ -745,9 +828,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self):
             cwd=self.repo_root,
         )

-    @pytest.mark.xfail(
-        reason="Test not yet implemented", strict=True, raises=ExportMlirException
-    )
+    @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
     def testBenchmark405B_fp8_TP8_Decomposed(self):
         output_file_name = self.dir_path_405b / "fp8_decomposed"
         output_mlir = self.llama405b_fp8_decomposed_artifacts.create_file(
@@ -774,6 +855,7 @@ def testBenchmark405B_fp8_TP8_Decomposed(self):
             vmfb_path=output_vmfb,
             hal_dump_path=output_file_name,
             cwd=self.repo_root,
+            args=self.compile_args,
         )
         # benchmark prefill
         self.llama405b_fp8_decomposed_artifacts.iree_benchmark_vmfb(
@@ -792,9 +874,7 @@ def testBenchmark405B_fp8_TP8_Decomposed(self):
             cwd=self.repo_root,
         )

-    @pytest.mark.xfail(
-        reason="Test not yet implemented", strict=True, raises=ExportMlirException
-    )
+    @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
     def testBenchmark405B_fp8_TP8_Non_Decomposed(self):
         output_file_name = self.dir_path_405b / "fp8_torch"
         output_mlir = self.llama405b_fp8_torch_sdpa_artifacts.create_file(
@@ -821,6 +901,7 @@ def testBenchmark405B_fp8_TP8_Non_Decomposed(self):
             vmfb_path=output_vmfb,
             hal_dump_path=output_file_name,
             cwd=self.repo_root,
+            args=self.compile_args,
         )
         # benchmark prefill
         self.llama405b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
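A quick way to exercise the new prefill-only path on an MI300x machine is to select the new test with pytest's -k filter, mirroring the flags used in the README and workflows above (a sketch; the exact selection expression is an assumption):
```
pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s \
  --run-nightly-llama-tests --iree-hip-target=gfx942 \
  -k 'testBenchmark8B_f16_Non_Decomposed_Prefill'
```

The --skip-decode flag can also be passed straight to the export script when iterating locally. Only --bs, --attention-kernel, and --skip-decode come from this change; the --irpa-file and --output-mlir flags below are assumptions about the existing export CLI and may differ:
```
python3 -m sharktank.examples.export_paged_llm_v1 \
  --irpa-file=/data/llama3.1/weights/8b/fp16/llama3.1_8b_fp16.irpa \
  --output-mlir=llama3.1_8b_prefill.mlir \
  --output-config=llama3.1_8b_config.json \
  --bs=4 --attention-kernel=torch --skip-decode
```
Skipping decode limits the exported module to the prefill entry points, which is all the new prefill-only benchmarks invoke.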