From 7a5ee6867531a3438ea695239de6a542f8938227 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Fri, 8 Nov 2024 10:45:13 -0600 Subject: [PATCH 01/12] Update tests with compile flags and tp flags, try nightly iree Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 2 +- .github/workflows/ci-llama-quick-tests.yaml | 2 +- sharktank/sharktank/utils/export_artifacts.py | 19 +++++++++++++++---- .../models/llama/benchmark_amdgpu_test.py | 4 +--- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index 6a3b764b8..b96d0610a 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -68,7 +68,7 @@ jobs: pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" - # Test with pinned nightly releases, not what iree-turbine uses. + # Test with nightly releases, not what iree-turbine uses. pip install -f https://iree.dev/pip-release-links.html --upgrade \ iree-base-compiler==3.0.0rc20241118 \ iree-base-runtime==3.0.0rc20241118 diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 6c381b658..276903fc9 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -69,7 +69,7 @@ jobs: pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" - # Test with pinned nightly releases, not what iree-turbine uses. + # Test with nightly releases, not what iree-turbine uses. pip install -f https://iree.dev/pip-release-links.html --upgrade \ iree-base-compiler==3.0.0rc20241118 \ iree-base-runtime==3.0.0rc20241118 diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index 9deade56c..bb21ad941 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -123,15 +123,15 @@ def wrapper(*args, **kwargs): def shard_irpa_file( self, *, - gguf_file: str, + irpa_file: str, output_irpa: str, ): shard_irpa_args = [ "python3", "-m", "sharktank.examples.sharding.shard_llm_dataset", - "--gguf-file", - gguf_file, + "--irpa-file", + irpa_file, "--output-irpa-file", output_irpa, "--tensor-parallelism-size", @@ -202,6 +202,11 @@ def compile_to_vmfb( f"{mlir_path}", f"--iree-hip-target={self.iree_hip_target}", f"--iree-hal-target-backends={self.iree_hal_target_backends}", + "--iree-dispatch-creation-enable-aggressive-fusion=true", + "--iree-global-opt-propagate-transposes=true", + "--iree-opt-aggressively-propagate-transposes=true", + "--iree-opt-data-tiling=false", + "--iree-preprocessing-pass-pipeline=\"builtin.module\\(util.func\\(iree-preprocessing-generalize-linalg-matmul-experimental\\)\\)\"", f"-o={vmfb_path}", ] if self.tensor_parallelism_size > 1: @@ -209,12 +214,17 @@ def compile_to_vmfb( f"--iree-hal-target-device=hip[{i}]" for i in range(self.tensor_parallelism_size) ] + tp_flags = [ + "--iree-hal-force-indirect-command-buffers=true", + "--iree-stream-resource-memory-model=discrete", + "--iree-hip-legacy-sync=false", + ] compile_args += iree_hal_target_devices + compile_args += tp_flags if hal_dump_path: compile_args += [ f"--iree-hal-dump-executable-files-to={hal_dump_path}/files" ] - cmd = subprocess.list2cmdline(compile_args) 
logging.getLogger().info(f"Launching compile command:\n" f"cd {cwd} && {cmd}") @@ -241,6 +251,7 @@ def iree_benchmark_vmfb( compile_cmd: Command used to compile the program, for inclusion in error messages. Raises Exception if running fails for some reason. """ + import pdb; pdb.set_trace() benchmark_args = [] if self.tensor_parallelism_size > 1: base_irpa_path, _ = os.path.splitext(irpa_path) diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py index f70607832..a520c6fcf 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py @@ -57,7 +57,6 @@ def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it self.artifacts_dir = Path("/data/llama-3.1/weights/8b") - self.gguf_path = self.artifacts_dir / "fp16/llama3.1_8b_fp16.gguf" self.irpa_path = self.artifacts_dir / "fp16/llama3.1_8b_fp16.irpa" self.irpa_path_fp8 = self.artifacts_dir / "f8/llama8b_fp8.irpa" self.tensor_parallelism_size = 1 @@ -173,6 +172,7 @@ def testBenchmark8B_f16_Decomposed(self): cwd=self.repo_root, ) + @skipif_run_8b_llama @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark8B_f16_Non_Decomposed(self): output_file_name = self.dir_path_8b / "f16_torch" @@ -303,7 +303,6 @@ def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it self.artifacts_dir = Path("/data/llama-3.1/weights/70b") - self.gguf_path = self.artifacts_dir / "fp16/llama3.1_70b_f16.gguf" self.irpa_path = self.artifacts_dir / "fp16/llama3.1_70b_f16.irpa" self.irpa_path_fp8 = self.artifacts_dir / "f8/llama70b_fp8.irpa" self.tensor_parallelism_size = 8 @@ -576,7 +575,6 @@ def setUp(self): # TODO: add numpy files to Azure and download from it self.artifacts_dir = Path("/data/llama-3.1/weights/405b") self.irpa_path = self.artifacts_dir / "fp16/llama3.1_405b_fp16.irpa" - self.gguf_path = self.artifacts_dir / "fp16/llama3_405b_f16.gguf" self.irpa_path_fp8 = self.artifacts_dir / "f8/llama405b_fp8.irpa" self.tensor_parallelism_size = 8 self.dir_path_405b = self.dir_path / "llama-405b" From b13f85557ed2ef2c386d08d11b24fc1fe09b3224 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Fri, 8 Nov 2024 11:05:52 -0600 Subject: [PATCH 02/12] Fix formatting, remove pdb Signed-off-by: aviator19941 --- sharktank/sharktank/utils/export_artifacts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index bb21ad941..7d23e7293 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -206,7 +206,7 @@ def compile_to_vmfb( "--iree-global-opt-propagate-transposes=true", "--iree-opt-aggressively-propagate-transposes=true", "--iree-opt-data-tiling=false", - "--iree-preprocessing-pass-pipeline=\"builtin.module\\(util.func\\(iree-preprocessing-generalize-linalg-matmul-experimental\\)\\)\"", + '--iree-preprocessing-pass-pipeline="builtin.module\\(util.func\\(iree-preprocessing-generalize-linalg-matmul-experimental\\)\\)"', f"-o={vmfb_path}", ] if self.tensor_parallelism_size > 1: @@ -251,7 +251,6 @@ def iree_benchmark_vmfb( compile_cmd: Command used to compile the program, for inclusion in error messages. Raises Exception if running fails for some reason. 
""" - import pdb; pdb.set_trace() benchmark_args = [] if self.tensor_parallelism_size > 1: base_irpa_path, _ = os.path.splitext(irpa_path) From 2ce4d635d1400b0edae8e1b6871796ea62bb5983 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Fri, 8 Nov 2024 15:16:50 -0600 Subject: [PATCH 03/12] Update CI comments and args in tests Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 1 + .github/workflows/ci-llama-quick-tests.yaml | 1 + sharktank/sharktank/utils/export_artifacts.py | 15 ++++-------- .../models/llama/benchmark_amdgpu_test.py | 24 +++++++++++++++++++ 4 files changed, 30 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index b96d0610a..72dc6a968 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -68,6 +68,7 @@ jobs: pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" + # Test with nightly releases, not what iree-turbine uses. pip install -f https://iree.dev/pip-release-links.html --upgrade \ iree-base-compiler==3.0.0rc20241118 \ diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 276903fc9..4b7a47ff8 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -69,6 +69,7 @@ jobs: pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" + # Test with nightly releases, not what iree-turbine uses. pip install -f https://iree.dev/pip-release-links.html --upgrade \ iree-base-compiler==3.0.0rc20241118 \ diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index 7d23e7293..49a942c66 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -195,6 +195,7 @@ def compile_to_vmfb( vmfb_path, cwd, hal_dump_path: Optional[Path] = None, + args: Optional[List[str]] = None, ): # TODO: Control flag to enable multiple backends compile_args = [ @@ -202,11 +203,6 @@ def compile_to_vmfb( f"{mlir_path}", f"--iree-hip-target={self.iree_hip_target}", f"--iree-hal-target-backends={self.iree_hal_target_backends}", - "--iree-dispatch-creation-enable-aggressive-fusion=true", - "--iree-global-opt-propagate-transposes=true", - "--iree-opt-aggressively-propagate-transposes=true", - "--iree-opt-data-tiling=false", - '--iree-preprocessing-pass-pipeline="builtin.module\\(util.func\\(iree-preprocessing-generalize-linalg-matmul-experimental\\)\\)"', f"-o={vmfb_path}", ] if self.tensor_parallelism_size > 1: @@ -214,17 +210,14 @@ def compile_to_vmfb( f"--iree-hal-target-device=hip[{i}]" for i in range(self.tensor_parallelism_size) ] - tp_flags = [ - "--iree-hal-force-indirect-command-buffers=true", - "--iree-stream-resource-memory-model=discrete", - "--iree-hip-legacy-sync=false", - ] compile_args += iree_hal_target_devices - compile_args += tp_flags if hal_dump_path: compile_args += [ f"--iree-hal-dump-executable-files-to={hal_dump_path}/files" ] + # Append optional arguments if provided + if args: + compile_args += args cmd = subprocess.list2cmdline(compile_args) logging.getLogger().info(f"Launching compile command:\n" f"cd {cwd} && {cmd}") diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py 
b/sharktank/tests/models/llama/benchmark_amdgpu_test.py index a520c6fcf..e4e85defa 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py @@ -49,6 +49,13 @@ def setUpClass(cls): def setUp(self): self.hip_device_id = os.getenv("HIP_DEVICE_ID", default="0") + self.compile_args = [ + "--iree-dispatch-creation-enable-aggressive-fusion=true", + "--iree-global-opt-propagate-transposes=true", + "--iree-opt-aggressively-propagate-transposes=true", + "--iree-opt-data-tiling=false", + '--iree-preprocessing-pass-pipeline="builtin.module\\(util.func\\(iree-preprocessing-generalize-linalg-matmul-experimental\\)\\)"', + ] @is_mi300x @@ -154,6 +161,7 @@ def testBenchmark8B_f16_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama8b_f16_decomposed_artifacts.iree_benchmark_vmfb( @@ -195,6 +203,7 @@ def testBenchmark8B_f16_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( @@ -236,6 +245,7 @@ def testBenchmark8B_fp8_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama8b_fp8_decomposed_artifacts.iree_benchmark_vmfb( @@ -277,6 +287,7 @@ def testBenchmark8B_fp8_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( @@ -379,6 +390,11 @@ def setUp(self): f"--input=@{self.decode_args_fp8}/cache_state_f16.npy", "--benchmark_repetitions=3", ] + self.compile_args += [ + "--iree-hal-force-indirect-command-buffers=true", + "--iree-stream-resource-memory-model=discrete", + "--iree-hip-legacy-sync=false", + ] @pytest.mark.xfail( reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException @@ -409,6 +425,7 @@ def testBenchmark70B_f16_TP8_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama70b_f16_decomposed_artifacts.iree_benchmark_vmfb( @@ -454,6 +471,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( @@ -501,6 +519,7 @@ def testBenchmark70B_fp8_TP8_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama70b_fp8_decomposed_artifacts.iree_benchmark_vmfb( @@ -548,6 +567,7 @@ def testBenchmark70B_fp8_TP8_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama70b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( @@ -680,6 +700,7 @@ def testBenchmark405B_f16_TP8_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama405b_f16_decomposed_artifacts.iree_benchmark_vmfb( @@ -725,6 +746,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill 
self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( @@ -772,6 +794,7 @@ def testBenchmark405B_fp8_TP8_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama405b_fp8_decomposed_artifacts.iree_benchmark_vmfb( @@ -819,6 +842,7 @@ def testBenchmark405B_fp8_TP8_Non_Decomposed(self): vmfb_path=output_vmfb, hal_dump_path=output_file_name, cwd=self.repo_root, + args=self.compile_args, ) # benchmark prefill self.llama405b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb( From f47401d61f291b2d16b861561d56e328e5258c93 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Mon, 11 Nov 2024 11:49:49 -0600 Subject: [PATCH 04/12] Add non-decomposed 8b f16 prefill only test nightly Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 5 +-- .github/workflows/ci-llama-quick-tests.yaml | 4 +-- .../sharktank/examples/export_paged_llm_v1.py | 18 +++++++++- sharktank/sharktank/utils/export_artifacts.py | 3 ++ .../models/llama/benchmark_amdgpu_test.py | 34 +++++++++++++++++++ 5 files changed, 59 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index 72dc6a968..cdcbe2410 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -71,8 +71,9 @@ jobs: # Test with nightly releases, not what iree-turbine uses. pip install -f https://iree.dev/pip-release-links.html --upgrade \ - iree-base-compiler==3.0.0rc20241118 \ - iree-base-runtime==3.0.0rc20241118 + iree-base-compiler==2.9.0rc20241108 \ + iree-base-runtime==2.9.0rc20241108 \ + "numpy<2.0" - name: Run llama tests run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-all-llama --iree-hip-target=gfx942 --html=out/index.html diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 4b7a47ff8..127077655 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -72,8 +72,8 @@ jobs: # Test with nightly releases, not what iree-turbine uses. 
pip install -f https://iree.dev/pip-release-links.html --upgrade \ - iree-base-compiler==3.0.0rc20241118 \ - iree-base-runtime==3.0.0rc20241118 + iree-base-compiler==2.9.0rc20241108 \ + iree-base-runtime==2.9.0rc20241108 \ - name: Run llama 8b tests run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-8b-llama diff --git a/sharktank/sharktank/examples/export_paged_llm_v1.py b/sharktank/sharktank/examples/export_paged_llm_v1.py index a740f0bff..d51c1de52 100644 --- a/sharktank/sharktank/examples/export_paged_llm_v1.py +++ b/sharktank/sharktank/examples/export_paged_llm_v1.py @@ -54,8 +54,23 @@ def main(): help="Enables strictness during export", action="store_true", ) +<<<<<<< HEAD cli.add_quantization_options(parser) cli.add_model_options(parser) +======= + parser.add_argument( + "--attention-kernel", + type=str, + default="decomposed", + choices=["decomposed", "torch"], + ) + parser.add_argument( + "--skip-decode", + help="Enables prefill only, skips decode", + action="store_true", + ) + +>>>>>>> c745549 (Add non-decomposed 8b f16 prefill only test nightly) args = cli.parse(parser) dataset_type = cli.get_input_data_files(args) dataset_type = "irpa" if "irpa" in dataset_type else "gguf" @@ -312,7 +327,8 @@ def _( bsizes = [] for bs in args.bs: generate_batch_prefill(bs) - generate_batch_decode(bs) + if not args.skip_decode: + generate_batch_decode(bs) bsizes.append(bs) config = generate_params_json(hp, bsizes, bsizes) print("GENERATED!") diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index 49a942c66..bd33e1a62 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -160,6 +160,7 @@ def export_to_mlir( *, mlir_path: str, json_path: str, + skip_decode: Optional[bool] = None, ): export_args = [ "python3", @@ -170,6 +171,8 @@ def export_to_mlir( f"--output-config={json_path}", f"--bs={str(self.batch_size)}", ] + if skip_decode: + export_args.append("--skip-decode") if self.attention_kernel in ["decomposed", "torch"]: export_args.append("--attention-kernel") export_args.append(self.attention_kernel) diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py index e4e85defa..fa5a7eff9 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py @@ -180,6 +180,40 @@ def testBenchmark8B_f16_Decomposed(self): cwd=self.repo_root, ) + @skipif_run_8b_llama + def testBenchmark8B_f16_Non_Decomposed_Prefill(self): + output_file_name = self.dir_path_8b / "f16_torch_prefill" + output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file( + suffix=".mlir", prefix=output_file_name + ) + output_json = self.llama8b_f16_torch_sdpa_artifacts.create_file( + suffix=".json", prefix=output_file_name + ) + output_vmfb = self.llama8b_f16_torch_sdpa_artifacts.create_file( + suffix=".vmfb", prefix=output_file_name + ) + self.llama8b_f16_torch_sdpa_artifacts.attention_kernel = "torch" + export_return_code = self.llama8b_f16_torch_sdpa_artifacts.export_to_mlir( + mlir_path=output_mlir, + json_path=output_json, + skip_decode=True, + ) + self.llama8b_f16_torch_sdpa_artifacts.compile_to_vmfb( + mlir_path=str(output_mlir), + vmfb_path=output_vmfb, + hal_dump_path=output_file_name, + cwd=self.repo_root, + args=self.compile_args, + ) + # benchmark prefill + self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb( + 
hip_device_id=self.hip_device_id, + vmfb_name=output_vmfb, + irpa_path=self.irpa_path, + args=self.iree_run_prefill_args, + cwd=self.repo_root, + ) + @skipif_run_8b_llama @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark8B_f16_Non_Decomposed(self): From c3522aa6fb227d7ff4efeece3f981ee38d88d7ae Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Tue, 12 Nov 2024 16:26:51 -0600 Subject: [PATCH 05/12] Update preprocess pass compile flag Signed-off-by: aviator19941 --- sharktank/tests/models/llama/benchmark_amdgpu_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py index fa5a7eff9..4f63661f6 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py @@ -54,7 +54,7 @@ def setUp(self): "--iree-global-opt-propagate-transposes=true", "--iree-opt-aggressively-propagate-transposes=true", "--iree-opt-data-tiling=false", - '--iree-preprocessing-pass-pipeline="builtin.module\\(util.func\\(iree-preprocessing-generalize-linalg-matmul-experimental\\)\\)"', + "--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))'", ] From 7695df17f6df5432787058819b56c517fb836c23 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Tue, 12 Nov 2024 16:28:15 -0600 Subject: [PATCH 06/12] Update iree nightly, add prefill torch sdpa test Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 4 ++-- .github/workflows/ci-llama-quick-tests.yaml | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index cdcbe2410..6e0b1e706 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -71,8 +71,8 @@ jobs: # Test with nightly releases, not what iree-turbine uses. pip install -f https://iree.dev/pip-release-links.html --upgrade \ - iree-base-compiler==2.9.0rc20241108 \ - iree-base-runtime==2.9.0rc20241108 \ + iree-base-compiler==2.9.1rc20241110 \ + iree-base-runtime==2.9.1rc20241110 \ "numpy<2.0" - name: Run llama tests diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 127077655..4d5fb2c68 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -72,8 +72,9 @@ jobs: # Test with nightly releases, not what iree-turbine uses. 
pip install -f https://iree.dev/pip-release-links.html --upgrade \ - iree-base-compiler==2.9.0rc20241108 \ - iree-base-runtime==2.9.0rc20241108 \ + iree-base-compiler==2.9.1rc20241110 \ + iree-base-runtime==2.9.1rc20241110 \ + "numpy<2.0" - name: Run llama 8b tests run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-8b-llama From c6df3dfcff37d28fc03dd797a896243fbd17dd1d Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Wed, 13 Nov 2024 19:57:54 -0600 Subject: [PATCH 07/12] Get nightly iree, keep iree-turbine numpy Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 9 ++++----- .github/workflows/ci-llama-quick-tests.yaml | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index 6e0b1e706..c8e0ac876 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -64,16 +64,15 @@ jobs: pip install --no-compile -r pytorch-cpu-requirements.txt pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - # Install latest iree-tubrine. + # Install latest iree-turbine. pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" # Test with nightly releases, not what iree-turbine uses. - pip install -f https://iree.dev/pip-release-links.html --upgrade \ - iree-base-compiler==2.9.1rc20241110 \ - iree-base-runtime==2.9.1rc20241110 \ - "numpy<2.0" + pip install --upgrade --pre --no-cache-dir -f https://iree.dev/pip-release-links.html \ + iree-base-compiler \ + iree-base-runtime \ - name: Run llama tests run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-all-llama --iree-hip-target=gfx942 --html=out/index.html diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 4d5fb2c68..3e2921c4c 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -65,16 +65,15 @@ jobs: pip install --no-compile -r pytorch-cpu-requirements.txt pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/ - # Install latest iree-tubrine. + # Install latest iree-turbine. pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" # Test with nightly releases, not what iree-turbine uses. 
- pip install -f https://iree.dev/pip-release-links.html --upgrade \ - iree-base-compiler==2.9.1rc20241110 \ - iree-base-runtime==2.9.1rc20241110 \ - "numpy<2.0" + pip install --upgrade --pre --no-cache-dir -f https://iree.dev/pip-release-links.html \ + iree-base-compiler \ + iree-base-runtime \ - name: Run llama 8b tests run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-8b-llama From d8872e2858f777e08f94d7267e23e39d24eb5ba0 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Fri, 15 Nov 2024 18:55:28 -0600 Subject: [PATCH 08/12] Update iree packages to nightly, fix name of test, create README Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 4 ++-- .github/workflows/ci-llama-quick-tests.yaml | 6 +++--- sharktank/conftest.py | 10 +++++----- sharktank/tests/models/llama/README.md | 14 ++++++++++++++ .../tests/models/llama/benchmark_amdgpu_test.py | 14 +++++++------- 5 files changed, 31 insertions(+), 17 deletions(-) create mode 100644 sharktank/tests/models/llama/README.md diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index c8e0ac876..0b4e42eaa 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -70,12 +70,12 @@ jobs: # Test with nightly releases, not what iree-turbine uses. - pip install --upgrade --pre --no-cache-dir -f https://iree.dev/pip-release-links.html \ + pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \ iree-base-compiler \ iree-base-runtime \ - name: Run llama tests - run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-all-llama --iree-hip-target=gfx942 --html=out/index.html + run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --html=out/index.html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 3e2921c4c..568c4d295 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -71,12 +71,12 @@ jobs: # Test with nightly releases, not what iree-turbine uses. 
-          pip install --upgrade --pre --no-cache-dir -f https://iree.dev/pip-release-links.html \
+          pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
             iree-base-compiler \
             iree-base-runtime \
-      - name: Run llama 8b tests
-        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-8b-llama
+      - name: Run llama 8b f16 decomposed test
+        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-quick-llama-test
       - name: Upload llama executable files
         uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
diff --git a/sharktank/conftest.py b/sharktank/conftest.py
index ed09a1fd1..ca12c3d2c 100644
--- a/sharktank/conftest.py
+++ b/sharktank/conftest.py
@@ -73,17 +73,17 @@ def pytest_addoption(parser):
     )

     parser.addoption(
-        "--run-8b-llama",
+        "--run-quick-llama-test",
         action="store_true",
-        dest="run-8b-llama",
+        dest="--run-quick-llama-test",
         default=False,
-        help="Enable llama 8b benchmarking tests",
+        help="Enable llama 8b f16 decomposed benchmarking test",
     )

     parser.addoption(
-        "--run-all-llama",
+        "--run-nightly-llama-tests",
         action="store_true",
-        dest="run-all-llama",
+        dest="run-nightly-llama-tests",
         default=False,
         help="Enable all llama benchmarking tests",
     )
diff --git a/sharktank/tests/models/llama/README.md b/sharktank/tests/models/llama/README.md
new file mode 100644
index 000000000..6adf38588
--- /dev/null
+++ b/sharktank/tests/models/llama/README.md
@@ -0,0 +1,14 @@
+# How to run Llama 3.1 Benchmarking Tests
+In order to run the Llama 3.1 8B F16 Decomposed test:
+```
+pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s \
+  --run-quick-llama-test --iree-hip-target=gfx942
+```
+
+In order to filter by test, use the -k option.
If you
+want to only run the Llama 3.1 70B F16 Decomposed test:
+```
+pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s \
+  --run-nightly-llama-tests --iree-hip-target=gfx942 \
+  -k 'testBenchmark70B_f16_TP8_Decomposed'
+```
diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
index 4f63661f6..2b8377424 100644
--- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py
+++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -21,9 +21,9 @@
 )

 is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
-skipif_run_8b_llama = pytest.mark.skipif(
-    'config.getoption("run-8b-llama") and not config.getoption("run-all-llama")',
-    reason="Skipping largs tests when --run-8b is set.",
+skipif_run_quick_llama_test = pytest.mark.skipif(
+    'config.getoption("run-quick-llama-test") and not config.getoption("run-nightly-llama-tests")',
+    reason="Skipping large tests when --run-quick-llama-test is set.",
 )
@@ -180,7 +180,7 @@ def testBenchmark8B_f16_Decomposed(self):
             cwd=self.repo_root,
         )

-    @skipif_run_8b_llama
+    @skipif_run_quick_llama_test
     def testBenchmark8B_f16_Non_Decomposed_Prefill(self):
         output_file_name = self.dir_path_8b / "f16_torch_prefill"
         output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file(
@@ -214,7 +214,7 @@ def testBenchmark8B_f16_Non_Decomposed_Prefill(self):
             cwd=self.repo_root,
         )

-    @skipif_run_8b_llama
+    @skipif_run_quick_llama_test
     @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
     def testBenchmark8B_f16_Non_Decomposed(self):
         output_file_name = self.dir_path_8b / "f16_torch"
@@ -342,7 +342,7 @@ def testBenchmark8B_fp8_Non_Decomposed(self):

 @is_mi300x
-@skipif_run_8b_llama
+@skipif_run_quick_llama_test
 class BenchmarkLlama3_1_70B(BaseBenchmarkTest):
     def setUp(self):
         super().setUp()
@@ -622,7 +622,7 @@ def testBenchmark70B_fp8_TP8_Non_Decomposed(self):

 @is_mi300x
-@skipif_run_8b_llama
+@skipif_run_quick_llama_test
 class BenchmarkLlama3_1_405B(BaseBenchmarkTest):
     def setUp(self):
         super().setUp()
From 933d4a6a644da99c3e451df7011857f9133631ad Mon Sep 17 00:00:00 2001
From: aviator19941
Date: Fri, 15 Nov 2024 19:05:55 -0600
Subject: [PATCH 09/12] Fix pytest option dest and missed merge conflict

Signed-off-by: aviator19941
---
 sharktank/conftest.py | 2 +-
 sharktank/sharktank/examples/export_paged_llm_v1.py | 7 ++-----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/sharktank/conftest.py b/sharktank/conftest.py
index ca12c3d2c..b2383055f 100644
--- a/sharktank/conftest.py
+++ b/sharktank/conftest.py
@@ -75,7 +75,7 @@ def pytest_addoption(parser):
     parser.addoption(
         "--run-quick-llama-test",
         action="store_true",
-        dest="--run-quick-llama-test",
+        dest="run-quick-llama-test",
         default=False,
         help="Enable llama 8b f16 decomposed benchmarking test",
     )
diff --git a/sharktank/sharktank/examples/export_paged_llm_v1.py b/sharktank/sharktank/examples/export_paged_llm_v1.py
index d51c1de52..d5975035a 100644
--- a/sharktank/sharktank/examples/export_paged_llm_v1.py
+++ b/sharktank/sharktank/examples/export_paged_llm_v1.py
@@ -54,10 +54,6 @@ def main():
         help="Enables strictness during export",
         action="store_true",
     )
-<<<<<<< HEAD
-    cli.add_quantization_options(parser)
-    cli.add_model_options(parser)
-=======
     parser.add_argument(
         "--attention-kernel",
         type=str,
@@ -70,7 +66,8 @@ def main():
         action="store_true",
     )

->>>>>>> c745549 (Add non-decomposed 8b f16 prefill only test nightly)
+    cli.add_quantization_options(parser)
+
cli.add_model_options(parser) args = cli.parse(parser) dataset_type = cli.get_input_data_files(args) dataset_type = "irpa" if "irpa" in dataset_type else "gguf" From 09c5220f8d3a9a9257314b4a011e3df867a4cd7c Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Fri, 15 Nov 2024 19:08:58 -0600 Subject: [PATCH 10/12] Move --skip-decode to model cli helper Signed-off-by: aviator19941 --- sharktank/sharktank/examples/export_paged_llm_v1.py | 11 ----------- sharktank/sharktank/utils/cli.py | 5 +++++ 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/sharktank/sharktank/examples/export_paged_llm_v1.py b/sharktank/sharktank/examples/export_paged_llm_v1.py index d5975035a..791bce87c 100644 --- a/sharktank/sharktank/examples/export_paged_llm_v1.py +++ b/sharktank/sharktank/examples/export_paged_llm_v1.py @@ -54,17 +54,6 @@ def main(): help="Enables strictness during export", action="store_true", ) - parser.add_argument( - "--attention-kernel", - type=str, - default="decomposed", - choices=["decomposed", "torch"], - ) - parser.add_argument( - "--skip-decode", - help="Enables prefill only, skips decode", - action="store_true", - ) cli.add_quantization_options(parser) cli.add_model_options(parser) diff --git a/sharktank/sharktank/utils/cli.py b/sharktank/sharktank/utils/cli.py index 84ee741bf..bc0b3b0b6 100644 --- a/sharktank/sharktank/utils/cli.py +++ b/sharktank/sharktank/utils/cli.py @@ -69,6 +69,11 @@ def add_model_options(parser: argparse.ArgumentParser): default="decomposed", choices=["decomposed", "torch"], ) + parser.add_argument( + "--skip-decode", + help="Enables prefill only, skips decode", + action="store_true", + ) def add_quantization_options(parser: argparse.ArgumentParser): From ecf4026368fd3c080c284d8e32dc051f8adde1b1 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Tue, 19 Nov 2024 10:43:40 -0600 Subject: [PATCH 11/12] Update tests, TODO: decomposed failing with OUT OF RANGE command buffer validation Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 2 +- .github/workflows/ci-llama-quick-tests.yaml | 2 +- .../models/llama/benchmark_amdgpu_test.py | 73 +++++++++++++------ 3 files changed, 51 insertions(+), 26 deletions(-) diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index 0b4e42eaa..649e3d173 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -72,7 +72,7 @@ jobs: # Test with nightly releases, not what iree-turbine uses. pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \ iree-base-compiler \ - iree-base-runtime \ + iree-base-runtime - name: Run llama tests run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --html=out/index.html diff --git a/.github/workflows/ci-llama-quick-tests.yaml b/.github/workflows/ci-llama-quick-tests.yaml index 568c4d295..a8c315ec8 100644 --- a/.github/workflows/ci-llama-quick-tests.yaml +++ b/.github/workflows/ci-llama-quick-tests.yaml @@ -73,7 +73,7 @@ jobs: # Test with nightly releases, not what iree-turbine uses. 
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \ iree-base-compiler \ - iree-base-runtime \ + iree-base-runtime - name: Run llama 8b f16 decomposed test run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-quick-llama-test diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py index 2b8377424..c06d94f6a 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py @@ -55,6 +55,9 @@ def setUp(self): "--iree-opt-aggressively-propagate-transposes=true", "--iree-opt-data-tiling=false", "--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))'", + "--iree-hal-force-indirect-command-buffers=true", + "--iree-stream-resource-memory-model=discrete", + "--iree-hip-legacy-sync=false", ] @@ -63,9 +66,9 @@ class BenchmarkLlama3_1_8B(BaseBenchmarkTest): def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it - self.artifacts_dir = Path("/data/llama-3.1/weights/8b") + self.artifacts_dir = Path("/data/llama3.1/weights/8b") self.irpa_path = self.artifacts_dir / "fp16/llama3.1_8b_fp16.irpa" - self.irpa_path_fp8 = self.artifacts_dir / "f8/llama8b_fp8.irpa" + self.irpa_path_fp8 = self.artifacts_dir / "f8/llama3.1_8b_fp8.irpa" self.tensor_parallelism_size = 1 self.dir_path_8b = self.dir_path / "llama-8b" self.temp_dir_8b = Path(self.dir_path_8b) @@ -103,6 +106,9 @@ def setUp(self): tensor_parallelism_size=self.tensor_parallelism_size, ) self.prefill_args_f16 = self.artifacts_dir / "prefill_args" + self.prefill_args_bs4_128_in_tokens_f16 = ( + self.artifacts_dir / "prefill_args_bs4_128" + ) self.decode_args_f16 = self.artifacts_dir / "decode_args" self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8" self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8" @@ -114,6 +120,14 @@ def setUp(self): f"--input=@{self.prefill_args_f16}/cache_state_f16.npy", "--benchmark_repetitions=3", ] + self.iree_run_prefill_nondecomposed_args_fp16 = [ + "--function=prefill_bs4", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/random_tokens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_lens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_block_ids.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/cs_f16.npy", + "--benchmark_repetitions=3", + ] self.iree_run_decode_args = [ "--function=decode_bs4", f"--input=@{self.decode_args_f16}/tokens.npy", @@ -181,6 +195,7 @@ def testBenchmark8B_f16_Decomposed(self): ) @skipif_run_quick_llama_test + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark8B_f16_Non_Decomposed_Prefill(self): output_file_name = self.dir_path_8b / "f16_torch_prefill" output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file( @@ -210,7 +225,7 @@ def testBenchmark8B_f16_Non_Decomposed_Prefill(self): hip_device_id=self.hip_device_id, vmfb_name=output_vmfb, irpa_path=self.irpa_path, - args=self.iree_run_prefill_args, + args=self.iree_run_prefill_nondecomposed_args_fp16, cwd=self.repo_root, ) @@ -256,9 +271,7 @@ def testBenchmark8B_f16_Non_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=ExportMlirException - ) + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def 
testBenchmark8B_fp8_Decomposed(self): output_file_name = self.dir_path_8b / "fp8_decomposed" output_mlir = self.llama8b_fp8_decomposed_artifacts.create_file( @@ -298,9 +311,7 @@ def testBenchmark8B_fp8_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Compile failure", strict=True, raises=ExportMlirException - ) + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark8B_fp8_Non_Decomposed(self): output_file_name = self.dir_path_8b / "fp8_torch" output_mlir = self.llama8b_fp8_torch_sdpa_artifacts.create_file( @@ -347,7 +358,7 @@ class BenchmarkLlama3_1_70B(BaseBenchmarkTest): def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it - self.artifacts_dir = Path("/data/llama-3.1/weights/70b") + self.artifacts_dir = Path("/data/llama3.1/weights/70b") self.irpa_path = self.artifacts_dir / "fp16/llama3.1_70b_f16.irpa" self.irpa_path_fp8 = self.artifacts_dir / "f8/llama70b_fp8.irpa" self.tensor_parallelism_size = 8 @@ -387,6 +398,9 @@ def setUp(self): tensor_parallelism_size=self.tensor_parallelism_size, ) self.prefill_args_f16 = self.artifacts_dir / "prefill_args" + self.prefill_args_bs4_128_in_tokens_f16 = ( + self.artifacts_dir / "prefill_args_bs4_128" + ) self.decode_args_f16 = self.artifacts_dir / "decode_args" self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8" self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8" @@ -398,6 +412,14 @@ def setUp(self): f"--input=@{self.prefill_args_f16}/cache_state_f16.npy", "--benchmark_repetitions=3", ] + self.iree_run_prefill_nondecomposed_args_fp16 = [ + "--function=prefill_bs4", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/random_tokens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_lens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_block_ids.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/cs_f16.npy", + "--benchmark_repetitions=3", + ] self.iree_run_decode_args = [ "--function=decode_bs4", f"--input=@{self.decode_args_f16}/tokens.npy", @@ -524,9 +546,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=ExportMlirException - ) + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark70B_fp8_TP8_Decomposed(self): output_file_name = self.dir_path_70b / "fp8_decomposed" output_mlir = self.llama70b_fp8_decomposed_artifacts.create_file( @@ -572,9 +592,7 @@ def testBenchmark70B_fp8_TP8_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=ExportMlirException - ) + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark70B_fp8_TP8_Non_Decomposed(self): output_file_name = self.dir_path_70b / "fp8_torch" output_mlir = self.llama70b_fp8_torch_sdpa_artifacts.create_file( @@ -627,9 +645,9 @@ class BenchmarkLlama3_1_405B(BaseBenchmarkTest): def setUp(self): super().setUp() # TODO: add numpy files to Azure and download from it - self.artifacts_dir = Path("/data/llama-3.1/weights/405b") + self.artifacts_dir = Path("/data/llama3.1/weights/405b") self.irpa_path = self.artifacts_dir / "fp16/llama3.1_405b_fp16.irpa" - self.irpa_path_fp8 = self.artifacts_dir / "f8/llama405b_fp8.irpa" + self.irpa_path_fp8 = self.artifacts_dir / "f8/llama3.1_405b_fp8.irpa" self.tensor_parallelism_size = 8 self.dir_path_405b = self.dir_path / "llama-405b" 
self.temp_dir_405b = Path(self.dir_path_405b) @@ -667,6 +685,9 @@ def setUp(self): tensor_parallelism_size=self.tensor_parallelism_size, ) self.prefill_args_f16 = self.artifacts_dir / "prefill_args" + self.prefill_args_bs4_128_in_tokens_f16 = ( + self.artifacts_dir / "prefill_args_bs4_128" + ) self.decode_args_f16 = self.artifacts_dir / "decode_args" self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8" self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8" @@ -678,6 +699,14 @@ def setUp(self): f"--input=@{self.prefill_args_f16}/cache_state_f16.npy", "--benchmark_repetitions=3", ] + self.iree_run_prefill_nondecomposed_args_fp16 = [ + "--function=prefill_bs4", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/random_tokens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_lens.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_block_ids.npy", + f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/cs_f16.npy", + "--benchmark_repetitions=3", + ] self.iree_run_decode_args = [ "--function=decode_bs4", f"--input=@{self.decode_args_f16}/tokens.npy", @@ -799,9 +828,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=ExportMlirException - ) + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark405B_fp8_TP8_Decomposed(self): output_file_name = self.dir_path_405b / "fp8_decomposed" output_mlir = self.llama405b_fp8_decomposed_artifacts.create_file( @@ -847,9 +874,7 @@ def testBenchmark405B_fp8_TP8_Decomposed(self): cwd=self.repo_root, ) - @pytest.mark.xfail( - reason="Test not yet implemented", strict=True, raises=ExportMlirException - ) + @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException) def testBenchmark405B_fp8_TP8_Non_Decomposed(self): output_file_name = self.dir_path_405b / "fp8_torch" output_mlir = self.llama405b_fp8_torch_sdpa_artifacts.create_file( From 350ff4747f0a1af1ba91f2c7057efc51e19a7a91 Mon Sep 17 00:00:00 2001 From: aviator19941 Date: Tue, 19 Nov 2024 19:24:26 -0600 Subject: [PATCH 12/12] Update flags and github pages publish_dir and destination_dir Signed-off-by: aviator19941 --- .github/workflows/ci-llama-large-tests.yaml | 4 +++- .../tests/models/llama/benchmark_amdgpu_test.py | 12 ++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci-llama-large-tests.yaml b/.github/workflows/ci-llama-large-tests.yaml index 649e3d173..db658f801 100644 --- a/.github/workflows/ci-llama-large-tests.yaml +++ b/.github/workflows/ci-llama-large-tests.yaml @@ -81,7 +81,9 @@ jobs: uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 with: github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }} - publish_dir: ./out + publish_dir: ./out/llm/llama/benchmarks + destination_dir: ./llm/llama/benchmarks + keep_files: true - name: Upload llama executable files uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py index c06d94f6a..125a0cfdc 100644 --- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py +++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py @@ -55,9 +55,11 @@ def setUp(self): "--iree-opt-aggressively-propagate-transposes=true", "--iree-opt-data-tiling=false", 
"--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))'", - "--iree-hal-force-indirect-command-buffers=true", "--iree-stream-resource-memory-model=discrete", "--iree-hip-legacy-sync=false", + "--iree-hal-indirect-command-buffers=true", + "--iree-hal-memoization=true", + "--iree-opt-strip-assertions", ] @@ -446,11 +448,6 @@ def setUp(self): f"--input=@{self.decode_args_fp8}/cache_state_f16.npy", "--benchmark_repetitions=3", ] - self.compile_args += [ - "--iree-hal-force-indirect-command-buffers=true", - "--iree-stream-resource-memory-model=discrete", - "--iree-hip-legacy-sync=false", - ] @pytest.mark.xfail( reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException @@ -512,6 +509,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self): output_vmfb = self.llama70b_f16_torch_sdpa_artifacts.create_file( suffix=".vmfb", prefix=output_file_name ) + self.llama70b_f16_torch_sdpa_artifacts.attention_kernel = "torch" output_shard_file_name = ( self.artifacts_dir / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" @@ -794,6 +792,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self): output_vmfb = self.llama405b_f16_torch_sdpa_artifacts.create_file( suffix=".vmfb", prefix=output_file_name ) + self.llama405b_f16_torch_sdpa_artifacts.attention_kernel = "torch" output_shard_file_name = ( self.artifacts_dir / f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa" @@ -803,6 +802,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self): export_return_code = self.llama405b_f16_torch_sdpa_artifacts.export_to_mlir( mlir_path=output_mlir, json_path=output_json, + skip_decode=True, ) self.llama405b_f16_torch_sdpa_artifacts.compile_to_vmfb( mlir_path=str(output_mlir),