Skip to content

Commit

Permalink
Update tests, TODO: decomposed failing with OUT OF RANGE command buffer validation
Browse files Browse the repository at this point in the history

Signed-off-by: aviator19941 <avinash.sharma@amd.com>
  • Loading branch information
aviator19941 committed Nov 19, 2024
1 parent 2244b9e commit 1af1efc
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-llama-large-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ jobs:
# Test with nightly releases, not what iree-turbine uses.
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler \
iree-base-runtime \
iree-base-runtime
- name: Run llama tests
run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --html=out/index.html
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci-llama-quick-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ jobs:
# Test with nightly releases, not what iree-turbine uses.
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler \
iree-base-runtime \
iree-base-runtime
- name: Run llama 8b f16 decomposed test
run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-quick-llama-test
Expand Down
73 changes: 49 additions & 24 deletions sharktank/tests/models/llama/benchmark_amdgpu_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ def setUp(self):
"--iree-opt-aggressively-propagate-transposes=true",
"--iree-opt-data-tiling=false",
"--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))'",
"--iree-hal-force-indirect-command-buffers=true",
"--iree-stream-resource-memory-model=discrete",
"--iree-hip-legacy-sync=false",
]


Expand All @@ -63,9 +66,9 @@ class BenchmarkLlama3_1_8B(BaseBenchmarkTest):
def setUp(self):
super().setUp()
# TODO: add numpy files to Azure and download from it
self.artifacts_dir = Path("/data/llama-3.1/weights/8b")
self.artifacts_dir = Path("/data/llama3.1/weights/8b")
self.irpa_path = self.artifacts_dir / "fp16/llama3.1_8b_fp16.irpa"
self.irpa_path_fp8 = self.artifacts_dir / "f8/llama8b_fp8.irpa"
self.irpa_path_fp8 = self.artifacts_dir / "f8/llama3.1_8b_fp8.irpa"
self.tensor_parallelism_size = 1
self.dir_path_8b = self.dir_path / "llama-8b"
self.temp_dir_8b = Path(self.dir_path_8b)
Expand Down Expand Up @@ -103,6 +106,9 @@ def setUp(self):
tensor_parallelism_size=self.tensor_parallelism_size,
)
self.prefill_args_f16 = self.artifacts_dir / "prefill_args"
self.prefill_args_bs4_128_in_tokens_f16 = (
self.artifacts_dir / "prefill_args_bs4_128"
)
self.decode_args_f16 = self.artifacts_dir / "decode_args"
self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8"
self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8"
Expand All @@ -114,6 +120,14 @@ def setUp(self):
f"--input=@{self.prefill_args_f16}/cache_state_f16.npy",
"--benchmark_repetitions=3",
]
self.iree_run_prefill_nondecomposed_args_fp16 = [
"--function=prefill_bs4",
f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/random_tokens.npy",
f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_lens.npy",
f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_block_ids.npy",
f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/cs_f16.npy",
"--benchmark_repetitions=3",
]
self.iree_run_decode_args = [
"--function=decode_bs4",
f"--input=@{self.decode_args_f16}/tokens.npy",
Expand Down Expand Up @@ -181,6 +195,7 @@ def testBenchmark8B_f16_Decomposed(self):
)

@skipif_run_quick_llama_test
@pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
def testBenchmark8B_f16_Non_Decomposed_Prefill(self):
output_file_name = self.dir_path_8b / "f16_torch_prefill"
output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file(
Expand Down Expand Up @@ -210,7 +225,7 @@ def testBenchmark8B_f16_Non_Decomposed_Prefill(self):
hip_device_id=self.hip_device_id,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_prefill_args,
args=self.iree_run_prefill_nondecomposed_args_fp16,
cwd=self.repo_root,
)

Expand Down Expand Up @@ -256,9 +271,7 @@ def testBenchmark8B_f16_Non_Decomposed(self):
cwd=self.repo_root,
)

@pytest.mark.xfail(
reason="Test not yet implemented", strict=True, raises=ExportMlirException
)
@pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
def testBenchmark8B_fp8_Decomposed(self):
output_file_name = self.dir_path_8b / "fp8_decomposed"
output_mlir = self.llama8b_fp8_decomposed_artifacts.create_file(
Expand Down Expand Up @@ -298,9 +311,7 @@ def testBenchmark8B_fp8_Decomposed(self):
cwd=self.repo_root,
)

@pytest.mark.xfail(
reason="Compile failure", strict=True, raises=ExportMlirException
)
@pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
def testBenchmark8B_fp8_Non_Decomposed(self):
output_file_name = self.dir_path_8b / "fp8_torch"
output_mlir = self.llama8b_fp8_torch_sdpa_artifacts.create_file(
Expand Down Expand Up @@ -347,7 +358,7 @@ class BenchmarkLlama3_1_70B(BaseBenchmarkTest):
def setUp(self):
super().setUp()
# TODO: add numpy files to Azure and download from it
self.artifacts_dir = Path("/data/llama-3.1/weights/70b")
self.artifacts_dir = Path("/data/llama3.1/weights/70b")
self.irpa_path = self.artifacts_dir / "fp16/llama3.1_70b_f16.irpa"
self.irpa_path_fp8 = self.artifacts_dir / "f8/llama70b_fp8.irpa"
self.tensor_parallelism_size = 8
Expand Down Expand Up @@ -387,6 +398,9 @@ def setUp(self):
tensor_parallelism_size=self.tensor_parallelism_size,
)
self.prefill_args_f16 = self.artifacts_dir / "prefill_args"
self.prefill_args_bs4_128_in_tokens_f16 = (
self.artifacts_dir / "prefill_args_bs4_128"
)
self.decode_args_f16 = self.artifacts_dir / "decode_args"
self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8"
self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8"
Expand All @@ -398,6 +412,14 @@ def setUp(self):
f"--input=@{self.prefill_args_f16}/cache_state_f16.npy",
"--benchmark_repetitions=3",
]
self.iree_run_prefill_nondecomposed_args_fp16 = [
"--function=prefill_bs4",
f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/random_tokens.npy",
f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_lens.npy",
f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_block_ids.npy",
f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/cs_f16.npy",
"--benchmark_repetitions=3",
]
self.iree_run_decode_args = [
"--function=decode_bs4",
f"--input=@{self.decode_args_f16}/tokens.npy",
Expand Down Expand Up @@ -524,9 +546,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self):
cwd=self.repo_root,
)

@pytest.mark.xfail(
reason="Test not yet implemented", strict=True, raises=ExportMlirException
)
@pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
def testBenchmark70B_fp8_TP8_Decomposed(self):
output_file_name = self.dir_path_70b / "fp8_decomposed"
output_mlir = self.llama70b_fp8_decomposed_artifacts.create_file(
Expand Down Expand Up @@ -572,9 +592,7 @@ def testBenchmark70B_fp8_TP8_Decomposed(self):
cwd=self.repo_root,
)

@pytest.mark.xfail(
reason="Test not yet implemented", strict=True, raises=ExportMlirException
)
@pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
def testBenchmark70B_fp8_TP8_Non_Decomposed(self):
output_file_name = self.dir_path_70b / "fp8_torch"
output_mlir = self.llama70b_fp8_torch_sdpa_artifacts.create_file(
Expand Down Expand Up @@ -627,9 +645,9 @@ class BenchmarkLlama3_1_405B(BaseBenchmarkTest):
def setUp(self):
super().setUp()
# TODO: add numpy files to Azure and download from it
self.artifacts_dir = Path("/data/llama-3.1/weights/405b")
self.artifacts_dir = Path("/data/llama3.1/weights/405b")
self.irpa_path = self.artifacts_dir / "fp16/llama3.1_405b_fp16.irpa"
self.irpa_path_fp8 = self.artifacts_dir / "f8/llama405b_fp8.irpa"
self.irpa_path_fp8 = self.artifacts_dir / "f8/llama3.1_405b_fp8.irpa"
self.tensor_parallelism_size = 8
self.dir_path_405b = self.dir_path / "llama-405b"
self.temp_dir_405b = Path(self.dir_path_405b)
Expand Down Expand Up @@ -667,6 +685,9 @@ def setUp(self):
tensor_parallelism_size=self.tensor_parallelism_size,
)
self.prefill_args_f16 = self.artifacts_dir / "prefill_args"
self.prefill_args_bs4_128_in_tokens_f16 = (
self.artifacts_dir / "prefill_args_bs4_128"
)
self.decode_args_f16 = self.artifacts_dir / "decode_args"
self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8"
self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8"
Expand All @@ -678,6 +699,14 @@ def setUp(self):
f"--input=@{self.prefill_args_f16}/cache_state_f16.npy",
"--benchmark_repetitions=3",
]
self.iree_run_prefill_nondecomposed_args_fp16 = [
"--function=prefill_bs4",
f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/random_tokens.npy",
f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_lens.npy",
f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/seq_block_ids.npy",
f"--input=@{self.prefill_args_bs4_128_in_tokens_f16}/cs_f16.npy",
"--benchmark_repetitions=3",
]
self.iree_run_decode_args = [
"--function=decode_bs4",
f"--input=@{self.decode_args_f16}/tokens.npy",
Expand Down Expand Up @@ -799,9 +828,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self):
cwd=self.repo_root,
)

@pytest.mark.xfail(
reason="Test not yet implemented", strict=True, raises=ExportMlirException
)
@pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
def testBenchmark405B_fp8_TP8_Decomposed(self):
output_file_name = self.dir_path_405b / "fp8_decomposed"
output_mlir = self.llama405b_fp8_decomposed_artifacts.create_file(
Expand Down Expand Up @@ -847,9 +874,7 @@ def testBenchmark405B_fp8_TP8_Decomposed(self):
cwd=self.repo_root,
)

@pytest.mark.xfail(
reason="Test not yet implemented", strict=True, raises=ExportMlirException
)
@pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
def testBenchmark405B_fp8_TP8_Non_Decomposed(self):
output_file_name = self.dir_path_405b / "fp8_torch"
output_mlir = self.llama405b_fp8_torch_sdpa_artifacts.create_file(
Expand Down

0 comments on commit 1af1efc

Please sign in to comment.