Use iree_device to run tests on different hip devices to avoid conflict/queue times (#597)

Use iree_device to run tests on different hip devices to avoid
conflict/queue times

---------

Signed-off-by: aviator19941 <avinash.sharma@amd.com>
aviator19941 authored Nov 28, 2024
1 parent ba8dd7d commit 1896d7a
Showing 4 changed files with 41 additions and 35 deletions.
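The tests now read their target device from an `--iree-device` pytest option (a full IREE device URI such as `hip://0` or `hip://7`) instead of the `HIP_DEVICE_ID` environment variable, so each workflow can pin its benchmarks to a different GPU and concurrent runs no longer queue on the same device. A minimal sketch of how such an option can be wired to the unittest-style test classes; the fixture below is an illustrative assumption, not necessarily sharktank's actual conftest:

```python
# conftest.py — illustrative sketch only; sharktank's real conftest may differ.
import pytest


def pytest_addoption(parser):
    # Full IREE device URI, e.g. "hip://0" or "hip://7".
    parser.addoption(
        "--iree-device",
        action="store",
        default="hip://0",
        help="Device URI forwarded to iree-benchmark-module",
    )


@pytest.fixture(scope="class", autouse=True)
def iree_device(request):
    # Expose the option as `self.iree_device` on unittest-style test classes.
    request.cls.iree_device = request.config.getoption("--iree-device")
```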
2 changes: 1 addition & 1 deletion .github/workflows/ci-llama-large-tests.yaml
@@ -76,7 +76,7 @@ jobs:
          iree-base-runtime
      - name: Run llama tests
-        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --html=out/llm/llama/benchmark/index.html
+        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --iree-device=hip://7 --html=out/llm/llama/benchmark/index.html

      - name: Deploy to GitHub Pages
        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
2 changes: 1 addition & 1 deletion .github/workflows/ci-llama-quick-tests.yaml
@@ -76,7 +76,7 @@ jobs:
          iree-base-runtime
      - name: Run llama 8b f16 decomposed test
-        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-quick-llama-test
+        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --iree-device=hip://0 --run-quick-llama-test

      - name: Upload llama executable files
        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
7 changes: 5 additions & 2 deletions sharktank/sharktank/utils/export_artifacts.py
@@ -270,9 +270,12 @@ def iree_benchmark_vmfb(
                f"--device=hip://{i}" for i in range(self.tensor_parallelism_size)
            ]
        else:
-            rocr_visible_devices = [f"ROCR_VISIBLE_DEVICES={hip_device_id}"]
+            hip_device_arg = int(hip_device_id.split("://")[1])
+            rocr_visible_devices = [
+                f"ROCR_VISIBLE_DEVICES={','.join(str(i) for i in range(hip_device_arg + 1))}"
+            ]
            params = [f"--parameters=model={irpa_path}"]
-            devices = [f"--device=hip://{hip_device_id}"]
+            devices = [f"--device={hip_device_id}"]
        benchmark_args += rocr_visible_devices
        benchmark_args += [
            "iree-benchmark-module",
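The new `else` branch above does two things: it parses the device index out of the URI, and it widens `ROCR_VISIBLE_DEVICES` to cover devices `0..N` so that index `N` in `hip://N` still resolves to the same physical GPU after ROCR filtering. A standalone sketch of that logic (the helper name is illustrative; the real code builds these strings inline):

```python
# Illustrative helper mirroring the inline logic in iree_benchmark_vmfb.
def rocr_visible_devices_for(hip_device_id: str) -> str:
    # "hip://7" -> 7
    hip_device_arg = int(hip_device_id.split("://")[1])
    # Keep devices 0..N visible so hip://N still points at the same GPU.
    return "ROCR_VISIBLE_DEVICES=" + ",".join(
        str(i) for i in range(hip_device_arg + 1)
    )


assert rocr_visible_devices_for("hip://0") == "ROCR_VISIBLE_DEVICES=0"
assert rocr_visible_devices_for("hip://7") == "ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7"
```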
65 changes: 34 additions & 31 deletions sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -48,7 +48,6 @@ def setUpClass(cls):
        cls.directory_created = True

    def setUp(self):
-        self.hip_device_id = os.getenv("HIP_DEVICE_ID", default="0")
        self.compile_args = [
            "--iree-dispatch-creation-enable-aggressive-fusion=true",
            "--iree-global-opt-propagate-transposes=true",
@@ -181,15 +180,15 @@ def testBenchmark8B_f16_Decomposed(self):
        )
        # benchmark prefill
        self.llama8b_f16_decomposed_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_prefill_args,
            cwd=self.repo_root,
        )
        # benchmark decode
        self.llama8b_f16_decomposed_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_decode_args,
@@ -223,7 +222,7 @@ def testBenchmark8B_f16_Non_Decomposed_Prefill(self):
        )
        # benchmark prefill
        self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_prefill_nondecomposed_args_fp16,
@@ -257,15 +256,15 @@ def testBenchmark8B_f16_Non_Decomposed(self):
        )
        # benchmark prefill
        self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_prefill_args,
            cwd=self.repo_root,
        )
        # benchmark decode
        self.llama8b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_decode_args,
@@ -297,15 +296,15 @@ def testBenchmark8B_fp8_Decomposed(self):
        )
        # benchmark prefill
        self.llama8b_fp8_decomposed_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path_fp8,
            args=self.iree_run_prefill_args,
            cwd=self.repo_root,
        )
        # benchmark decode
        self.llama8b_fp8_decomposed_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path_fp8,
            args=self.iree_run_decode_args,
@@ -337,15 +336,15 @@ def testBenchmark8B_fp8_Non_Decomposed(self):
        )
        # benchmark prefill
        self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path_fp8,
            args=self.iree_run_prefill_args,
            cwd=self.repo_root,
        )
        # benchmark decode
        self.llama8b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path_fp8,
            args=self.iree_run_decode_args,
@@ -481,15 +480,15 @@ def testBenchmark70B_f16_TP8_Decomposed(self):
        )
        # benchmark prefill
        self.llama70b_f16_decomposed_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_prefill_args,
            cwd=self.repo_root,
        )
        # benchmark decode
        self.llama70b_f16_decomposed_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_decode_args,
@@ -528,22 +527,24 @@ def testBenchmark70B_f16_TP8_Non_Decomposed(self):
        )
        # benchmark prefill
        self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_prefill_args,
            cwd=self.repo_root,
        )
        # benchmark decode
        self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_decode_args,
            cwd=self.repo_root,
        )

-    @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
+    @pytest.mark.xfail(
+        reason="70b fp8 irpa does not exist", strict=True, raises=ExportMlirException
+    )
    def testBenchmark70B_fp8_TP8_Decomposed(self):
        output_file_name = self.dir_path_70b / "fp8_decomposed"
        output_mlir = self.llama70b_fp8_decomposed_artifacts.create_file(
@@ -574,22 +575,24 @@ def testBenchmark70B_fp8_TP8_Decomposed(self):
        )
        # benchmark prefill
        self.llama70b_fp8_decomposed_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path_fp8,
            args=self.iree_run_prefill_args,
            cwd=self.repo_root,
        )
        # benchmark decode
        self.llama70b_fp8_decomposed_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path_fp8,
            args=self.iree_run_decode_args,
            cwd=self.repo_root,
        )

-    @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
+    @pytest.mark.xfail(
+        reason="70b fp8 irpa does not exist", strict=True, raises=ExportMlirException
+    )
    def testBenchmark70B_fp8_TP8_Non_Decomposed(self):
        output_file_name = self.dir_path_70b / "fp8_torch"
        output_mlir = self.llama70b_fp8_torch_sdpa_artifacts.create_file(
@@ -603,7 +606,7 @@ def testBenchmark70B_fp8_TP8_Non_Decomposed(self):
        )
        output_shard_file_name = (
            self.artifacts_dir
-            / f"f8/tp8/llama3.1_70b_f8_tp{self.tensor_parallelism_size}_parameters.irpa"
+            / f"f8/tp8/llama3.1_70b_fp8_tp{self.tensor_parallelism_size}_parameters.irpa"
        )
        if output_shard_file_name.exists():
            self.irpa_path = output_shard_file_name
@@ … @@
        )
        # benchmark prefill
        self.llama70b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path_fp8,
            args=self.iree_run_prefill_args,
            cwd=self.repo_root,
        )
        # benchmark decode
        self.llama70b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path_fp8,
            args=self.iree_run_decode_args,
@@ -764,15 +767,15 @@ def testBenchmark405B_f16_TP8_Decomposed(self):
        )
        # benchmark prefill
        self.llama405b_f16_decomposed_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_prefill_args,
            cwd=self.repo_root,
        )
        # benchmark decode
        self.llama405b_f16_decomposed_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_decode_args,
@@ -814,15 +817,15 @@ def testBenchmark405B_f16_TP8_Non_Decomposed(self):
        )
        # benchmark prefill
        self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_prefill_args,
            cwd=self.repo_root,
        )
        # benchmark decode
        self.llama405b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path,
            args=self.iree_run_decode_args,
@@ -845,7 +848,7 @@ def testBenchmark405B_fp8_TP8_Decomposed(self):
        )
        output_shard_file_name = (
            self.artifacts_dir
-            / f"f8/tp8/llama3.1_405b_f8_tp{self.tensor_parallelism_size}_parameters.irpa"
+            / f"f8/tp8/llama3.1_405b_fp8_tp{self.tensor_parallelism_size}_parameters.irpa"
        )
        if output_shard_file_name.exists():
            self.irpa_path = output_shard_file_name
@@ … @@
        )
        # benchmark prefill
        self.llama405b_fp8_decomposed_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path_fp8,
            args=self.iree_run_prefill_args,
            cwd=self.repo_root,
        )
        # benchmark decode
        self.llama405b_fp8_decomposed_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path_fp8,
            args=self.iree_run_decode_args,
@@ -893,7 +896,7 @@ def testBenchmark405B_fp8_TP8_Non_Decomposed(self):
        )
        output_shard_file_name = (
            self.artifacts_dir
-            / f"f8/tp8/llama3.1_405b_f8_tp{self.tensor_parallelism_size}_parameters.irpa"
+            / f"f8/tp8/llama3.1_405b_fp8_tp{self.tensor_parallelism_size}_parameters.irpa"
        )
        if output_shard_file_name.exists():
            self.irpa_path = output_shard_file_name
@@ … @@
        )
        # benchmark prefill
        self.llama405b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path_fp8,
            args=self.iree_run_prefill_args,
            cwd=self.repo_root,
        )
        # benchmark decode
        self.llama405b_fp8_torch_sdpa_artifacts.iree_benchmark_vmfb(
-            hip_device_id=self.hip_device_id,
+            hip_device_id=self.iree_device,
            vmfb_name=output_vmfb,
            irpa_path=self.irpa_path_fp8,
            args=self.iree_run_decode_args,
