Add prefill 8B f16 torch sdpa test, update tests with compile flags and tp flags, with nightly iree #456

Merged 12 commits on Nov 20, 2024
17 changes: 10 additions & 7 deletions .github/workflows/ci-llama-large-tests.yaml
@@ -64,23 +64,26 @@ jobs:
pip install --no-compile -r pytorch-cpu-requirements.txt
pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/

# Install latest iree-tubrine.
# Install latest iree-turbine.
pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"

# Test with pinned nightly releases, not what iree-turbine uses.
pip install -f https://iree.dev/pip-release-links.html --upgrade \
iree-base-compiler==3.0.0rc20241118 \
iree-base-runtime==3.0.0rc20241118

# Test with nightly releases, not what iree-turbine uses.
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler \
iree-base-runtime

- name: Run llama tests
run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-all-llama --iree-hip-target=gfx942 --html=out/index.html
run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --run-nightly-llama-tests --iree-hip-target=gfx942 --html=out/index.html

- name: Deploy to GitHub Pages
uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
with:
github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
publish_dir: ./out
publish_dir: ./out/llm/llama/benchmarks
destination_dir: ./llm/llama/benchmarks
keep_files: true

- name: Upload llama executable files
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
15 changes: 8 additions & 7 deletions .github/workflows/ci-llama-quick-tests.yaml
@@ -65,17 +65,18 @@ jobs:
pip install --no-compile -r pytorch-cpu-requirements.txt
pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/

# Install latest iree-tubrine.
# Install latest iree-turbine.
pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"

# Test with pinned nightly releases, not what iree-turbine uses.
pip install -f https://iree.dev/pip-release-links.html --upgrade \
iree-base-compiler==3.0.0rc20241118 \
iree-base-runtime==3.0.0rc20241118

- name: Run llama 8b tests
run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-8b-llama
# Test with nightly releases, not what iree-turbine uses.
pip install -f https://iree.dev/pip-release-links.html --upgrade --pre \
iree-base-compiler \
iree-base-runtime

- name: Run llama 8b f16 decomposed test
run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --iree-hip-target=gfx942 --run-quick-llama-test

- name: Upload llama executable files
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
10 changes: 5 additions & 5 deletions sharktank/conftest.py
@@ -73,17 +73,17 @@ def pytest_addoption(parser):
)

parser.addoption(
"--run-8b-llama",
"--run-quick-llama-test",
action="store_true",
dest="run-8b-llama",
dest="run-quick-llama-test",
default=False,
help="Enable llama 8b benchmarking tests",
help="Enable llama 8b f16 decomposed benchmarking test",
)

parser.addoption(
"--run-all-llama",
"--run-nightly-llama-tests",
action="store_true",
dest="run-all-llama",
dest="run-nightly-llama-tests",
default=False,
help="Enable all llama benchmarking tests",
)
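For orientation (a sketch, not part of this diff), a benchmark module would typically gate itself on the renamed options through pytest's config API; the fixture and test names below are illustrative:

```python
# Hypothetical sketch: gating a benchmark on the renamed pytest options.
# The option strings match conftest.py above; everything else is illustrative.
import pytest


@pytest.fixture
def run_nightly(request) -> bool:
    # getoption() resolves the flag to the dest declared in pytest_addoption().
    return request.config.getoption("--run-nightly-llama-tests")


def test_llama_70b_f16_tp8_decomposed_benchmark(run_nightly):
    if not run_nightly:
        pytest.skip("enable with --run-nightly-llama-tests")
    # ... benchmark body would go here ...
```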
4 changes: 3 additions & 1 deletion sharktank/sharktank/examples/export_paged_llm_v1.py
@@ -54,6 +54,7 @@ def main():
help="Enables strictness during export",
action="store_true",
)

cli.add_quantization_options(parser)
cli.add_model_options(parser)
args = cli.parse(parser)
@@ -312,7 +313,8 @@ def _(
bsizes = []
for bs in args.bs:
generate_batch_prefill(bs)
generate_batch_decode(bs)
if not args.skip_decode:
generate_batch_decode(bs)
bsizes.append(bs)
config = generate_params_json(hp, bsizes, bsizes)
print("GENERATED!")
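As a rough illustration of the new prefill-only path (not code from this PR), the exporter can be invoked the same way export_artifacts.py drives it, with the new flag appended. Only --output-config, --bs, --attention-kernel, and --skip-decode are confirmed by the diffs here; the dataset and MLIR output flags and all paths are assumptions:

```python
# Illustrative sketch: export prefill-only entry points with --skip-decode.
# The --irpa-file and --output-mlir flags, and all file paths, are assumptions.
import subprocess

export_cmd = [
    "python3",
    "-m",
    "sharktank.examples.export_paged_llm_v1",
    "--irpa-file=/path/to/llama3_8b_f16.irpa",   # placeholder dataset (assumed flag)
    "--output-mlir=llama3_8b_f16_prefill.mlir",  # assumed flag
    "--output-config=llama3_8b_f16_prefill.json",
    "--bs=4",
    "--attention-kernel=torch",
    "--skip-decode",  # new flag: generate_batch_decode() is skipped
]
subprocess.run(export_cmd, check=True)
```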
5 changes: 5 additions & 0 deletions sharktank/sharktank/utils/cli.py
@@ -69,6 +69,11 @@ def add_model_options(parser: argparse.ArgumentParser):
default="decomposed",
choices=["decomposed", "torch"],
)
parser.add_argument(
"--skip-decode",
help="Enables prefill only, skips decode",
action="store_true",
)


def add_quantization_options(parser: argparse.ArgumentParser):
14 changes: 10 additions & 4 deletions sharktank/sharktank/utils/export_artifacts.py
@@ -123,15 +123,15 @@ def wrapper(*args, **kwargs):
def shard_irpa_file(
self,
*,
gguf_file: str,
irpa_file: str,
output_irpa: str,
):
shard_irpa_args = [
"python3",
"-m",
"sharktank.examples.sharding.shard_llm_dataset",
"--gguf-file",
gguf_file,
"--irpa-file",
irpa_file,
"--output-irpa-file",
output_irpa,
"--tensor-parallelism-size",
@@ -160,6 +160,7 @@ def export_to_mlir(
*,
mlir_path: str,
json_path: str,
skip_decode: Optional[bool] = None,
):
export_args = [
"python3",
@@ -170,6 +171,8 @@
f"--output-config={json_path}",
f"--bs={str(self.batch_size)}",
]
if skip_decode:
export_args.append("--skip-decode")
if self.attention_kernel in ["decomposed", "torch"]:
export_args.append("--attention-kernel")
export_args.append(self.attention_kernel)
@@ -195,6 +198,7 @@ def compile_to_vmfb(
vmfb_path,
cwd,
hal_dump_path: Optional[Path] = None,
args: Optional[List[str]] = None,
):
# TODO: Control flag to enable multiple backends
compile_args = [
@@ -214,7 +218,9 @@
compile_args += [
f"--iree-hal-dump-executable-files-to={hal_dump_path}/files"
]

# Append optional arguments if provided
if args:
compile_args += args
cmd = subprocess.list2cmdline(compile_args)

logging.getLogger().info(f"Launching compile command:\n" f"cd {cwd} && {cmd}")
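Tying these hooks together (a sketch under assumptions, not code from the PR): a test can now forward extra iree-compile flags through the new args parameter of compile_to_vmfb(). The helper instance, the mlir_path keyword, and the commented example flag are assumptions; the flag value only mirrors the --iree-hip-target used elsewhere in this PR:

```python
# Illustrative sketch: threading extra compile flags through compile_to_vmfb().
# `artifacts` stands in for an already-constructed export-artifacts helper whose
# constructor is not shown in this diff; paths and flag values are placeholders.
from pathlib import Path
from typing import List


def compile_with_extra_flags(artifacts, mlir_path: str, vmfb_path: str,
                             extra_flags: List[str]) -> None:
    artifacts.compile_to_vmfb(
        mlir_path=mlir_path,
        vmfb_path=vmfb_path,
        cwd=Path.cwd(),
        args=extra_flags,  # appended to the base iree-compile invocation
    )


# Example call (the flag mirrors the --iree-hip-target passed to the test suite):
# compile_with_extra_flags(artifacts, "llama3_8b_f16.mlir", "llama3_8b_f16.vmfb",
#                          ["--iree-hip-target=gfx942"])
```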
14 changes: 14 additions & 0 deletions sharktank/tests/models/llama/README.md
@@ -0,0 +1,14 @@
# How to run Llama 3.1 Benchmarking Tests
To run the Llama 3.1 8B F16 Decomposed test:
```
pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s \
  --run-quick-llama-test --iree-hip-target=gfx942
```

To filter by test, use the `-k` option. For example, to run
only the Llama 3.1 70B F16 Decomposed test:
```
pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s \
--run-nightly-llama-tests --iree-hip-target=gfx942 \
-k 'testBenchmark70B_f16_TP8_Decomposed'
```