Integrate distributed inference with chat/server #3392
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow "pull": triggered by pull requests, by pushes to `main`,
# and manually through workflow_dispatch.
name: pull

on:
  pull_request:
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
# Produces the model matrix (as a job output) that the test-cpu-* jobs
# expand with fromJSON().
gather-models-cpu:
  runs-on: ubuntu-22.04
  outputs:
    # Forwarded from the step whose id is `gather-models-cpu`.
    models: ${{ steps.gather-models-cpu.outputs.models }}
  steps:
    - uses: actions/checkout@v3
      with:
        submodules: "false"
    - uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - id: gather-models-cpu
      name: Extract the list of models to run on CPU
      run: |
        set -eux
        PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "cpu"
# Runs validate.sh in "compile" mode on CPU for every entry of the
# matrix published by gather-models-cpu.
test-cpu-compile:
  name: test-cpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-cpu
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
  env:
    REPO_NAME: ${{ matrix.repo_name }}
    TORCHCHAT_ROOT: ${{ github.workspace }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3

    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"

    - name: Print machine info
      run: |
        echo "$(uname -a)"

    - name: Install dependencies
      run: |
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

    - name: Download checkpoints
      run: |
        bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"

    # Converts the downloaded checkpoint, then validates the resulting model.pth.
    - name: Run validation
      run: |
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        pushd ${TORCHCHAT_ROOT}
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "compile"
# Runs validate.sh in "aoti" mode on CPU for every entry of the
# matrix published by gather-models-cpu.
test-cpu-aoti:
  name: test-cpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-cpu
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
  env:
    REPO_NAME: ${{ matrix.repo_name }}
    TORCHCHAT_ROOT: ${{ github.workspace }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3

    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"

    - name: Print machine info
      run: |
        echo "$(uname -a)"

    - name: Install dependencies
      run: |
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

    - name: Download checkpoints
      run: |
        bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"

    # Converts the downloaded checkpoint, then validates the resulting model.pth.
    - name: Run validation
      run: |
        pushd ${TORCHCHAT_ROOT}
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "aoti"
# Runs validate.sh in "eval_sanity_check-bfloat16" mode on CPU for every
# entry of the matrix published by gather-models-cpu.
test-cpu-eval-sanity-check:
  name: test-cpu-eval-sanity-check (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-cpu
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
  env:
    REPO_NAME: ${{ matrix.repo_name }}
    TORCHCHAT_ROOT: ${{ github.workspace }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3

    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"

    - name: Print machine info
      run: |
        echo "$(uname -a)"

    - name: Install dependencies
      run: |
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

    - name: Download checkpoints
      run: |
        bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"

    # Converts the downloaded checkpoint, then validates the resulting model.pth.
    - name: Run validation
      run: |
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        pushd ${TORCHCHAT_ROOT}
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-bfloat16"
# Runs validate.sh in "eval_sanity_check-float16" mode on CPU for every
# entry of the matrix published by gather-models-cpu.
test-cpu-eval-sanity-check-float16:
  name: test-cpu-eval-sanity-check-float16 (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-cpu
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
  env:
    REPO_NAME: ${{ matrix.repo_name }}
    TORCHCHAT_ROOT: ${{ github.workspace }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3

    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"

    - name: Print machine info
      run: |
        echo "$(uname -a)"

    - name: Install dependencies
      run: |
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

    - name: Download checkpoints
      run: |
        bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"

    # Converts the downloaded checkpoint, then validates the resulting model.pth.
    - name: Run validation
      run: |
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        pushd ${TORCHCHAT_ROOT}
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-float16"
# Runs validate.sh in "eval_sanity_check-float32" mode on CPU for every
# entry of the matrix published by gather-models-cpu.
test-cpu-eval-sanity-check-float32:
  name: test-cpu-eval-sanity-check-float32 (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-cpu
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
  env:
    REPO_NAME: ${{ matrix.repo_name }}
    TORCHCHAT_ROOT: ${{ github.workspace }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3

    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"

    - name: Print machine info
      run: |
        echo "$(uname -a)"

    - name: Install dependencies
      run: |
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

    - name: Download checkpoints
      run: |
        bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"

    # Converts the downloaded checkpoint, then validates the resulting model.pth.
    - name: Run validation
      run: |
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        pushd ${TORCHCHAT_ROOT}
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-float32"
# Produces the model matrix (as a job output) that the test-gpu-* jobs
# expand with fromJSON().
gather-models-gpu:
  runs-on: ubuntu-22.04
  outputs:
    # Forwarded from the step whose id is `gather-models-gpu`.
    models: ${{ steps.gather-models-gpu.outputs.models }}
  steps:
    - uses: actions/checkout@v3
      with:
        submodules: "false"
    - uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - id: gather-models-gpu
      name: Extract the list of models to run on GPU
      run: |
        set -eux
        PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "gpu"
# Calls the reusable linux_job.yml workflow to run validate.sh in
# "compile" mode on CUDA for every entry of the gather-models-gpu matrix.
test-gpu-compile:
  name: test-gpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-gpu
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  with:
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: "12.1"
    # Each phase is wrapped in a ::group:: marker so it folds in the job log.
    script: |
      echo "::group::Print machine info"
      nvidia-smi
      echo "::endgroup::"

      echo "::group::Install required packages"
      ./install/install_requirements.sh cuda
      pip3 list
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      echo "::endgroup::"

      echo "::group::Download checkpoint"
      export REPO_NAME=${{ matrix.repo_name }}
      bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
      echo "::endgroup::"

      echo "::group::Convert checkpoint"
      bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
      echo "::endgroup::"

      echo "::group::Run inference"
      bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "compile"
      echo "::endgroup::"
# Calls the reusable linux_job.yml workflow to run validate.sh in
# "aoti-bfloat16" mode on CUDA for every entry of the gather-models-gpu matrix.
test-gpu-aoti-bfloat16:
  name: test-gpu-aoti-bfloat16 (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-gpu
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  with:
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: "12.1"
    timeout: 60
    # Each phase is wrapped in a ::group:: marker so it folds in the job log.
    script: |
      echo "::group::Print machine info"
      nvidia-smi
      echo "::endgroup::"

      echo "::group::Install newer objcopy that supports --set-section-alignment"
      yum install -y devtoolset-10-binutils
      export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
      echo "::endgroup::"

      echo "::group::Install required packages"
      ./install/install_requirements.sh cuda
      pip3 list
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      echo "::endgroup::"

      echo "::group::Download checkpoint"
      export REPO_NAME=${{ matrix.repo_name }}
      bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
      echo "::endgroup::"

      echo "::group::Convert checkpoint"
      bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
      echo "::endgroup::"

      echo "::group::Run inference"
      bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-bfloat16"
      echo "::endgroup::"
# Calls the reusable linux_job.yml workflow to run validate.sh in
# "aoti-float32" mode on CUDA, followed by a generate run that uses the
# torchchat/quant_config/cuda.json quantize file.
test-gpu-aoti-float32:
  name: test-gpu-aoti-float32 (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-gpu
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  with:
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: "12.1"
    # Each phase is wrapped in a ::group:: marker so it folds in the job log.
    script: |
      echo "::group::Print machine info"
      nvidia-smi
      echo "::endgroup::"

      echo "::group::Install newer objcopy that supports --set-section-alignment"
      yum install -y devtoolset-10-binutils
      export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
      echo "::endgroup::"

      echo "::group::Install required packages"
      ./install/install_requirements.sh cuda
      pip list
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      echo "::endgroup::"

      echo "::group::Download checkpoint"
      export REPO_NAME=${{ matrix.repo_name }}
      bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
      echo "::endgroup::"

      echo "::group::Convert checkpoint"
      bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
      echo "::endgroup::"

      echo "::group::Run inference"
      bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-float32"
      echo "::endgroup::"

      echo "::group::Run inference with quantize file"
      if [ $(uname -s) != Darwin ]; then
        python3 torchchat.py generate --quantize torchchat/quant_config/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
      fi
      echo "::endgroup::"
# Calls the reusable linux_job.yml workflow to run validate.sh in
# "aoti-float16" mode on CUDA, followed by an AOTI export/generate pair
# that uses the torchchat/quant_config/cuda.json quantize file.
#
# NOTE(review): the quantize-file section is guarded by
# `$(uname -s) == Darwin`, while this job requests the
# linux.g5.4xlarge.nvidia.gpu runner through linux_job.yml, so that
# section is presumably never executed here. Confirm whether the guard
# is a deliberate way of disabling it; it is left unchanged.
test-gpu-aoti-float16:
  name: test-gpu-aoti-float16 (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-gpu
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  with:
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: "12.1"
    # Each phase is wrapped in a ::group:: marker so it folds in the job log.
    script: |
      echo "::group::Print machine info"
      nvidia-smi
      echo "::endgroup::"

      echo "::group::Install newer objcopy that supports --set-section-alignment"
      yum install -y devtoolset-10-binutils
      export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
      echo "::endgroup::"

      echo "::group::Install required packages"
      ./install/install_requirements.sh cuda
      pip list
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      echo "::endgroup::"

      echo "::group::Download checkpoint"
      export REPO_NAME=${{ matrix.repo_name }}
      bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
      echo "::endgroup::"

      echo "::group::Convert checkpoint"
      bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
      echo "::endgroup::"

      echo "::group::Run inference"
      bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-float16"
      echo "::endgroup::"

      echo "::group::Run inference with quantize file"
      if [ $(uname -s) == Darwin ]; then
        python3 torchchat.py export --output-aoti-package-path /tmp/model.pt2 --quantize torchchat/quant_config/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
        # A stray "~" used to trail the checkpoint path here, which made the
        # argument ".../model.pth~" instead of the converted checkpoint.
        python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
      fi
      echo "::endgroup::"
# Calls the reusable linux_job.yml workflow to run validate.sh in
# "eval_sanity_check" mode on CUDA for every entry of the
# gather-models-gpu matrix.
test-gpu-eval-sanity-check:
  name: test-gpu-eval-sanity-check (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-gpu
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  with:
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: "12.1"
    # Each phase is wrapped in a ::group:: marker so it folds in the job log.
    script: |
      echo "::group::Print machine info"
      nvidia-smi
      echo "::endgroup::"

      echo "::group::Install newer objcopy that supports --set-section-alignment"
      yum install -y devtoolset-10-binutils
      export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
      echo "::endgroup::"

      echo "::group::Install required packages"
      ./install/install_requirements.sh cuda
      pip3 list
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      echo "::endgroup::"

      echo "::group::Download checkpoint"
      export REPO_NAME=${{ matrix.repo_name }}
      bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
      echo "::endgroup::"

      echo "::group::Convert checkpoint"
      bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
      echo "::endgroup::"

      echo "::group::Run eval"
      bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "eval_sanity_check"
      echo "::endgroup::"
# Exports the stories15M checkpoint (and a TinyLlama GGUF file) to
# ExecuTorch .pte files under several quantization configs and runs
# generation against each exported program.
test-tinystories-executorch:
  strategy:
    matrix:
      runner:
        - 16-core-ubuntu
        - macos-14-xlarge
  runs-on: ${{ matrix.runner }}
  steps:
    # Action versions match the ones used by the CPU jobs of this workflow.
    - name: Checkout repo
      uses: actions/checkout@v3

    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"

    - name: Setup Xcode
      if: runner.os == 'macOS'
      uses: maxim-lobanov/setup-xcode@v1
      with:
        xcode-version: "15.3"

    - name: Print machine info
      run: |
        uname -a
        if [ $(uname -s) == Darwin ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi

    - name: Install requirements
      run: |
        echo "Installing pip3 packages"
        ./install/install_requirements.sh
        export TORCHCHAT_ROOT=$PWD
        ./torchchat/utils/scripts/install_et.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, torchvision.version.git_version}")'
        # Only affects the remainder of this step; later steps start from
        # the default working directory again.
        cd ../..
        echo "Inside: ${PWD}"

    - name: Download checkpoints
      run: |
        mkdir -p checkpoints/stories15M
        pushd checkpoints/stories15M
        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
        popd

        mkdir -p gguf_files
        export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
        export GGUF_TOKENIZER_PATH=gguf_files/tokenizer.model
        wget -O ${GGUF_PATH} "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
        wget -O ${GGUF_TOKENIZER_PATH} https://github.com/karpathy/llama2.c/raw/master/tokenizer.model

    - name: Run inference
      run: |
        export MODEL_PATH=${PWD}/checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M

        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --device cpu

        python torchchat.py export --checkpoint-path ${MODEL_PATH} --output-pte-path ${PWD}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${PWD}/${MODEL_NAME}.pte

        echo "Tests complete."

    # Previously also named "Run inference", which made the two steps
    # indistinguishable in the job log.
    - name: Run export + inference across quantization configs
      run: |
        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M
        export MODEL_DIR=/tmp

        echo "******************************************"
        echo "*** vanilla ***"
        echo "******************************************"
        python torchchat.py export --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "*** --quantize torchchat/quant_config/mobile.json ***"
        echo "******************************************"
        # python torchchat.py export --quantize torchchat/quant_config/mobile.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "******* Emb: channel-wise quantized ******"
        echo "******************************************"
        python torchchat.py export --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "******** Emb: group-wise quantized *******"
        echo "******************************************"
        python torchchat.py export --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        # The two "Emb 4bit" sections used bitwidth 8, which only repeated
        # the 8-bit embedding runs above; they now request 4-bit as labelled.
        echo "******************************************"
        echo "**** Emb 4bit: channel-wise quantized ****"
        echo "******************************************"
        python torchchat.py export --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "****** Emb 4bit: group-wise quantized ****"
        echo "******************************************"
        python torchchat.py export --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "******* INT8 channel-wise quantized ******"
        echo "******************************************"
        python torchchat.py export --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "******** INT8 group-wise quantized *******"
        echo "******************************************"
        python torchchat.py export --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "******** ET: a8w4dq INT4 group-wise quantized *******"
        echo "******************************************"
        python torchchat.py export --quant '{"linear:a8w4dq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "******** INT4 group-wise quantized *******"
        echo "******************************************"
        # python torchchat.py export --quant '{"linear:int4" : {"groupsize": 32}}' --tokenizer-path ${TOKENIZER_PATH} --gguf-path ${GGUF_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        # python3 torchchat.py generate --tokenizer-path ${TOKENIZER_PATH} --gguf-path ${GGUF_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "tests complete"
        echo "******************************************"

    - name: Run GGUF export + inference
      run: |
        export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
        export GGUF_TOKENIZER_PATH=gguf_files/tokenizer.model
        # Variables exported in earlier steps are not visible here, so
        # MODEL_NAME was empty and the program was written to "${PWD}/.pte".
        export MODEL_NAME=TinyLlama-1.1B-openorca.Q4_0

        python torchchat.py export --gguf-path ${GGUF_PATH} --output-pte-path ${PWD}/${MODEL_NAME}.pte
        python torchchat.py generate --gguf-path ${GGUF_PATH} --pte-path ${PWD}/${MODEL_NAME}.pte --tokenizer-path ${GGUF_TOKENIZER_PATH} --temperature 0 --max-new-tokens 20

        echo "Tests complete."
# Exercises the torchchat CLI on macOS: generation from a local
# stories15M checkpoint, then the list / download / generate / remove
# subcommands.
torchchat-command-load-test:
  strategy:
    matrix:
      runner:
        - macos-14
  runs-on: ${{ matrix.runner }}
  steps:
    # Action versions match the ones used by the CPU jobs of this workflow
    # (this job previously pinned the older v2 releases).
    - name: Checkout repo
      uses: actions/checkout@v3

    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"

    - name: Print machine info
      run: |
        uname -a
        if [ $(uname -s) == Darwin ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi

    - name: Install requirements
      run: |
        echo "Installing pip3 packages"
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

    - name: Download Stories files
      run: |
        mkdir -p checkpoints/stories15M
        pushd checkpoints/stories15M
        curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
        curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
        popd

    # The same generate command is issued once via `python3` and once via `python`.
    - name: Test generate
      run: |
        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M
        export MODEL_DIR=/tmp
        python3 torchchat.py generate --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0
        python torchchat.py generate --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0
        echo "Tests complete."

    - name: Test download
      run: |
        python torchchat.py list
        python torchchat.py download stories15m
        python torchchat.py generate stories15M --device cpu
        python torchchat.py remove stories15m
# Runs stories15M generation on the MPS device, unquantized and with
# embedding, linear:int8 and linear:int4 quantization configs.
test-mps:
  strategy:
    matrix:
      runner:
        - macos-m1-stable
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v2

    - name: Setup Python
      uses: actions/setup-python@v2
      with:
        python-version: "3.10.11"

    - name: Print machine info
      run: |
        uname -a
        if [ $(uname -s) == Darwin ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi

    - name: Run test
      run: |
        export PYTHON_VERSION="3.10"
        set -x
        # NS/MC: Remove previous installation of torch and torchao first
        # as this script does not install anything into conda env but rather as system dep
        pip3 uninstall -y torch || true
        set -eou pipefail

        pip3 uninstall -y torchao || true
        set -eou pipefail

        echo "::group::Print machine info"
        uname -a
        sysctl machdep.cpu.brand_string
        sysctl machdep.cpu.core_count
        echo "::endgroup::"

        echo "::group::Install requirements"
        # Install requirements
        ./install/install_requirements.sh
        ls -la
        pwd
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        echo "::endgroup::"

        echo "::group::Download checkpoints"
        (
          mkdir -p checkpoints/stories15M
          pushd checkpoints/stories15M
          curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
          curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
          popd
        )
        echo "::endgroup::"

        echo "::group::Run inference"
        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M
        export MODEL_DIR=/tmp

        python3 torchchat.py generate --device mps --checkpoint-path ${MODEL_PATH} --temperature 0

        echo "************************************************************"
        echo "*** embedding"
        echo "************************************************************"
        python3 torchchat.py generate --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
        python3 torchchat.py generate --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0

        echo "************************************************************"
        echo "*** linear int8"
        echo "************************************************************"
        python3 torchchat.py generate --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
        python3 torchchat.py generate --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0

        echo "************************************************************"
        echo "*** linear int4"
        echo "************************************************************"
        PYTORCH_ENABLE_MPS_FALLBACK=1 python3 torchchat.py generate --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0
# Builds llama.cpp, requantizes a TinyLlama Q4_0 GGUF file to F32 with
# llama-quantize, and passes both files to
# torchchat.utils.gguf_loader.test_by_to_float.
test-gguf-util:
  strategy:
    matrix:
      runner:
        - macos-14
  runs-on: ${{ matrix.runner }}
  steps:
    # Action versions match the ones used by the CPU jobs of this workflow
    # (this job previously pinned the older v2 releases).
    - name: Checkout repo
      uses: actions/checkout@v3

    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"

    - name: Print machine info
      run: |
        uname -a
        if [ $(uname -s) == Darwin ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi

    - name: Install requirements
      run: |
        echo "Installing pip3 packages"
        pip3 install gguf
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

        git clone https://github.com/ggerganov/llama.cpp.git
        pushd llama.cpp
        make
        popd

    - name: Download GGUF files
      run: |
        mkdir -p gguf_files
        wget -O gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
        ./llama.cpp/llama-quantize --allow-requantize gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf gguf_files/TinyLlama-1.1B-openorca.Q4_0.requant_F32.gguf F32

    # Generates a two-line test.py on the fly and runs it.
    - name: Load files
      run: |
        touch test.py
        echo "from torchchat.utils.gguf_loader import test_by_to_float" >> test.py
        echo "test_by_to_float(\"gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf\", \"gguf_files/TinyLlama-1.1B-openorca.Q4_0.requant_F32.gguf\")" >> test.py
        cat test.py
        python test.py
        echo "Tests complete."
# Runs stories15M generation on the MPS device for each dtype in
# {float16, float32}, unquantized and with embedding, linear:int8 and
# linear:int4 quantization configs.
test-mps-dtype:
  strategy:
    matrix:
      runner:
        - macos-m1-stable
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v2

    - name: Setup Python
      uses: actions/setup-python@v2
      with:
        python-version: "3.10.11"

    - name: Print machine info
      run: |
        uname -a
        if [ $(uname -s) == Darwin ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi

    - name: Run test
      run: |
        export PYTHON_VERSION="3.10"
        set -x
        # NS/MC: Remove previous installation of torch and torchao first
        # as this script does not install anything into conda env but rather as system dep
        pip3 uninstall -y torch || true
        set -eou pipefail

        pip3 uninstall -y torchao || true
        set -eou pipefail

        echo "::group::Print machine info"
        uname -a
        sysctl machdep.cpu.brand_string
        sysctl machdep.cpu.core_count
        echo "::endgroup::"

        echo "::group::Install requirements"
        # Install requirements
        ./install/install_requirements.sh
        ls -la
        pwd
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        echo "::endgroup::"

        echo "::group::Download checkpoints"
        (
          mkdir -p checkpoints/stories15M
          pushd checkpoints/stories15M
          curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
          curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
          popd
        )
        echo "::endgroup::"

        echo "::group::Run inference"
        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M
        export MODEL_DIR=/tmp

        for DTYPE in float16 float32; do
          # if [ $(uname -s) == Darwin ]; then
          #   export DTYPE=float16
          # fi
          python3 torchchat.py generate --dtype ${DTYPE} --device mps --checkpoint-path ${MODEL_PATH} --temperature 0
          python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
          python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
          python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
          python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
          PYTORCH_ENABLE_MPS_FALLBACK=1 python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0
        done
# Loads a Q4_0 GGUF checkpoint on a macOS runner and runs `generate` on CPU,
# eager and compiled, with unquantized, channel-wise and group-wise embeddings.
compile-gguf:
  strategy:
    matrix:
      runner: [macos-14]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        # Quoted so the version is always read as a string.
        python-version: '3.10.11'
    - name: Print machine info
      run: |
        uname -a
        if [ "$(uname -s)" = "Darwin" ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install requirements
      run: |
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Download GGUF
      run: |
        # -p keeps the step from failing if the directory already exists.
        mkdir -p gguf_files
        export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
        export TOKENIZER_PATH=gguf_files/tokenizer.model
        wget -O "${GGUF_PATH}" "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
        wget -O "${TOKENIZER_PATH}" https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
    - name: Run inference
      run: |
        # Must match the paths written by the "Download GGUF" step;
        # exports do not carry over between steps.
        export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
        export TOKENIZER_PATH=gguf_files/tokenizer.model
        export MODEL_NAME=TinyLlama-1.1B-openorca.Q4_0.gguf
        export MODEL_DIR=/tmp

        echo "******************************************"
        echo "******* Embed: not quantized *************"
        echo "******************************************"
        echo "Running eager"
        python3 torchchat.py generate --gguf-path "${GGUF_PATH}" --tokenizer-path "${TOKENIZER_PATH}" --max-new-tokens 20 --temperature 0 --device cpu
        echo "Running compiled"
        python3 torchchat.py generate --gguf-path "${GGUF_PATH}" --tokenizer-path "${TOKENIZER_PATH}" --max-new-tokens 20 --temperature 0 --compile --device cpu

        echo "******************************************"
        echo "******* Emb: channel-wise quantized ******"
        echo "******************************************"
        echo "Running eager"
        python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path "${GGUF_PATH}" --tokenizer-path "${TOKENIZER_PATH}" --max-new-tokens 20 --temperature 0 --device cpu
        echo "Running compiled"
        python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path "${GGUF_PATH}" --tokenizer-path "${TOKENIZER_PATH}" --max-new-tokens 20 --temperature 0 --compile --device cpu

        echo "******************************************"
        echo "******** Emb: group-wise quantized *******"
        echo "******************************************"
        echo "Running eager"
        python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path "${GGUF_PATH}" --tokenizer-path "${TOKENIZER_PATH}" --max-new-tokens 20 --temperature 0 --device cpu
        echo "Running compiled"
        python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path "${GGUF_PATH}" --tokenizer-path "${TOKENIZER_PATH}" --max-new-tokens 20 --temperature 0 --compile --device cpu

        echo "tests complete"
        echo "******************************************"
# Builds ExecuTorch (restored from cache when possible) and the native ET
# runner, then exports stories15M to .pte and runs it with the C++ runner
# for the default dtype and for fp32, fp16 and bf16.
runner-et:
  strategy:
    matrix:
      runner: [16-core-ubuntu, macos-14-xlarge]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
      with:
        submodules: true
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        # Quoted so the version is always read as a string.
        python-version: '3.10.11'
    - name: Setup Xcode
      if: runner.os == 'macOS'
      uses: maxim-lobanov/setup-xcode@v1
      with:
        xcode-version: '15.3'
    - name: Print machine info
      run: |
        uname -a
        if [ "$(uname -s)" = "Darwin" ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install torchchat
      run: |
        echo "Installing pip3 packages"
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Set ET git sha
      id: setup-hash
      run: |
        # Publish the pinned ExecuTorch commit to later steps; it is part of
        # the cache key below.
        export TORCHCHAT_ROOT=${PWD}
        echo "et-git-hash=$(cat "${TORCHCHAT_ROOT}/install/.pins/et-pin.txt")" >> "$GITHUB_ENV"
    - name: Load or install ET
      id: install-et
      uses: actions/cache@v4
      with:
        path: |
          ./et-build
          ./torchchat/utils/scripts
        key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }}
    - name: Install ExecuTorch (cache miss)
      if: ${{ steps.install-et.outputs.cache-hit != 'true' }}
      # NOTE(review): a failing install does not fail the job here; confirm
      # this best-effort behaviour is intended.
      continue-on-error: true
      run: |
        echo "Installing ExecuTorch"
        bash torchchat/utils/scripts/install_et.sh
    - name: Install ExecuTorch python
      run: |
        echo "Install ExecuTorch python"
        export TORCHCHAT_ROOT=$PWD
        export ET_BUILD_DIR="et-build"
        # A run step receives no positional arguments, so the former
        # "${1:-true}" default always evaluated to "true".
        ENABLE_ET_PYBIND=true
        source "torchchat/utils/scripts/install_utils.sh"
        install_executorch_python_libs "${ENABLE_ET_PYBIND}"
    - name: Install runner
      run: |
        echo "Installing runner"
        bash torchchat/utils/scripts/build_native.sh et
    - name: Run inference
      run: |
        python torchchat.py download stories15M
        wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model

        export PRMT="Once upon a time in a land far away"

        python torchchat.py generate stories15M --temperature 0 --prompt "${PRMT}" --device cpu

        python torchchat.py export stories15M --output-pte-path ./model.pte
        ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"

        for dtype in fp32 fp16 bf16; do
          echo "Testing export + runner with dtype=$dtype"
          python torchchat.py export stories15M --dtype "$dtype" --output-pte-path ./model.pte
          ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
        done

        echo "Tests complete."
# Builds the native AOTI runner, then exports the stories15M checkpoint to an
# AOTI package for each listed dtype and runs it with the C++ runner.
runner-aoti:
  strategy:
    matrix:
      runner: [16-core-ubuntu, macos-14-xlarge]
  runs-on: ${{ matrix.runner }}
  env:
    TORCHCHAT_ROOT: ${{ github.workspace }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
      with:
        submodules: true
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.10.11'
    - name: Print machine info
      run: |
        uname -a
    - name: Install dependencies
      run: |
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        bash torchchat/utils/scripts/build_native.sh aoti
    - name: Download checkpoint
      run: |
        mkdir -p checkpoints/stories15M
        pushd checkpoints/stories15M
        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
        popd
    - name: Run inference
      run: |
        # Conventional spelling; the previous "-eou pipefail" only worked
        # because -o consumed the following word as its argument.
        set -euo pipefail

        export MODEL_DIR="${PWD}/checkpoints/stories15M"
        export PROMPT="Once upon a time in a land far away"

        python torchchat.py generate --checkpoint-path "${MODEL_DIR}/stories15M.pt" --temperature 0 --prompt "${PROMPT}" --device cpu

        for dtype in fp32 fp16 bf16 fast fast16; do
          echo "Running export + runner with dtype=$dtype"
          python torchchat.py export --checkpoint-path "${MODEL_DIR}/stories15M.pt" --dtype "$dtype" --output-aoti-package-path /tmp/model.pt2
          ./cmake-out/aoti_run /tmp/model.pt2 -z "${MODEL_DIR}/tokenizer.model" -i "${PROMPT}"
        done

        echo "Tests complete."
# Builds the Android runner on a Linux runner via the shared test-infra
# reusable workflow, using Android NDK r26c unpacked under /tmp.
test-build-runner-et-android:
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  with:
    runner: linux.4xlarge
    script: |
      # This job only runs on Linux, so the macOS-only sysctl probes that
      # used to follow were unreachable and have been dropped.
      uname -a

      ./install/install_requirements.sh
      pip3 list
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

      export TORCHCHAT_ROOT=${PWD}
      pushd /tmp
      wget https://dl.google.com/android/repository/android-ndk-r26c-linux.zip
      # -q: skip the per-file listing of the NDK archive in the job log.
      unzip -q android-ndk-r26c-linux.zip
      popd
      export ANDROID_NDK=/tmp/android-ndk-r26c

      # Pull submodules (re2, abseil) for Tiktoken
      git submodule sync
      git submodule update --init

      ./runner/build_android.sh
      echo "Tests complete."
# Builds the torchao ops plus the ET and AOTI runners linked against them,
# then runs stories110M with low-bit embedding/linear quantization through
# eager, compiled, ET-exported and AOTI-exported paths.
test-torchao-experimental:
  strategy:
    matrix:
      runner: [macos-14-xlarge]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
      with:
        submodules: true
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        # Quoted so the version is always read as a string.
        python-version: '3.10.11'
    - name: Setup Xcode
      if: runner.os == 'macOS'
      uses: maxim-lobanov/setup-xcode@v1
      with:
        xcode-version: '15.3'
    - name: Print machine info
      run: |
        uname -a
        if [ "$(uname -s)" = "Darwin" ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install torchchat
      run: |
        echo "Installing pip3 packages"
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Install torchao-ops
      id: install-torchao-ops
      run: |
        bash torchchat/utils/scripts/build_torchao_ops.sh
    - name: Install ET
      run: |
        echo "Installing ExecuTorch"
        export TORCHCHAT_ROOT=${PWD}
        bash torchchat/utils/scripts/install_et.sh
    - name: Install runner
      run: |
        echo "Installing runner"
        bash torchchat/utils/scripts/build_native.sh et link_torchao_ops
    - name: Install runner AOTI
      id: install-runner-aoti
      run: |
        bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops
    - name: Run inference
      run: |
        python torchchat.py download stories110M
        wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model

        export PRMT="Once upon a time in a land far away"
        # One quantization config shared by every quantized command below,
        # so the four invocations cannot drift apart.
        QUANT_CONFIG='{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'

        echo "Generate eager"
        python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize "${QUANT_CONFIG}"

        echo "Generate compile"
        python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize "${QUANT_CONFIG}" --compile

        echo "Export and run ET (C++ runner)"
        python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize "${QUANT_CONFIG}"
        ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"

        echo "Export and run AOTI (C++ runner)"
        python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize "${QUANT_CONFIG}"
        ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}"

        echo "Generate AOTI"
        python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}"

        echo "Tests complete."