Skip to content

Integrate distributed inference with chat/server #3392

Integrate distributed inference with chat/server

Integrate distributed inference with chat/server #3392

Workflow file for this run

# Workflow triggers: every pull request, pushes to main, and manual dispatch.
name: pull
on:
  pull_request:
  push:
    branches:
      - main
  workflow_dispatch:
# All jobs of this workflow are declared under this key.
jobs:
# Runs gather_test_models.py for the "cpu" backend and exposes the result as the
# `models` job output, which the CPU test jobs feed into their build matrix.
gather-models-cpu:
  runs-on: ubuntu-22.04
  outputs:
    # Read from the step with id `gather-models-cpu`; presumably the script
    # writes it to the step outputs (not visible here).
    models: ${{ steps.gather-models-cpu.outputs.models }}
  steps:
    - uses: actions/checkout@v3
      with:
        submodules: "false"
    - uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - id: gather-models-cpu
      name: Extract the list of models to run on CPU
      run: |
        set -eux
        PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "cpu"
# For each entry of the gathered CPU matrix: download and convert the checkpoint,
# then run validate.sh with device "cpu" and mode "compile".
test-cpu-compile:
  needs: gather-models-cpu
  name: test-cpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
  env:
    REPO_NAME: ${{ matrix.repo_name }}
    TORCHCHAT_ROOT: ${{ github.workspace }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - name: Print machine info
      run: |
        echo "$(uname -a)"
    - name: Install dependencies
      run: |
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Download checkpoints
      run: |
        bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
    - name: Run validation
      run: |
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        pushd ${TORCHCHAT_ROOT}
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "compile"
# For each entry of the gathered CPU matrix: download and convert the checkpoint,
# then run validate.sh with device "cpu" and mode "aoti".
test-cpu-aoti:
  needs: gather-models-cpu
  name: test-cpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
  env:
    REPO_NAME: ${{ matrix.repo_name }}
    TORCHCHAT_ROOT: ${{ github.workspace }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - name: Print machine info
      run: |
        echo "$(uname -a)"
    - name: Install dependencies
      run: |
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Download checkpoints
      run: |
        bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
    - name: Run validation
      run: |
        pushd ${TORCHCHAT_ROOT}
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "aoti"
# For each entry of the gathered CPU matrix: download and convert the checkpoint,
# then run validate.sh with device "cpu" and mode "eval_sanity_check-bfloat16".
test-cpu-eval-sanity-check:
  needs: gather-models-cpu
  name: test-cpu-eval-sanity-check (${{ matrix.platform }}, ${{ matrix.model_name }})
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
  env:
    REPO_NAME: ${{ matrix.repo_name }}
    TORCHCHAT_ROOT: ${{ github.workspace }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - name: Print machine info
      run: |
        echo "$(uname -a)"
    - name: Install dependencies
      run: |
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Download checkpoints
      run: |
        bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
    - name: Run validation
      run: |
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        pushd ${TORCHCHAT_ROOT}
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-bfloat16"
# For each entry of the gathered CPU matrix: download and convert the checkpoint,
# then run validate.sh with device "cpu" and mode "eval_sanity_check-float16".
test-cpu-eval-sanity-check-float16:
  needs: gather-models-cpu
  name: test-cpu-eval-sanity-check-float16 (${{ matrix.platform }}, ${{ matrix.model_name }})
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
  env:
    REPO_NAME: ${{ matrix.repo_name }}
    TORCHCHAT_ROOT: ${{ github.workspace }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - name: Print machine info
      run: |
        echo "$(uname -a)"
    - name: Install dependencies
      run: |
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Download checkpoints
      run: |
        bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
    - name: Run validation
      run: |
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        pushd ${TORCHCHAT_ROOT}
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-float16"
# For each entry of the gathered CPU matrix: download and convert the checkpoint,
# then run validate.sh with device "cpu" and mode "eval_sanity_check-float32".
test-cpu-eval-sanity-check-float32:
  needs: gather-models-cpu
  name: test-cpu-eval-sanity-check-float32 (${{ matrix.platform }}, ${{ matrix.model_name }})
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
  env:
    REPO_NAME: ${{ matrix.repo_name }}
    TORCHCHAT_ROOT: ${{ github.workspace }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - name: Print machine info
      run: |
        echo "$(uname -a)"
    - name: Install dependencies
      run: |
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Download checkpoints
      run: |
        bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
    - name: Run validation
      run: |
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        pushd ${TORCHCHAT_ROOT}
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-float32"
# Runs gather_test_models.py for the "gpu" backend and exposes the result as the
# `models` job output, which the GPU test jobs feed into their build matrix.
gather-models-gpu:
  runs-on: ubuntu-22.04
  outputs:
    # Read from the step with id `gather-models-gpu`; presumably the script
    # writes it to the step outputs (not visible here).
    models: ${{ steps.gather-models-gpu.outputs.models }}
  steps:
    - uses: actions/checkout@v3
      with:
        submodules: "false"
    - uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - id: gather-models-gpu
      name: Extract the list of models to run on GPU
      run: |
        set -eux
        PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "gpu"
# Calls the shared pytorch/test-infra Linux job once per entry of the gathered GPU
# matrix. The script installs the CUDA requirements, downloads and converts the
# checkpoint, then runs validate.sh with device "cuda" and mode "compile".
test-gpu-compile:
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  name: test-gpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-gpu
  strategy:
    matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
    fail-fast: false
  with:
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: "12.1"
    script: |
      echo "::group::Print machine info"
      nvidia-smi
      echo "::endgroup::"

      echo "::group::Install required packages"
      ./install/install_requirements.sh cuda
      pip3 list
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      echo "::endgroup::"

      echo "::group::Download checkpoint"
      export REPO_NAME=${{ matrix.repo_name }}
      # Quote the resource list so it reaches the script as a single argument,
      # matching how the CPU jobs invoke wget_checkpoint.sh.
      bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} "${{ matrix.resources }}"
      echo "::endgroup::"

      echo "::group::Convert checkpoint"
      bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
      echo "::endgroup::"

      echo "::group::Run inference"
      bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "compile"
      echo "::endgroup::"
# Calls the shared pytorch/test-infra Linux job once per entry of the gathered GPU
# matrix. The script installs a newer binutils, installs the CUDA requirements,
# downloads and converts the checkpoint, then runs validate.sh with device "cuda"
# and mode "aoti-bfloat16".
test-gpu-aoti-bfloat16:
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  name: test-gpu-aoti-bfloat16 (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-gpu
  strategy:
    matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
    fail-fast: false
  with:
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: "12.1"
    timeout: 60
    script: |
      echo "::group::Print machine info"
      nvidia-smi
      echo "::endgroup::"

      echo "::group::Install newer objcopy that supports --set-section-alignment"
      yum install -y devtoolset-10-binutils
      export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
      echo "::endgroup::"

      echo "::group::Install required packages"
      ./install/install_requirements.sh cuda
      pip3 list
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      echo "::endgroup::"

      echo "::group::Download checkpoint"
      export REPO_NAME=${{ matrix.repo_name }}
      # Quote the resource list so it reaches the script as a single argument,
      # matching how the CPU jobs invoke wget_checkpoint.sh.
      bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} "${{ matrix.resources }}"
      echo "::endgroup::"

      echo "::group::Convert checkpoint"
      bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
      echo "::endgroup::"

      echo "::group::Run inference"
      bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-bfloat16"
      echo "::endgroup::"
# Calls the shared pytorch/test-infra Linux job once per entry of the gathered GPU
# matrix. The script installs a newer binutils, installs the CUDA requirements,
# downloads and converts the checkpoint, runs validate.sh with device "cuda" and
# mode "aoti-float32", then runs generation with the CUDA quantization config.
test-gpu-aoti-float32:
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  name: test-gpu-aoti-float32 (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-gpu
  strategy:
    matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
    fail-fast: false
  with:
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: "12.1"
    script: |
      echo "::group::Print machine info"
      nvidia-smi
      echo "::endgroup::"

      echo "::group::Install newer objcopy that supports --set-section-alignment"
      yum install -y devtoolset-10-binutils
      export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
      echo "::endgroup::"

      echo "::group::Install required packages"
      ./install/install_requirements.sh cuda
      pip list
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      echo "::endgroup::"

      echo "::group::Download checkpoint"
      export REPO_NAME=${{ matrix.repo_name }}
      # Quote the resource list so it reaches the script as a single argument,
      # matching how the CPU jobs invoke wget_checkpoint.sh.
      bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} "${{ matrix.resources }}"
      echo "::endgroup::"

      echo "::group::Convert checkpoint"
      bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
      echo "::endgroup::"

      echo "::group::Run inference"
      bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-float32"
      echo "::endgroup::"

      echo "::group::Run inference with quantize file"
      if [ $(uname -s) != Darwin ]; then
        python3 torchchat.py generate --quantize torchchat/quant_config/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
      fi
      echo "::endgroup::"
# Calls the shared pytorch/test-infra Linux job once per entry of the gathered GPU
# matrix. The script installs a newer binutils, installs the CUDA requirements,
# downloads and converts the checkpoint, then runs validate.sh with device "cuda"
# and mode "aoti-float16".
#
# NOTE(review): the final "quantize file" section is guarded by `== Darwin`, but the
# job requests a linux.* runner, so that section looks like it never executes
# (the float32 job uses `!= Darwin`) — confirm whether it is disabled on purpose.
test-gpu-aoti-float16:
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  name: test-gpu-aoti-float16 (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-gpu
  strategy:
    matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
    fail-fast: false
  with:
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: "12.1"
    script: |
      echo "::group::Print machine info"
      nvidia-smi
      echo "::endgroup::"

      echo "::group::Install newer objcopy that supports --set-section-alignment"
      yum install -y devtoolset-10-binutils
      export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
      echo "::endgroup::"

      echo "::group::Install required packages"
      ./install/install_requirements.sh cuda
      pip list
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      echo "::endgroup::"

      echo "::group::Download checkpoint"
      export REPO_NAME=${{ matrix.repo_name }}
      # Quote the resource list so it reaches the script as a single argument,
      # matching how the CPU jobs invoke wget_checkpoint.sh.
      bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} "${{ matrix.resources }}"
      echo "::endgroup::"

      echo "::group::Convert checkpoint"
      bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
      echo "::endgroup::"

      echo "::group::Run inference"
      bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-float16"
      echo "::endgroup::"

      echo "::group::Run inference with quantize file"
      if [ $(uname -s) == Darwin ]; then
        python3 torchchat.py export --output-aoti-package-path /tmp/model.pt2 --quantize torchchat/quant_config/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
        # A stray "~" used to follow the closing quote here, turning the path into "model.pth~".
        python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
      fi
      echo "::endgroup::"
# Calls the shared pytorch/test-infra Linux job once per entry of the gathered GPU
# matrix. The script installs a newer binutils, installs the CUDA requirements,
# downloads and converts the checkpoint, then runs validate.sh with device "cuda"
# and mode "eval_sanity_check".
test-gpu-eval-sanity-check:
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  name: test-gpu-eval-sanity-check (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-gpu
  strategy:
    matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
    fail-fast: false
  with:
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: "12.1"
    script: |
      echo "::group::Print machine info"
      nvidia-smi
      echo "::endgroup::"

      echo "::group::Install newer objcopy that supports --set-section-alignment"
      yum install -y devtoolset-10-binutils
      export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
      echo "::endgroup::"

      echo "::group::Install required packages"
      ./install/install_requirements.sh cuda
      pip3 list
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      echo "::endgroup::"

      echo "::group::Download checkpoint"
      export REPO_NAME=${{ matrix.repo_name }}
      # Quote the resource list so it reaches the script as a single argument,
      # matching how the CPU jobs invoke wget_checkpoint.sh.
      bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} "${{ matrix.resources }}"
      echo "::endgroup::"

      echo "::group::Convert checkpoint"
      bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
      echo "::endgroup::"

      echo "::group::Run eval"
      bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "eval_sanity_check"
      echo "::endgroup::"
# Exports the stories15M checkpoint and a TinyLlama GGUF file to ExecuTorch .pte
# programs under several --quant configurations and runs generation on each
# export, on both an Ubuntu and a macOS runner.
test-tinystories-executorch:
  strategy:
    matrix:
      runner: [16-core-ubuntu, macos-14-xlarge]
  runs-on: ${{ matrix.runner }}
  steps:
    # Action versions match the ones used by the CPU jobs in this workflow.
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - name: Setup Xcode
      if: runner.os == 'macOS'
      uses: maxim-lobanov/setup-xcode@v1
      with:
        xcode-version: "15.3"
    - name: Print machine info
      run: |
        uname -a
        if [ $(uname -s) == Darwin ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install requirements
      run: |
        echo "Installing pip3 packages"
        ./install/install_requirements.sh
        export TORCHCHAT_ROOT=$PWD
        ./torchchat/utils/scripts/install_et.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, torchvision.version.git_version}")'
        cd ../..
        echo "Inside: ${PWD}"
    - name: Download checkpoints
      run: |
        mkdir -p checkpoints/stories15M
        pushd checkpoints/stories15M
        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
        popd

        mkdir gguf_files
        export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
        export GGUF_TOKENIZER_PATH=gguf_files/tokenizer.model
        wget -O ${GGUF_PATH} "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
        wget -O ${GGUF_TOKENIZER_PATH} https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
    - name: Run inference
      run: |
        export MODEL_PATH=${PWD}/checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M

        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --device cpu

        python torchchat.py export --checkpoint-path ${MODEL_PATH} --output-pte-path ${PWD}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${PWD}/${MODEL_NAME}.pte

        echo "Tests complete."
    # Renamed from a second "Run inference" so the two steps can be told apart in logs.
    # NOTE(review): the two "Emb 4bit" sections pass bitwidth 8, exactly like the
    # 8-bit embedding sections above them — confirm whether bitwidth 4 was intended.
    - name: Run inference with quantized exports
      run: |
        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M
        export MODEL_DIR=/tmp

        echo "******************************************"
        echo "*** vanilla ***"
        echo "******************************************"
        python torchchat.py export --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "*** --quantize torchchat/quant_config/mobile.json ***"
        echo "******************************************"
        # python torchchat.py export --quantize torchchat/quant_config/mobile.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "******* Emb: channel-wise quantized ******"
        echo "******************************************"
        python torchchat.py export --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "******** Emb: group-wise quantized *******"
        echo "******************************************"
        python torchchat.py export --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "**** Emb 4bit: channel-wise quantized ****"
        echo "******************************************"
        python torchchat.py export --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "****** Emb 4bit: group-wise quantized ****"
        echo "******************************************"
        python torchchat.py export --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "******* INT8 channel-wise quantized ******"
        echo "******************************************"
        python torchchat.py export --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "******** INT8 group-wise quantized *******"
        echo "******************************************"
        python torchchat.py export --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "******** ET: a8w4dq INT4 group-wise quantized *******"
        echo "******************************************"
        python torchchat.py export --quant '{"linear:a8w4dq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "******************************************"
        echo "******** INT4 group-wise quantized *******"
        echo "******************************************"
        # python torchchat.py export --quant '{"linear:int4" : {"groupsize": 32}}' --tokenizer-path ${TOKENIZER_PATH} --gguf-path ${GGUF_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        # python3 torchchat.py generate --tokenizer-path ${TOKENIZER_PATH} --gguf-path ${GGUF_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

        echo "tests complete"
        echo "******************************************"
    - name: Run GGUF export + inference
      run: |
        export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
        export GGUF_TOKENIZER_PATH=gguf_files/tokenizer.model
        # Variables exported in earlier steps do not carry over to this shell, so
        # MODEL_NAME must be set here; without it the export wrote to "${PWD}/.pte".
        export MODEL_NAME=TinyLlama-1.1B-openorca.Q4_0.gguf

        python torchchat.py export --gguf-path ${GGUF_PATH} --output-pte-path ${PWD}/${MODEL_NAME}.pte
        python torchchat.py generate --gguf-path ${GGUF_PATH} --pte-path ${PWD}/${MODEL_NAME}.pte --tokenizer-path ${GGUF_TOKENIZER_PATH} --temperature 0 --max-new-tokens 20

        echo "Tests complete."
# Smoke-tests the torchchat CLI on macOS: generation from a locally downloaded
# stories15M checkpoint, then the list / download / generate / remove subcommands.
torchchat-command-load-test:
  strategy:
    matrix:
      runner: [macos-14]
  runs-on: ${{ matrix.runner }}
  steps:
    # Action versions match the ones used by the CPU jobs in this workflow.
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - name: Print machine info
      run: |
        uname -a
        if [ $(uname -s) == Darwin ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install requirements
      run: |
        echo "Installing pip3 packages"
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Download Stories files
      run: |
        mkdir -p checkpoints/stories15M
        pushd checkpoints/stories15M
        curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
        curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
        popd
    - name: Test generate
      run: |
        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M
        export MODEL_DIR=/tmp

        # The same command is run through both the `python3` and `python` launchers.
        python3 torchchat.py generate --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0
        python torchchat.py generate --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0

        echo "Tests complete."
    - name: Test download
      run: |
        python torchchat.py list
        python torchchat.py download stories15m
        python torchchat.py generate stories15M --device cpu
        python torchchat.py remove stories15m
# Runs stories15M generation on the MPS device, unquantized and with the
# embedding, linear:int8 and linear:int4 --quant configurations.
test-mps:
  strategy:
    matrix:
      runner: [macos-m1-stable]
  runs-on: ${{ matrix.runner }}
  steps:
    # Action versions match the ones used by the CPU jobs in this workflow.
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - name: Print machine info
      run: |
        uname -a
        if [ $(uname -s) == Darwin ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Run test
      run: |
        export PYTHON_VERSION="3.10"
        set -x
        # NS/MC: Remove previous installation of torch and torchao first
        # as this script does not install anything into conda env but rather as system dep
        pip3 uninstall -y torch || true
        pip3 uninstall -y torchao || true
        # Strict mode is enabled once, after the best-effort uninstalls above.
        set -euo pipefail

        echo "::group::Print machine info"
        uname -a
        sysctl machdep.cpu.brand_string
        sysctl machdep.cpu.core_count
        echo "::endgroup::"

        echo "::group::Install requirements"
        # Install requirements
        ./install/install_requirements.sh
        ls -la
        pwd
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        echo "::endgroup::"

        echo "::group::Download checkpoints"
        (
          mkdir -p checkpoints/stories15M
          pushd checkpoints/stories15M
          curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
          curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
          popd
        )
        echo "::endgroup::"

        echo "::group::Run inference"
        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M
        export MODEL_DIR=/tmp

        python3 torchchat.py generate --device mps --checkpoint-path ${MODEL_PATH} --temperature 0

        echo "************************************************************"
        echo "*** embedding"
        echo "************************************************************"
        python3 torchchat.py generate --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
        python3 torchchat.py generate --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0

        echo "************************************************************"
        echo "*** linear int8"
        echo "************************************************************"
        python3 torchchat.py generate --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
        python3 torchchat.py generate --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0

        echo "************************************************************"
        echo "*** linear int4"
        echo "************************************************************"
        PYTORCH_ENABLE_MPS_FALLBACK=1 python3 torchchat.py generate --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0
# Builds llama.cpp, requantizes a Q4_0 TinyLlama GGUF file to F32 with
# llama-quantize, and runs torchchat's test_by_to_float helper on the two files.
test-gguf-util:
  strategy:
    matrix:
      runner: [macos-14]
  runs-on: ${{ matrix.runner }}
  steps:
    # Action versions match the ones used by the CPU jobs in this workflow.
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - name: Print machine info
      run: |
        uname -a
        if [ $(uname -s) == Darwin ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install requirements
      run: |
        echo "Installing pip3 packages"
        pip3 install gguf
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

        git clone https://github.com/ggerganov/llama.cpp.git
        pushd llama.cpp
        make
        popd
    - name: Download GGUF files
      run: |
        mkdir gguf_files
        wget -O gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
        ./llama.cpp/llama-quantize --allow-requantize gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf gguf_files/TinyLlama-1.1B-openorca.Q4_0.requant_F32.gguf F32
    - name: Load files
      run: |
        # Generate a two-line driver script that calls the loader test on both files.
        touch test.py
        echo "from torchchat.utils.gguf_loader import test_by_to_float" >> test.py
        echo "test_by_to_float(\"gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf\", \"gguf_files/TinyLlama-1.1B-openorca.Q4_0.requant_F32.gguf\")" >> test.py
        cat test.py
        python test.py

        echo "Tests complete."
# Runs stories15M generation on the MPS device for each of float16 and float32,
# unquantized and with the embedding, linear:int8 and linear:int4 --quant configs.
test-mps-dtype:
  strategy:
    matrix:
      runner: [macos-m1-stable]
  runs-on: ${{ matrix.runner }}
  steps:
    # Action versions match the ones used by the CPU jobs in this workflow.
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - name: Print machine info
      run: |
        uname -a
        if [ $(uname -s) == Darwin ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Run test
      run: |
        export PYTHON_VERSION="3.10"
        set -x
        # NS/MC: Remove previous installation of torch and torchao first
        # as this script does not install anything into conda env but rather as system dep
        pip3 uninstall -y torch || true
        pip3 uninstall -y torchao || true
        # Strict mode is enabled once, after the best-effort uninstalls above.
        set -euo pipefail

        echo "::group::Print machine info"
        uname -a
        sysctl machdep.cpu.brand_string
        sysctl machdep.cpu.core_count
        echo "::endgroup::"

        echo "::group::Install requirements"
        # Install requirements
        ./install/install_requirements.sh
        ls -la
        pwd
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        echo "::endgroup::"

        echo "::group::Download checkpoints"
        (
          mkdir -p checkpoints/stories15M
          pushd checkpoints/stories15M
          curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
          curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
          popd
        )
        echo "::endgroup::"

        echo "::group::Run inference"
        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M
        export MODEL_DIR=/tmp
        for DTYPE in float16 float32; do
          # if [ $(uname -s) == Darwin ]; then
          #   export DTYPE=float16
          # fi
          python3 torchchat.py generate --dtype ${DTYPE} --device mps --checkpoint-path ${MODEL_PATH} --temperature 0
          python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
          python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
          python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
          python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0
          PYTORCH_ENABLE_MPS_FALLBACK=1 python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0
        done
# Runs CPU generation from a TinyLlama GGUF file, eager and with --compile,
# unquantized and with channel-wise / group-wise 8-bit embedding quantization.
compile-gguf:
  strategy:
    matrix:
      runner: [macos-14]
  runs-on: ${{ matrix.runner }}
  steps:
    # Action versions match the ones used by the CPU jobs in this workflow.
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - name: Print machine info
      run: |
        uname -a
        if [ $(uname -s) == Darwin ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install requirements
      run: |
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Download GGUF
      run: |
        mkdir gguf_files
        export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
        export TOKENIZER_PATH=gguf_files/tokenizer.model
        wget -O ${GGUF_PATH} "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
        wget -O ${TOKENIZER_PATH} https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
    - name: Run inference
      run: |
        # Paths are re-exported because each step runs in a fresh shell.
        export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
        export TOKENIZER_PATH=gguf_files/tokenizer.model
        export MODEL_NAME=TinyLlama-1.1B-openorca.Q4_0.gguf
        export MODEL_DIR=/tmp

        echo "******************************************"
        echo "******* Embed: not quantized *************"
        echo "******************************************"
        echo "Running eager"
        python3 torchchat.py generate --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --device cpu
        echo "Running compiled"
        python3 torchchat.py generate --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --compile --device cpu

        echo "******************************************"
        echo "******* Emb: channel-wise quantized ******"
        echo "******************************************"
        echo "Running eager"
        python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --device cpu
        echo "Running compiled"
        python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --compile --device cpu

        echo "******************************************"
        echo "******** Emb: group-wise quantized *******"
        echo "******************************************"
        echo "Running eager"
        python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --device cpu
        echo "Running compiled"
        python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --compile --device cpu

        echo "tests complete"
        echo "******************************************"
# Restores or builds ExecuTorch (cached under a key derived from the pinned ET
# hash and install_et.sh), builds the native `et` runner, then exports stories15M
# to .pte for several dtypes and runs each export through ./cmake-out/et_run.
runner-et:
  strategy:
    matrix:
      runner: [16-core-ubuntu, macos-14-xlarge]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
      with:
        submodules: true
    # setup-python version matches the one used by the CPU jobs in this workflow.
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - name: Setup Xcode
      if: runner.os == 'macOS'
      uses: maxim-lobanov/setup-xcode@v1
      with:
        xcode-version: "15.3"
    - name: Print machine info
      run: |
        uname -a
        if [ $(uname -s) == Darwin ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install torchchat
      run: |
        echo "Installing pip3 packages"
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Set ET git sha
      id: setup-hash
      run: |
        export TORCHCHAT_ROOT=${PWD}
        # Published through GITHUB_ENV so the cache key below can read env.et-git-hash.
        echo "et-git-hash=$(cat ${TORCHCHAT_ROOT}/install/.pins/et-pin.txt)" >> "$GITHUB_ENV"
    - name: Load or install ET
      id: install-et
      uses: actions/cache@v4
      with:
        path: |
          ./et-build
          ./torchchat/utils/scripts
        key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }}
    # Only runs on a cache miss; a failure here does not fail the job
    # (continue-on-error), so later steps are still attempted.
    - name: Install ExecuTorch
      if: ${{ steps.install-et.outputs.cache-hit != 'true' }}
      continue-on-error: true
      run: |
        echo "Installing ExecuTorch"
        bash torchchat/utils/scripts/install_et.sh
    - name: Install ExecuTorch python
      run: |
        echo "Install ExecuTorch python"
        export TORCHCHAT_ROOT=$PWD
        export ET_BUILD_DIR="et-build"
        ENABLE_ET_PYBIND="${1:-true}"
        source "torchchat/utils/scripts/install_utils.sh"
        install_executorch_python_libs $ENABLE_ET_PYBIND
    - name: Install runner
      run: |
        echo "Installing runner"
        bash torchchat/utils/scripts/build_native.sh et
    - name: Run inference
      run: |
        python torchchat.py download stories15M
        wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model

        export PRMT="Once upon a time in a land far away"

        python torchchat.py generate stories15M --temperature 0 --prompt "${PRMT}" --device cpu

        python torchchat.py export stories15M --output-pte-path ./model.pte
        ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"

        for dtype in fp32 fp16 bf16; do
          echo "Testing export + runner with dtype=$dtype"
          python torchchat.py export stories15M --dtype $dtype --output-pte-path ./model.pte
          ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
        done

        echo "Tests complete."
# AOT Inductor (AOTI) native-runner smoke test: builds the native AOTI runner,
# then exports the stories15M checkpoint at several dtypes and runs each
# exported package with ./cmake-out/aoti_run.
runner-aoti:
  env:
    TORCHCHAT_ROOT: ${{ github.workspace }}
  strategy:
    matrix:
      runner:
        - 16-core-ubuntu
        - macos-14-xlarge
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
      with:
        submodules: true
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.11"
    - name: Print machine info
      run: |
        echo "$(uname -a)"
    - name: Install dependencies
      run: |
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        # Build the native AOTI runner used by the inference step.
        bash torchchat/utils/scripts/build_native.sh aoti
    - name: Download checkpoint
      run: |
        # Fetch the model weights and tokenizer straight into the checkpoint dir.
        mkdir -p checkpoints/stories15M
        wget -P checkpoints/stories15M https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
        wget -P checkpoints/stories15M https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
    - name: Run inference
      run: |
        set -euo pipefail
        export MODEL_DIR=${PWD}/checkpoints/stories15M
        export PROMPT="Once upon a time in a land far away"
        # Script-local shorthands for the paths reused below.
        checkpoint=${MODEL_DIR}/stories15M.pt
        tokenizer=${MODEL_DIR}/tokenizer.model
        package=/tmp/model.pt2
        # Eager generation first.
        python torchchat.py generate --checkpoint-path ${checkpoint} --temperature 0 --prompt "${PROMPT}" --device cpu
        # Then export + native run once per dtype.
        for precision in fp32 fp16 bf16 fast fast16; do
          echo "Running export + runner with dtype=${precision}"
          python torchchat.py export --checkpoint-path ${checkpoint} --dtype ${precision} --output-aoti-package-path ${package}
          ./cmake-out/aoti_run ${package} -z ${tokenizer} -i "${PROMPT}"
        done
        echo "Tests complete."
# Runs runner/build_android.sh against Android NDK r26c through the shared
# pytorch/test-infra Linux job.
test-build-runner-et-android:
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  with:
    runner: linux.4xlarge
    script: |
      # This job only runs on a Linux runner, so no macOS-specific machine
      # info is collected here.
      uname -a
      ./install/install_requirements.sh
      pip3 list
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      export TORCHCHAT_ROOT=${PWD}
      # Download and unpack the NDK outside the checkout.
      pushd /tmp
      wget https://dl.google.com/android/repository/android-ndk-r26c-linux.zip
      unzip android-ndk-r26c-linux.zip
      popd
      export ANDROID_NDK=/tmp/android-ndk-r26c
      # Pull submodules (re2, abseil) for Tiktoken
      git submodule sync
      git submodule update --init
      ./runner/build_android.sh
      echo "Tests complete."
# Exercises the torchao experimental low-bit quantization path on macOS:
# eager, compiled, ET export + C++ runner, and AOTI export + C++ runner, all
# with the same quantization config on stories110M.
test-torchao-experimental:
  strategy:
    matrix:
      runner: [macos-14-xlarge]
  runs-on: ${{matrix.runner}}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
      with:
        submodules: true
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        # Quoted so the version is always read as a string.
        python-version: '3.10.11'
    - name: Setup Xcode
      if: runner.os == 'macOS'
      uses: maxim-lobanov/setup-xcode@v1
      with:
        xcode-version: '15.3'
    - name: Print machine info
      run: |
        uname -a
        if [ $(uname -s) == Darwin ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install torchchat
      run: |
        echo "Intalling pip3 packages"
        ./install/install_requirements.sh
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Install torchao-ops
      id: install-torchao-ops
      run: |
        bash torchchat/utils/scripts/build_torchao_ops.sh
    - name: Install ET
      run: |
        echo "Installing ExecuTorch"
        export TORCHCHAT_ROOT=${PWD}
        bash torchchat/utils/scripts/install_et.sh
    - name: Install runner
      run: |
        echo "Installing runner"
        bash torchchat/utils/scripts/build_native.sh et link_torchao_ops
    - name: Install runner AOTI
      id: install-runner-aoti
      run: |
        bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops
    - name: Run inference
      run: |
        python torchchat.py download stories110M
        wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
        export PRMT="Once upon a time in a land far away"
        # One quantization config shared by every quantized invocation below,
        # so the eager, compiled, ET and AOTI paths cannot drift apart.
        QUANT_CONFIG='{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
        echo "Generate eager"
        python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize "${QUANT_CONFIG}"
        echo "Generate compile"
        python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize "${QUANT_CONFIG}" --compile
        echo "Export and run ET (C++ runner)"
        python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize "${QUANT_CONFIG}"
        ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
        echo "Export and run AOTI (C++ runner)"
        python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize "${QUANT_CONFIG}"
        ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}"
        # Reuses the package exported just above; no quantize flag is passed here.
        echo "Generate AOTI"
        python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}"
        echo "Tests complete."