Integrate distributed inference with chat/server #3392

Workflow file for this run

	name: pull

	on:
	pull_request:
	push:
	branches:
	- main
	workflow_dispatch:

	jobs:
	gather-models-cpu:
	runs-on: ubuntu-22.04
	outputs:
	models: ${{ steps.gather-models-cpu.outputs.models }}
	steps:
	- uses: actions/checkout@v3
	with:
	submodules: 'false'
	- uses: actions/setup-python@v4
	with:
	python-version: '3.10.11'
	- name: Extract the list of models to run on CPU
	id: gather-models-cpu
	run: \|
	set -eux
	PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "cpu"
	test-cpu-compile:
	name: test-cpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-cpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
	fail-fast: false
	runs-on: ${{ matrix.runner }}
	env:
	TORCHCHAT_ROOT: ${{ github.workspace }}
	REPO_NAME: ${{ matrix.repo_name }}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v3
	- name: Setup Python
	uses: actions/setup-python@v4
	with:
	python-version: '3.10.11'
	- name: Print machine info
	run: \|
	echo "$(uname -a)"
	- name: Install dependencies
	run: \|
	./install/install_requirements.sh
	pip3 list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	- name: Download checkpoints
	run: \|
	bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
	- name: Run validation
	run: \|
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	pushd ${TORCHCHAT_ROOT}
	bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
	bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "compile"

	test-cpu-aoti:
	name: test-cpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-cpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
	fail-fast: false
	runs-on: ${{ matrix.runner }}
	env:
	TORCHCHAT_ROOT: ${{ github.workspace }}
	REPO_NAME: ${{ matrix.repo_name }}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v3
	- name: Setup Python
	uses: actions/setup-python@v4
	with:
	python-version: '3.10.11'
	- name: Print machine info
	run: \|
	echo "$(uname -a)"
	- name: Install dependencies
	run: \|
	./install/install_requirements.sh
	pip3 list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	- name: Download checkpoints
	run: \|
	bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
	- name: Run validation
	run: \|
	pushd ${TORCHCHAT_ROOT}
	bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
	bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "aoti"

	test-cpu-eval-sanity-check:
	name: test-cpu-eval-sanity-check (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-cpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
	fail-fast: false
	runs-on: ${{ matrix.runner }}
	env:
	TORCHCHAT_ROOT: ${{ github.workspace }}
	REPO_NAME: ${{ matrix.repo_name }}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v3
	- name: Setup Python
	uses: actions/setup-python@v4
	with:
	python-version: '3.10.11'
	- name: Print machine info
	run: \|
	echo "$(uname -a)"
	- name: Install dependencies
	run: \|
	./install/install_requirements.sh
	pip3 list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	- name: Download checkpoints
	run: \|
	bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
	- name: Run validation
	run: \|
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	pushd ${TORCHCHAT_ROOT}
	bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
	bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-bfloat16"

	test-cpu-eval-sanity-check-float16:
	name: test-cpu-eval-sanity-check-float16 (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-cpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
	fail-fast: false
	runs-on: ${{ matrix.runner }}
	env:
	TORCHCHAT_ROOT: ${{ github.workspace }}
	REPO_NAME: ${{ matrix.repo_name }}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v3
	- name: Setup Python
	uses: actions/setup-python@v4
	with:
	python-version: '3.10.11'
	- name: Print machine info
	run: \|
	echo "$(uname -a)"
	- name: Install dependencies
	run: \|
	./install/install_requirements.sh
	pip3 list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	- name: Download checkpoints
	run: \|
	bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
	- name: Run validation
	run: \|
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	pushd ${TORCHCHAT_ROOT}
	bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
	bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-float16"

	test-cpu-eval-sanity-check-float32:
	name: test-cpu-eval-sanity-check-float32 (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-cpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
	fail-fast: false
	runs-on: ${{ matrix.runner }}
	env:
	TORCHCHAT_ROOT: ${{ github.workspace }}
	REPO_NAME: ${{ matrix.repo_name }}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v3
	- name: Setup Python
	uses: actions/setup-python@v4
	with:
	python-version: '3.10.11'
	- name: Print machine info
	run: \|
	echo "$(uname -a)"
	- name: Install dependencies
	run: \|
	./install/install_requirements.sh
	pip3 list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	- name: Download checkpoints
	run: \|
	bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
	- name: Run validation
	run: \|
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	pushd ${TORCHCHAT_ROOT}
	bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
	bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-float32"

	gather-models-gpu:
	runs-on: ubuntu-22.04
	outputs:
	models: ${{ steps.gather-models-gpu.outputs.models }}
	steps:
	- uses: actions/checkout@v3
	with:
	submodules: 'false'
	- uses: actions/setup-python@v4
	with:
	python-version: '3.10.11'
	- name: Extract the list of models to run on GPU
	id: gather-models-gpu
	run: \|
	set -eux
	PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "gpu"
	test-gpu-compile:
	uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
	name: test-gpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-gpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
	fail-fast: false
	with:
	runner: linux.g5.4xlarge.nvidia.gpu
	gpu-arch-type: cuda
	gpu-arch-version: "12.1"
	script: \|
	echo "::group::Print machine info"
	nvidia-smi
	echo "::endgroup::"

	echo "::group::Install required packages"
	./install/install_requirements.sh cuda
	pip3 list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	echo "::endgroup::"

	echo "::group::Download checkpoint"
	export REPO_NAME=${{ matrix.repo_name }}
	bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
	echo "::endgroup::"

	echo "::group::Convert checkpoint"
	bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
	echo "::endgroup::"

	echo "::group::Run inference"
	bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "compile"
	echo "::endgroup::"

	test-gpu-aoti-bfloat16:
	uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
	name: test-gpu-aoti-bfloat16 (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-gpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
	fail-fast: false
	with:
	runner: linux.g5.4xlarge.nvidia.gpu
	gpu-arch-type: cuda
	gpu-arch-version: "12.1"
	timeout: 60
	script: \|
	echo "::group::Print machine info"
	nvidia-smi
	echo "::endgroup::"

	echo "::group::Install newer objcopy that supports --set-section-alignment"
	yum install -y devtoolset-10-binutils
	export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
	echo "::endgroup::"

	echo "::group::Install required packages"
	./install/install_requirements.sh cuda
	pip3 list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	echo "::endgroup::"

	echo "::group::Download checkpoint"
	export REPO_NAME=${{ matrix.repo_name }}
	bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
	echo "::endgroup::"

	echo "::group::Convert checkpoint"
	bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
	echo "::endgroup::"

	echo "::group::Run inference"
	bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-bfloat16"
	echo "::endgroup::"

	test-gpu-aoti-float32:
	uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
	name: test-gpu-aoti-float32 (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-gpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
	fail-fast: false
	with:
	runner: linux.g5.4xlarge.nvidia.gpu
	gpu-arch-type: cuda
	gpu-arch-version: "12.1"
	script: \|
	echo "::group::Print machine info"
	nvidia-smi
	echo "::endgroup::"

	echo "::group::Install newer objcopy that supports --set-section-alignment"
	yum install -y devtoolset-10-binutils
	export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
	echo "::endgroup::"

	echo "::group::Install required packages"
	./install/install_requirements.sh cuda
	pip list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	echo "::endgroup::"

	echo "::group::Download checkpoint"
	export REPO_NAME=${{ matrix.repo_name }}
	bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
	echo "::endgroup::"

	echo "::group::Convert checkpoint"
	bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
	echo "::endgroup::"

	echo "::group::Run inference"
	bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-float32"
	echo "::endgroup::"

	echo "::group::Run inference with quantize file"
	if [ $(uname -s) != Darwin ]; then
	python3 torchchat.py generate --quantize torchchat/quant_config/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
	fi
	echo "::endgroup::"

	test-gpu-aoti-float16:
	uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
	name: test-gpu-aoti-float16 (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-gpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
	fail-fast: false
	with:
	runner: linux.g5.4xlarge.nvidia.gpu
	gpu-arch-type: cuda
	gpu-arch-version: "12.1"
	script: \|
	echo "::group::Print machine info"
	nvidia-smi
	echo "::endgroup::"

	echo "::group::Install newer objcopy that supports --set-section-alignment"
	yum install -y devtoolset-10-binutils
	export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
	echo "::endgroup::"

	echo "::group::Install required packages"
	./install/install_requirements.sh cuda
	pip list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	echo "::endgroup::"

	echo "::group::Download checkpoint"
	export REPO_NAME=${{ matrix.repo_name }}
	bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
	echo "::endgroup::"

	echo "::group::Convert checkpoint"
	bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
	echo "::endgroup::"

	echo "::group::Run inference"
	bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-float16"
	echo "::endgroup::"

	echo "::group::Run inference with quantize file"
	if [ $(uname -s) == Darwin ]; then
	python3 torchchat.py export --output-aoti-package-path /tmp/model.pt2 --quantize torchchat/quant_config/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
	python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"~
	fi
	echo "::endgroup::"

	test-gpu-eval-sanity-check:
	uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
	name: test-gpu-eval-sanity-check (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-gpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
	fail-fast: false
	with:
	runner: linux.g5.4xlarge.nvidia.gpu
	gpu-arch-type: cuda
	gpu-arch-version: "12.1"
	script: \|
	echo "::group::Print machine info"
	nvidia-smi
	echo "::endgroup::"

	echo "::group::Install newer objcopy that supports --set-section-alignment"
	yum install -y devtoolset-10-binutils
	export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
	echo "::endgroup::"

	echo "::group::Install required packages"
	./install/install_requirements.sh cuda
	pip3 list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	echo "::endgroup::"

	echo "::group::Download checkpoint"
	export REPO_NAME=${{ matrix.repo_name }}
	bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
	echo "::endgroup::"

	echo "::group::Convert checkpoint"
	bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
	echo "::endgroup::"

	echo "::group::Run eval"
	bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "eval_sanity_check"
	echo "::endgroup::"

	test-tinystories-executorch:
	strategy:
	matrix:
	runner: [16-core-ubuntu, macos-14-xlarge]
	runs-on: ${{matrix.runner}}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v2
	- name: Setup Python
	uses: actions/setup-python@v2
	with:
	python-version: '3.10.11'
	- name: Setup Xcode
	if: runner.os == 'macOS'
	uses: maxim-lobanov/setup-xcode@v1
	with:
	xcode-version: '15.3'
	- name: Print machine info
	run: \|
	uname -a
	if [ $(uname -s) == Darwin ]; then
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	fi
	- name: Install requirements
	run: \|
	echo "Intalling pip3 packages"
	./install/install_requirements.sh

	export TORCHCHAT_ROOT=$PWD
	./torchchat/utils/scripts/install_et.sh

	pip3 list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, torchvision.version.git_version}")'

	cd ../..
	echo "Inside: ${PWD}"
	- name: Download checkpoints
	run: \|
	mkdir -p checkpoints/stories15M
	pushd checkpoints/stories15M
	wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
	wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	popd

	mkdir gguf_files
	export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
	export GGUF_TOKENIZER_PATH=gguf_files/tokenizer.model
	wget -O ${GGUF_PATH} "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
	wget -O ${GGUF_TOKENIZER_PATH} https://github.com/karpathy/llama2.c/raw/master/tokenizer.model

	- name: Run inference
	run: \|
	export MODEL_PATH=${PWD}/checkpoints/stories15M/stories15M.pt
	export MODEL_NAME=stories15M

	python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --device cpu

	python torchchat.py export --checkpoint-path ${MODEL_PATH} --output-pte-path ${PWD}/${MODEL_NAME}.pte
	python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${PWD}/${MODEL_NAME}.pte

	echo "Tests complete."

	- name: Run inference
	run: \|
	export MODEL_PATH=checkpoints/stories15M/stories15M.pt
	export MODEL_NAME=stories15M
	export MODEL_DIR=/tmp

	echo "******************************************"
	echo "* vanilla *"
	echo "******************************************"
	python torchchat.py export --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

	echo "******************************************"
	echo "* --quantize torchchat/quant_config/mobile.json *"
	echo "******************************************"
	# python torchchat.py export --quantize torchchat/quant_config/mobile.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	# python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte


	echo "******************************************"
	echo "***** Emb: channel-wise quantized ****"
	echo "******************************************"
	python torchchat.py export --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

	echo "******************************************"
	echo "****** Emb: group-wise quantized *****"
	echo "******************************************"
	python torchchat.py export --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

	echo "******************************************"
	echo "** Emb 4bit: channel-wise quantized **"
	echo "******************************************"
	python torchchat.py export --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

	echo "******************************************"
	echo "**** Emb 4bit: group-wise quantized **"
	echo "******************************************"
	python torchchat.py export --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

	echo "******************************************"
	echo "***** INT8 channel-wise quantized ****"
	echo "******************************************"
	python torchchat.py export --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

	echo "******************************************"
	echo "****** INT8 group-wise quantized *****"
	echo "******************************************"
	python torchchat.py export --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

	echo "******************************************"
	echo "****** ET: a8w4dq INT4 group-wise quantized *****"
	echo "******************************************"
	python torchchat.py export --quant '{"linear:a8w4dq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

	echo "******************************************"
	echo "****** INT4 group-wise quantized *****"
	echo "******************************************"
	# python torchchat.py export --quant '{"linear:int4" : {"groupsize": 32}}' --tokenizer-path ${TOKENIZER_PATH} --gguf-path ${GGUF_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	# python3 torchchat.py generate --tokenizer-path ${TOKENIZER_PATH} --gguf-path ${GGUF_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

	echo "tests complete"
	echo "******************************************"

	- name: Run GGUF export + inference
	run: \|
	export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
	export GGUF_TOKENIZER_PATH=gguf_files/tokenizer.model

	python torchchat.py export --gguf-path ${GGUF_PATH} --output-pte-path ${PWD}/${MODEL_NAME}.pte
	python torchchat.py generate --gguf-path ${GGUF_PATH} --pte-path ${PWD}/${MODEL_NAME}.pte --tokenizer-path ${GGUF_TOKENIZER_PATH} --temperature 0 --max-new-tokens 20

	echo "Tests complete."
	torchchat-command-load-test:
	strategy:
	matrix:
	runner: [macos-14]
	runs-on: ${{matrix.runner}}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v2
	- name: Setup Python
	uses: actions/setup-python@v2
	with:
	python-version: '3.10.11'
	- name: Print machine info
	run: \|
	uname -a
	if [ $(uname -s) == Darwin ]; then
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	fi
	- name: Install requirements
	run: \|
	echo "Installing pip3 packages"
	./install/install_requirements.sh
	pip3 list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

	- name: Download Stories files
	run: \|

	mkdir -p checkpoints/stories15M
	pushd checkpoints/stories15M
	curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
	curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	popd

	- name: Test generate
	run: \|

	export MODEL_PATH=checkpoints/stories15M/stories15M.pt
	export MODEL_NAME=stories15M
	export MODEL_DIR=/tmp

	python3 torchchat.py generate --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0
	python torchchat.py generate --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0
	echo "Tests complete."

	- name: Test download
	run: \|

	python torchchat.py list
	python torchchat.py download stories15m
	python torchchat.py generate stories15M --device cpu
	python torchchat.py remove stories15m

	test-mps:
	strategy:
	matrix:
	runner: [macos-m1-stable ]
	runs-on: ${{matrix.runner}}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v2
	- name: Setup Python
	uses: actions/setup-python@v2
	with:
	python-version: 3.10.11
	- name: Print machine info
	run: \|
	uname -a
	if [ $(uname -s) == Darwin ]; then
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	fi
	- name: Run test
	run: \|
	export PYTHON_VERSION="3.10"
	set -x
	# NS/MC: Remove previous installation of torch and torchao first
	# as this script does not install anything into conda env but rather as system dep
	pip3 uninstall -y torch \|\| true
	set -eou pipefail

	pip3 uninstall -y torchao \|\| true
	set -eou pipefail

	echo "::group::Print machine info"
	uname -a
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	echo "::endgroup::"

	echo "::group::Install requirements"
	# Install requirements
	./install/install_requirements.sh
	ls -la
	pwd
	pip3 list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	echo "::endgroup::"

	echo "::group::Download checkpoints"
	(
	mkdir -p checkpoints/stories15M
	pushd checkpoints/stories15M
	curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
	curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	popd
	)
	echo "::endgroup::"

	echo "::group::Run inference"
	export MODEL_PATH=checkpoints/stories15M/stories15M.pt
	export MODEL_NAME=stories15M
	export MODEL_DIR=/tmp

	python3 torchchat.py generate --device mps --checkpoint-path ${MODEL_PATH} --temperature 0

	echo "************************************************************"
	echo "*** embedding"
	echo "************************************************************"

	python3 torchchat.py generate --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
	python3 torchchat.py generate --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0

	echo "************************************************************"
	echo "*** linear int8"
	echo "************************************************************"

	python3 torchchat.py generate --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0
	python3 torchchat.py generate --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0

	echo "************************************************************"
	echo "*** linear int4"
	echo "************************************************************"

	PYTORCH_ENABLE_MPS_FALLBACK=1 python3 torchchat.py generate --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0
	test-gguf-util:
	strategy:
	matrix:
	runner: [macos-14]
	runs-on: ${{matrix.runner}}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v2
	- name: Setup Python
	uses: actions/setup-python@v2
	with:
	python-version: 3.10.11
	- name: Print machine info
	run: \|
	uname -a
	if [ $(uname -s) == Darwin ]; then
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	fi
	- name: Install requirements
	run: \|
	echo "Intalling pip3 packages"
	pip3 install gguf
	./install/install_requirements.sh
	pip3 list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

	git clone https://github.com/ggerganov/llama.cpp.git
	pushd llama.cpp
	make
	popd

	- name: Download GGUF files
	run: \|
	mkdir gguf_files
	wget -O gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
	./llama.cpp/llama-quantize --allow-requantize gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf gguf_files/TinyLlama-1.1B-openorca.Q4_0.requant_F32.gguf F32

	- name: Load files
	run: \|
	touch test.py
	echo "from torchchat.utils.gguf_loader import test_by_to_float" >> test.py
	echo "test_by_to_float(\"gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf\", \"gguf_files/TinyLlama-1.1B-openorca.Q4_0.requant_F32.gguf\")" >> test.py
	cat test.py
	python test.py

	echo "Tests complete."
	test-mps-dtype:
	strategy:
	matrix:
	runner: [macos-m1-stable ]
	runs-on: ${{matrix.runner}}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v2
	- name: Setup Python
	uses: actions/setup-python@v2
	with:
	python-version: 3.10.11
	- name: Print machine info
	run: \|
	uname -a
	if [ $(uname -s) == Darwin ]; then
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	fi
	- name: Run test
	run: \|
	export PYTHON_VERSION="3.10"
	set -x
	# NS/MC: Remove previous installation of torch and torchao first
	# as this script does not install anything into conda env but rather as system dep
	pip3 uninstall -y torch \|\| true
	set -eou pipefail

	pip3 uninstall -y torchao \|\| true
	set -eou pipefail

	echo "::group::Print machine info"
	uname -a
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	echo "::endgroup::"

	echo "::group::Install requirements"
	# Install requirements
	./install/install_requirements.sh
	ls -la
	pwd
	pip3 list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	echo "::endgroup::"

	echo "::group::Download checkpoints"
	(
	mkdir -p checkpoints/stories15M
	pushd checkpoints/stories15M
	curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
	curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	popd
	)
	echo "::endgroup::"

	echo "::group::Run inference"
	export MODEL_PATH=checkpoints/stories15M/stories15M.pt
	export MODEL_NAME=stories15M
	export MODEL_DIR=/tmp
	for DTYPE in float16 float32; do
	# if [ $(uname -s) == Darwin ]; then
	# export DTYPE=float16
	# fi

	python3 torchchat.py generate --dtype ${DTYPE} --device mps --checkpoint-path ${MODEL_PATH} --temperature 0

	python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0

	python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0

	python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0

	python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0

	PYTORCH_ENABLE_MPS_FALLBACK=1 python3 torchchat.py generate --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0
	done
	compile-gguf:
	strategy:
	matrix:
	runner: [macos-14]
	runs-on: ${{matrix.runner}}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v2
	- name: Setup Python
	uses: actions/setup-python@v2
	with:
	python-version: 3.10.11
	- name: Print machine info
	run: \|
	uname -a
	if [ $(uname -s) == Darwin ]; then
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	fi
	- name: Install requirements
	run: \|
	./install/install_requirements.sh
	pip3 list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	- name: Download GGUF
	run: \|
	mkdir gguf_files
	export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
	export TOKENIZER_PATH=gguf_files/tokenizer.model

	wget -O ${GGUF_PATH} "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
	wget -O ${TOKENIZER_PATH} https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	- name: Run inference
	run: \|
	export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
	export TOKENIZER_PATH=gguf_files/tokenizer.model
	export MODEL_NAME=TinyLlama-1.1B-openorca.Q4_0.gguf
	export MODEL_DIR=/tmp

	echo "******************************************"
	echo "***** Embed: not quantized ***********"
	echo "******************************************"

	echo "Running eager"
	python3 torchchat.py generate --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --device cpu

	echo "Running compiled"
	python3 torchchat.py generate --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --compile --device cpu

	echo "******************************************"
	echo "***** Emb: channel-wise quantized ****"
	echo "******************************************"

	echo "Running eager"
	python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --device cpu

	echo "Running compiled"
	python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --compile --device cpu

	echo "******************************************"
	echo "****** Emb: group-wise quantized *****"
	echo "******************************************"

	echo "Running eager"
	python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --device cpu

	echo "Running compiled"
	python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --compile --device cpu

	echo "tests complete"
	echo "******************************************"
	# Builds the ExecuTorch (ET) native runner on one Linux and one macOS runner,
	# then exports stories15M to a .pte file and runs it through ./cmake-out/et_run
	# for the default dtype and for fp32, fp16 and bf16.
	runner-et:
	  strategy:
	    matrix:
	      runner: [16-core-ubuntu, macos-14-xlarge]
	  runs-on: ${{ matrix.runner }}
	  steps:
	    - name: Checkout repo
	      uses: actions/checkout@v3
	      with:
	        submodules: true
	    # Same action version and quoted interpreter version as the other jobs
	    # in this workflow.
	    - name: Setup Python
	      uses: actions/setup-python@v4
	      with:
	        python-version: '3.10.11'
	    - name: Setup Xcode
	      if: runner.os == 'macOS'
	      uses: maxim-lobanov/setup-xcode@v1
	      with:
	        xcode-version: '15.3'
	    - name: Print machine info
	      run: |
	        uname -a
	        # Quoted so the test stays well-formed whatever uname prints.
	        if [ "$(uname -s)" = "Darwin" ]; then
	          sysctl machdep.cpu.brand_string
	          sysctl machdep.cpu.core_count
	        fi
	    - name: Install torchchat
	      run: |
	        echo "Intalling pip3 packages"
	        ./install/install_requirements.sh
	        pip3 list
	        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	    # Publishes the pinned ET commit as env.et-git-hash for later steps by
	    # appending it to $GITHUB_ENV; the cache key below reads it.
	    - name: Set ET git sha
	      id: setup-hash
	      run: |
	        export TORCHCHAT_ROOT=${PWD}
	        echo "et-git-hash=$(cat "${TORCHCHAT_ROOT}/install/.pins/et-pin.txt")" >> "$GITHUB_ENV"
	    # Cache of the ET build, keyed on OS, architecture, the ET pin and the
	    # content hash of install_et.sh. The key string is unchanged so existing
	    # cache entries still match.
	    - name: Load or install ET
	      id: install-et
	      uses: actions/cache@v4
	      with:
	        path: |
	          ./et-build
	          ./torchchat/utils/scripts
	        key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }}
	    # Runs only on a cache miss.
	    # NOTE(review): continue-on-error lets the job proceed when install_et.sh
	    # fails; confirm this is intentional, since it can hide the real failure
	    # behind a later step.
	    - name: Install ExecuTorch
	      if: ${{ steps.install-et.outputs.cache-hit != 'true' }}
	      continue-on-error: true
	      run: |
	        echo "Installing ExecuTorch"
	        bash torchchat/utils/scripts/install_et.sh
	    - name: Install ExecuTorch python
	      run: |
	        echo "Install ExecuTorch python"
	        export TORCHCHAT_ROOT=$PWD
	        export ET_BUILD_DIR="et-build"
	        # An inline run script receives no positional arguments, so the
	        # previous "${1:-true}" always evaluated to "true".
	        ENABLE_ET_PYBIND=true
	        source "torchchat/utils/scripts/install_utils.sh"
	        install_executorch_python_libs "$ENABLE_ET_PYBIND"
	    - name: Install runner
	      run: |
	        echo "Installing runner"
	        bash torchchat/utils/scripts/build_native.sh et
	    - name: Run inference
	      run: |
	        python torchchat.py download stories15M
	        wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model

	        export PRMT="Once upon a time in a land far away"

	        python torchchat.py generate stories15M --temperature 0 --prompt "${PRMT}" --device cpu

	        # Export with the default dtype and run it through the native runner.
	        python torchchat.py export stories15M --output-pte-path ./model.pte
	        ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"

	        # Repeat per explicit dtype; ./model.pte is overwritten on each pass.
	        for dtype in fp32 fp16 bf16; do
	          echo "Testing export + runner with dtype=$dtype"
	          python torchchat.py export stories15M --dtype "$dtype" --output-pte-path ./model.pte
	          ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
	        done

	        echo "Tests complete."
	# Builds the AOT Inductor (aoti) native runner on one Linux and one macOS
	# runner, then exports the stories15M checkpoint to /tmp/model.pt2 and runs it
	# through ./cmake-out/aoti_run for each listed dtype.
	runner-aoti:
	  strategy:
	    matrix:
	      runner: [16-core-ubuntu, macos-14-xlarge]
	  runs-on: ${{ matrix.runner }}
	  env:
	    TORCHCHAT_ROOT: ${{ github.workspace }}
	  steps:
	    - name: Checkout repo
	      uses: actions/checkout@v3
	      with:
	        submodules: true
	    - name: Setup Python
	      uses: actions/setup-python@v4
	      with:
	        python-version: '3.10.11'
	    - name: Print machine info
	      run: |
	        uname -a
	    - name: Install dependencies
	      run: |
	        ./install/install_requirements.sh
	        pip3 list
	        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

	        bash torchchat/utils/scripts/build_native.sh aoti
	    # Fetches the checkpoint and tokenizer directly into checkpoints/stories15M.
	    - name: Download checkpoint
	      run: |
	        mkdir -p checkpoints/stories15M
	        pushd checkpoints/stories15M
	        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
	        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	        popd
	    - name: Run inference
	      run: |
	        # Same options as the previous "set -eou pipefail" (errexit, nounset,
	        # pipefail), written in the conventional order.
	        set -euo pipefail

	        export MODEL_DIR=${PWD}/checkpoints/stories15M
	        export PROMPT="Once upon a time in a land far away"

	        python torchchat.py generate --checkpoint-path "${MODEL_DIR}/stories15M.pt" --temperature 0 --prompt "${PROMPT}" --device cpu

	        # /tmp/model.pt2 is overwritten on each pass.
	        for dtype in fp32 fp16 bf16 fast fast16; do
	          echo "Running export + runner with dtype=$dtype"
	          python torchchat.py export --checkpoint-path "${MODEL_DIR}/stories15M.pt" --dtype "$dtype" --output-aoti-package-path /tmp/model.pt2
	          ./cmake-out/aoti_run /tmp/model.pt2 -z "${MODEL_DIR}/tokenizer.model" -i "${PROMPT}"
	        done

	        echo "Tests complete."

	# Delegates to the pytorch/test-infra reusable Linux job: installs torchchat's
	# requirements, downloads Android NDK r26c into /tmp, initialises the git
	# submodules and runs ./runner/build_android.sh.
	test-build-runner-et-android:
	  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
	  with:
	    runner: linux.4xlarge
	    script: |
	      uname -a
	      # NOTE(review): the requested runner is linux.4xlarge, so this Darwin
	      # branch looks unreachable here - confirm and remove if so.
	      if [ "$(uname -s)" = "Darwin" ]; then
	        sysctl machdep.cpu.brand_string
	        sysctl machdep.cpu.core_count
	      fi
	      ./install/install_requirements.sh
	      pip3 list
	      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

	      export TORCHCHAT_ROOT=${PWD}
	      pushd /tmp
	      wget https://dl.google.com/android/repository/android-ndk-r26c-linux.zip
	      unzip android-ndk-r26c-linux.zip
	      popd
	      export ANDROID_NDK=/tmp/android-ndk-r26c

	      # Pull submodules (re2, abseil) for Tiktoken
	      git submodule sync
	      git submodule update --init
	      ./runner/build_android.sh
	      echo "Tests complete."

	# Builds the torchao ops, ExecuTorch and both native runners (et and aoti,
	# each with link_torchao_ops) on macOS, then runs stories110M with one
	# quantization recipe through eager, --compile, the ET runner, the AOTI runner
	# and AOTI-backed generate.
	test-torchao-experimental:
	  strategy:
	    matrix:
	      runner: [macos-14-xlarge]
	  runs-on: ${{ matrix.runner }}
	  steps:
	    - name: Checkout repo
	      uses: actions/checkout@v3
	      with:
	        submodules: true
	    # Same action version and quoted interpreter version as the other jobs
	    # in this workflow.
	    - name: Setup Python
	      uses: actions/setup-python@v4
	      with:
	        python-version: '3.10.11'
	    - name: Setup Xcode
	      if: runner.os == 'macOS'
	      uses: maxim-lobanov/setup-xcode@v1
	      with:
	        xcode-version: '15.3'
	    - name: Print machine info
	      run: |
	        uname -a
	        # Quoted so the test stays well-formed whatever uname prints.
	        if [ "$(uname -s)" = "Darwin" ]; then
	          sysctl machdep.cpu.brand_string
	          sysctl machdep.cpu.core_count
	        fi
	    - name: Install torchchat
	      run: |
	        echo "Intalling pip3 packages"
	        ./install/install_requirements.sh
	        pip3 list
	        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	    - name: Install torchao-ops
	      id: install-torchao-ops
	      run: |
	        bash torchchat/utils/scripts/build_torchao_ops.sh
	    - name: Install ET
	      run: |
	        echo "Installing ExecuTorch"
	        export TORCHCHAT_ROOT=${PWD}
	        bash torchchat/utils/scripts/install_et.sh
	    - name: Install runner
	      run: |
	        echo "Installing runner"
	        bash torchchat/utils/scripts/build_native.sh et link_torchao_ops
	    - name: Install runner AOTI
	      id: install-runner-aoti
	      run: |
	        bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops
	    - name: Run inference
	      run: |
	        python torchchat.py download stories110M
	        wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	        export PRMT="Once upon a time in a land far away"
	        # One definition of the quantization recipe so the four commands that
	        # use it cannot drift apart; the JSON is unchanged.
	        QUANT_CONFIG='{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
	        echo "Generate eager"
	        python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize "${QUANT_CONFIG}"
	        echo "Generate compile"
	        python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize "${QUANT_CONFIG}" --compile
	        echo "Export and run ET (C++ runner)"
	        python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize "${QUANT_CONFIG}"
	        ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
	        echo "Export and run AOTI (C++ runner)"
	        python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize "${QUANT_CONFIG}"
	        ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}"
	        # Reuses the ./model.pt2 package produced by the export just above.
	        echo "Generate AOTI"
	        python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}"
	        echo "Tests complete."

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Integrate distributed inference with chat/server #3392

Workflow file

Integrate distributed inference with chat/server #3392

Jobs

Run details

Workflow file for this run