Restore PTQ tests for Llama2 (reopened) (NVIDIA#9064)
* Restore PTQ tests for Llama2 (MR-9018)

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>

* try not using release

Signed-off-by: eharper <eharper@nvidia.com>

* checkout v4

Signed-off-by: eharper <eharper@nvidia.com>

---------

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>
Signed-off-by: eharper <eharper@nvidia.com>
Co-authored-by: eharper <eharper@nvidia.com>
janekl and ericharper authored May 1, 2024
1 parent 5d5919f commit e267406
Showing 4 changed files with 166 additions and 11 deletions.
149 changes: 144 additions & 5 deletions .github/workflows/cicd-main.yml
@@ -132,6 +132,9 @@ jobs:
apt-get update && apt-get install libsox-fmt-all -y && \
popd
+          # AMMO installation
+          pip install nvidia-ammo~=0.9.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir
# PyTorch Lightning version
python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"
@@ -220,7 +223,26 @@ jobs:
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"


L0_Setup_Test_Data_And_Models:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python -m tests.setup --save_dir /home/TestData/nlp
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

## - name: L2: Multimodal Imagen Train

@@ -243,10 +265,9 @@ jobs:
uses: actions/checkout@v4
- run: |
          CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \
-           --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf \
-           --output_path=/home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo \
+           --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf-tiny \
+           --output_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
            --precision=16
-         rm -f /home/TestData/nlp/megatron_llama/llama-ci-hf/llama_ci.nemo
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

@@ -322,6 +343,124 @@ jobs:
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

L2_PTQ_Llama2_Export_Only:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_llama_quantization.py \
model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.algorithm=null \
model_save=/home/TestData/nlp/megatron_llama/ci_baseline
rm -rf /home/TestData/nlp/megatron_llama/ci_baseline
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

L2_PTQ_Llama2_FP8:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_llama_quantization.py \
model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
tensor_model_parallel_size=2 \
trainer.devices=2 \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=fp8 \
quantization.num_calib_size=8 \
inference.batch_size=2 \
export.inference_tensor_parallel=2 \
model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

L2_PTQ_Llama2_INT8_SQ:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_llama_quantization.py \
model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=int8_sq \
quantization.num_calib_size=8 \
inference.batch_size=2 \
model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

L2_PTQ_Llama2_INT4_AWQ:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/megatron_llama_quantization.py \
model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
tensor_model_parallel_size=1 \
trainer.devices=1 \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=int4_awq \
quantization.num_calib_size=8 \
inference.batch_size=2 \
model_save=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
rm -rf /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

# L2: ASR dev run
ASR_dev_run_Speech_to_Text:
needs: [cicd-test-container-setup]
@@ -4664,7 +4803,7 @@ jobs:
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
-       uses: actions/checkout@v2
+       uses: actions/checkout@v4
- run: |
rm -rf /home/TestData/nlp/megatron_ir/working_dir
4 changes: 1 addition & 3 deletions nemo/export/quantize/quantizer.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-import copy
import tarfile
from contextlib import nullcontext
from typing import List, Optional
@@ -21,7 +20,6 @@
import torch.distributed as dist
from megatron.core import parallel_state
from megatron.core.transformer.module import Float16Module
-from megatron.training.utils import unwrap_model
from omegaconf import OmegaConf
from omegaconf.omegaconf import DictConfig, open_dict
from pytorch_lightning.trainer.trainer import Trainer
@@ -31,7 +29,7 @@
from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision
from nemo.utils import logging
from nemo.utils.distributed import temporary_directory
-from nemo.utils.model_utils import load_config, save_artifacts
+from nemo.utils.model_utils import load_config, save_artifacts, unwrap_model

try:
import ammo.torch.quantization as atq
20 changes: 19 additions & 1 deletion nemo/utils/model_utils.py
@@ -24,7 +24,7 @@
from enum import Enum
from functools import lru_cache
from pathlib import Path
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Type, Union

import wrapt

@@ -92,6 +92,24 @@ def load_config(model_file: str) -> DictConfig:
return model_config


def unwrap_model(model, module_instances: Union[Type, Tuple[Type]]):
"""Unwrap model from wrapper classes like Float16Module, for example."""

# TODO: Import this from megatron.core once moved there from megatron.training.
return_list = True
if not isinstance(model, list):
model = [model]
return_list = False
unwrapped_model = []
for model_module in model:
while isinstance(model_module, module_instances):
model_module = model_module.module
unwrapped_model.append(model_module)
if not return_list:
return unwrapped_model[0]
return unwrapped_model
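
A small usage sketch for the helper above; _Wrapper is a stand-in for megatron's Float16Module so the example runs standalone:

    class _Wrapper:
        # Stand-in for wrapper classes that expose the wrapped model as .module
        def __init__(self, module):
            self.module = module

    inner = object()
    wrapped = _Wrapper(_Wrapper(inner))

    # Nested wrappers are peeled off until the bare module remains.
    assert unwrap_model(wrapped, _Wrapper) is inner
    # A list input returns a list of unwrapped modules.
    assert unwrap_model([wrapped, inner], (_Wrapper,)) == [inner, inner]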


def param_is_not_shared(param):
return not hasattr(param, 'shared') or not param.shared

4 changes: 2 additions & 2 deletions tests/setup/__main__.py
@@ -34,8 +34,8 @@
)

create_hf_model(
model_name_or_path="/home/TestData/nlp/meta-llama/Llama-2-7b-hf",
output_dir=os.path.join(args.save_dir, "megatron_llama/llama-ci-hf"),
model_name_or_path="/home/TestData/nlp/megatron_llama/llama-ci-hf",
output_dir=os.path.join(args.save_dir, "megatron_llama/llama-ci-hf-tiny"),
config_updates={"hidden_size": 256, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4},
overwrite=args.overwrite,
)
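
create_hf_model itself is not part of this diff; a rough standalone equivalent of the call above, assuming a plain transformers workflow (everything except the config_updates keys and the two paths is illustrative):

    import os

    from transformers import AutoConfig, AutoModelForCausalLM

    save_dir = "/home/TestData/nlp"

    # Shrink the reference Llama-2 config down to the tiny CI geometry.
    config = AutoConfig.from_pretrained("/home/TestData/nlp/megatron_llama/llama-ci-hf")
    config.update(
        {"hidden_size": 256, "num_attention_heads": 4, "num_hidden_layers": 2, "num_key_value_heads": 4}
    )

    # Randomly initialized weights are enough for a PTQ smoke test.
    model = AutoModelForCausalLM.from_config(config)
    model.save_pretrained(os.path.join(save_dir, "megatron_llama/llama-ci-hf-tiny"))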
