Skip to content

Commit

Permalink
Merge branch 'main' into jiemingz/first_val_step
Browse files Browse the repository at this point in the history
  • Loading branch information
JimmyZhang12 authored Jan 23, 2024
2 parents 9da045a + a44b75d commit 44c9a52
Show file tree
Hide file tree
Showing 108 changed files with 5,354 additions and 1,014 deletions.
22 changes: 11 additions & 11 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -100,17 +100,6 @@ RUN INSTALL_MSG=$(/bin/bash /tmp/torchaudio_build/scripts/installers/install_tor
else echo "Skipping failed torchaudio installation"; fi \
else echo "torchaudio installed successfully"; fi

# install nemo dependencies
WORKDIR /tmp/nemo
ENV LHOTSE_REQUIRE_TORCHAUDIO=0
COPY requirements .
RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done

# install flash attention
RUN pip install flash-attn
# install numba for latest containers
RUN pip install numba>=0.57.1

COPY scripts /tmp/nemo/scripts/
# install correct graphviz version (k2 and pynini visualization tool), skip if installation fails
RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_graphviz.sh --docker); INSTALL_CODE=$?; \
Expand All @@ -133,6 +122,17 @@ RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_k2.sh); INSTALL
else echo "Skipping failed k2 installation"; fi \
else echo "k2 installed successfully"; fi

# install nemo dependencies
WORKDIR /tmp/nemo
# exported before the requirements install below so pip's build/install steps see it
ENV LHOTSE_REQUIRE_TORCHAUDIO=0
COPY requirements .
# Iterate with a shell glob instead of parsing `ls` output, and quote the
# filename so it is passed to pip as a single argument.
RUN for f in requirements*.txt; do pip3 install --disable-pip-version-check --no-cache-dir -r "$f"; done

# install flash attention
RUN pip install --no-cache-dir flash-attn
# install numba for latest containers
# The requirement specifier MUST be quoted: RUN uses shell form, so an unquoted
# `numba>=0.57.1` is parsed as `pip install numba` with stdout redirected to a
# file named `=0.57.1`, silently dropping the version constraint.
RUN pip install --no-cache-dir "numba>=0.57.1"

# copy nemo source into a scratch image
FROM scratch as nemo-src
COPY . .
Expand Down
16 changes: 8 additions & 8 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -3949,7 +3949,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
}
failFast true
steps {
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
Expand Down Expand Up @@ -3978,7 +3978,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
model.data.validation_ds.num_workers=0 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]"
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=1 \
Expand Down Expand Up @@ -4054,7 +4054,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
failFast true
steps {
sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2"
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
Expand Down Expand Up @@ -4089,7 +4089,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
failFast true
steps {
sh "rm -rf /home/TestData/nlp/lora_tuning_tp2"
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
Expand All @@ -4111,7 +4111,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
model.data.validation_ds.num_workers=0 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]"
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py \
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
model.peft.restore_from_ckpt_name=null \
Expand Down Expand Up @@ -4176,7 +4176,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
}
failFast true
steps{
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py \
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \
model.peft.restore_from_path=null \
model.data.test_ds.file_names=['/home/TestData/nlp/megatron_gpt_sft/sample.jsonl'] \
Expand Down Expand Up @@ -4995,7 +4995,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
failFast true
steps {
sh "rm -rf /home/TestData/nlp/t5_lora_tuning_tp2"
sh "python examples/nlp/language_modeling/tuning/megatron_t5_peft_tuning.py \
sh "python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
Expand All @@ -5017,7 +5017,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
model.data.validation_ds.num_workers=0 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]"
sh "python examples/nlp/language_modeling/tuning/megatron_t5_peft_eval.py \
sh "python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \
model.peft.restore_from_path=/home/TestData/nlp/t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \
model.peft.restore_from_ckpt_name=null \
Expand Down
3 changes: 3 additions & 0 deletions docs/source/asr/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,11 @@ You can easily convert your existing NeMo-compatible ASR datasets using the
--num_shards=<number of tarfiles that will contain the audio> \
--max_duration=<float representing maximum duration of audio samples> \
--min_duration=<float representing minimum duration of audio samples> \
--force_codec=flac \
--shuffle --shuffle_seed=0
.. note:: For extra reduction of storage space at the cost of lossy (but high-quality) compression, you may use ``--force_codec=opus`` instead.

This script shuffles the entries in the given manifest (if ``--shuffle`` is set, which we recommend), filters
audio files according to ``min_duration`` and ``max_duration``, and tars the remaining audio files to the directory
``--target_dir`` in ``n`` shards, along with separate manifest and metadata files.
Expand Down
22 changes: 21 additions & 1 deletion docs/source/core/exp_manager.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ via YAML or CLI:
Experiment Loggers
------------------

Alongside Tensorboard, NeMo also supports Weights and Biases, MLFlow and DLLogger. To use these loggers, simply set the following
Alongside Tensorboard, NeMo also supports Weights and Biases, MLFlow, DLLogger, ClearML and NeptuneLogger. To use these loggers, simply set the following
via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`.


Expand Down Expand Up @@ -153,6 +153,26 @@ ClearML
log_cfg: False # log config to clearml server
log_metrics: False # log metrics to clearml server
Neptune
~~~~~~~

.. _exp_manager_neptune-label:

.. code-block:: yaml
exp_manager:
...
create_checkpoint_callback: True
create_neptune_logger: false
neptune_logger_kwargs:
project: ${project}
name: ${name}
prefix: train
log_model_checkpoints: false # set to True if checkpoints need to be pushed to Neptune
tags: null # can specify as an array of strings in yaml array format
description: null
<Add any other arguments supported by Neptune logger here>
Exponential Moving Average
--------------------------

Expand Down
4 changes: 2 additions & 2 deletions docs/source/multimodal/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Model Classes
:members: __init__, configure_optimizers


.. autoclass:: nemo.collections.multimodal.models.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion
.. autoclass:: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion
:show-inheritance:
:no-members:
:members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets
Expand Down Expand Up @@ -49,7 +49,7 @@ Modules
:show-inheritance:
:no-members:

.. autoclass:: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL
.. autoclass:: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKL
:show-inheritance:
:no-members:
:members: __init__, encode, decode
Expand Down
2 changes: 1 addition & 1 deletion docs/source/multimodal/mllm/checkpoint.rst
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ Adjust model parallelism with:
--target_tensor_model_parallel_size=??? \
--pipeline_model_parallel_size=??? \
--target_pipeline_model_parallel_size=??? \
--model_class="nemo.collections.multimodal.models.neva.neva_model.MegatronNevaModel" \
--model_class="nemo.collections.multimodal.models.multimodal_llm.neva.neva_model.MegatronNevaModel" \
--precision=32 \
--tokenizer_model_path=/path/to/tokenizer.model \
--tp_conversion_only
2 changes: 1 addition & 1 deletion docs/source/multimodal/text2img/insp2p.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Model Introduction

InstructPix2Pix [InstructPix2Pix]_ :cite:`mm-models-insp2p` offers a unique approach to image editing using human-written instructions. Given an input image and a textual directive, the model adjusts the image according to the provided instructions. NeMo Multimodal presents a training pipeline for this conditional diffusion model, utilizing a dataset generated by harnessing the strengths of two prominent pretrained models: a language model (GPT-3) and a text-to-image model (Stable Diffusion). The InstructPix2Pix model operates swiftly, editing images within seconds, eliminating the need for per-example fine-tuning or inversion. It has demonstrated remarkable results across a wide variety of input images and written instructions.

Built upon the Stable Diffusion framework, NeMo's InstructPix2Pix shares a similar architecture with Stable Diffusion (refer to :doc:`Stable Diffusion <./sd>`). What sets it apart is its unique training dataset and the combined guidance from both image and text prompts. Specifically, InstructPix2pix ::class::``nemo.collections.multimodal.models.instruct_pix2pix.ldm.ddpm_edit.MegatronLatentDiffusionEdit`` is derived directly from Stable Diffusion's ::class::``nemo.collections.multimodal.models.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion``, with alterations to accommodate the dataset and provide support for dual guidance.
Built upon the Stable Diffusion framework, NeMo's InstructPix2Pix shares a similar architecture with Stable Diffusion (refer to :doc:`Stable Diffusion <./sd>`). What sets it apart is its unique training dataset and the combined guidance from both image and text prompts. Specifically, InstructPix2Pix :class:`nemo.collections.multimodal.models.instruct_pix2pix.ldm.ddpm_edit.MegatronLatentDiffusionEdit` is derived directly from Stable Diffusion's :class:`nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion`, with alterations to accommodate the dataset and provide support for dual guidance.

Training Dataset
--------------------
Expand Down
2 changes: 1 addition & 1 deletion docs/source/multimodal/text2img/sd.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ The VAE configuration is defined under **first_stage_config**.
.. code-block:: yaml
first_stage_config:
_target_: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL
_target_: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKL
from_pretrained: /path/to/vae.bin
embed_dim: 4
monitor: val/rec_loss
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ inference:
compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False
end_strings: ["<extra_id_1>","<extra_id_7>",] # generation will stop when one of these tokens is generated
images_base_path: /pwd/images
insert_image_token: null # `left` or `right` or `null`

trainer:
devices: 8
Expand All @@ -24,7 +25,7 @@ tensor_model_parallel_size: 8
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0 # used for encoder and decoder model (0 for others)
neva_model_file: /pwd/nemo_experiments/nemo_llava.nemo #neva_22b_tp8_finetuned_v1.nemo neva_8b_tp4_finetuned_v1.nemo
llm_model_file: null
base_model_file: null
checkpoint_dir: null #/pwd/nemo_multimodal/nemo_experiments/nemo_llava_finetune/checkpoints # checkpoint file dir. This is used to load the PTL checkpoint generated during the Kosmos training
checkpoint_name: null #megatron_clip--val_loss=0.41-step=13499-consumed_samples=431904.0.ckpt # PTL checkpoint file name, only used for PTL checkpoint loading
hparams_file: null #/pwd/nemo_multimodal/nemo_experiments/nemo_llava_finetune/version_0/hparams.yaml # model configuration file, only used for PTL checkpoint loading
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ model:

optim:
name: fused_adam
lr: 2e-5
lr: 2e-4
weight_decay: 0.
betas:
- 0.9
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
python convert_hf_llava_to_neva.py \
--in-file <path_to_hf_checkpoints_folder> \
--out-file <path_to_output_nemo_file> \
--tokenizer-model <path_to_sp_tokenizer_model>
--tokenizer-model <path_to_sp_tokenizer_model> \
--conv-template llama_2 # nvgpt, llama_2, v1 (vicuna)
"""

import os
Expand Down Expand Up @@ -49,6 +50,13 @@ def get_args():
"--in-file", type=str, default=None, required=True, help="Path to Huggingface LLaMA checkpoints",
)
parser.add_argument("--out-file", type=str, default=None, required=True, help="Path to output .nemo file.")
parser.add_argument(
"--conv-template",
type=str,
default="llama_2",
required=False,
help="Conversation template: nvgpt, llama_2, v1 (vicuna)",
)
parser.add_argument(
"--tokenizer-model", type=str, default=None, required=False, help="Path to sentencepiece tokenizer model."
)
Expand Down Expand Up @@ -121,6 +129,8 @@ def load_config(args, llava_config):
nemo_config.num_query_groups = llava_config['num_key_value_heads']
nemo_config.use_cpu_initialization = True
nemo_config.activation = 'fast-swiglu'
nemo_config.data.conv_template = args.conv_template
nemo_config.mm_cfg.model_type = args.conv_template
if args.tokenizer_model is None:
nemo_config.tokenizer.model = llava_config['tokenizer_model']
else:
Expand Down
41 changes: 41 additions & 0 deletions examples/multimodal/multimodal_llm/neva/eval/gradio_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import base64

import requests

# Minimal client example: send one text prompt plus one base64-encoded image
# to a Gradio server's predict endpoint and print the JSON reply.

# URL of the Gradio server
url = 'http://localhost:8890/api/predict/'

# Prepare the text data
text_data = '<image>Describe this image please.'

# Prepare the image data: read the raw bytes and base64-encode them so the
# image can be embedded as a string in the JSON payload.
with open("/path/to/images/001.jpg", "rb") as image_file:
    encoded_string = base64.b64encode(image_file.read()).decode()

# Data to send
data = {'data': [text_data, encoded_string]}

# Sending a POST request to the Gradio server.
# requests has no default timeout, so without one this call can block forever
# if the server stalls; the value is generous to leave room for slow inference.
response = requests.post(url, json=data, timeout=300)

# Checking if the request was successful
if response.status_code == 200:
    # Parsing the response
    response_data = response.json()
    print("Response from server:", response_data)
else:
    print("Failed to get a response from the server, status code:", response.status_code)
Loading

0 comments on commit 44c9a52

Please sign in to comment.