Merge branch 'main' into jiemingz/first_val_step

JimmyZhang12 · Jan 23, 2024 · 44c9a52 · 44c9a52
2 parents 9da045a + a44b75d
commit 44c9a52
Show file tree

Hide file tree

Showing 108 changed files with 5,354 additions and 1,014 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -100,17 +100,6 @@ RUN INSTALL_MSG=$(/bin/bash /tmp/torchaudio_build/scripts/installers/install_tor
   else echo "Skipping failed torchaudio installation"; fi \
   else echo "torchaudio installed successfully"; fi
 
-# install nemo dependencies
-WORKDIR /tmp/nemo
-ENV LHOTSE_REQUIRE_TORCHAUDIO=0
-COPY requirements .
-RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done
-
-# install flash attention
-RUN pip install flash-attn
-# install numba for latest containers
-RUN pip install numba>=0.57.1
-
 COPY scripts /tmp/nemo/scripts/
 # install correct graphviz version (k2 and pynini visualization tool), skip if installation fails
 RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_graphviz.sh --docker); INSTALL_CODE=$?; \
@@ -133,6 +122,17 @@ RUN INSTALL_MSG=$(/bin/bash /tmp/nemo/scripts/installers/install_k2.sh); INSTALL
   else echo "Skipping failed k2 installation"; fi \
   else echo "k2 installed successfully"; fi
 
+# install nemo dependencies
+WORKDIR /tmp/nemo
+ENV LHOTSE_REQUIRE_TORCHAUDIO=0
+COPY requirements .
+RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done
+
+# install flash attention
+RUN pip install flash-attn
+# install numba for latest containers (quoted: an unquoted ">=" is parsed by the shell as a redirect to a file named "=0.57.1")
+RUN pip install "numba>=0.57.1"
+
 # copy nemo source into a scratch image
 FROM scratch as nemo-src
 COPY . .

diff --git a/Jenkinsfile b/Jenkinsfile
@@ -3949,7 +3949,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
       }
       failFast true
       steps {
-        sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \
+        sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
         trainer.devices=2 \
         trainer.log_every_n_steps=1 \
         trainer.val_check_interval=2 \
@@ -3978,7 +3978,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
         model.data.validation_ds.num_workers=0 \
         model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
         model.data.validation_ds.names=[quarel]"
-        sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \
+        sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
         trainer.devices=2 \
         trainer.log_every_n_steps=1 \
         trainer.val_check_interval=1 \
@@ -4054,7 +4054,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
       failFast true
       steps {
         sh "rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2"
-        sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \
+        sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
         trainer.devices=2 \
         trainer.log_every_n_steps=1 \
         trainer.max_epochs=9999 \
@@ -4089,7 +4089,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
       failFast true
       steps {
         sh "rm -rf /home/TestData/nlp/lora_tuning_tp2"
-        sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \
+        sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
         trainer.devices=2 \
         trainer.log_every_n_steps=1 \
         trainer.max_epochs=9999 \
@@ -4111,7 +4111,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
         model.data.validation_ds.num_workers=0 \
         model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
         model.data.validation_ds.names=[quarel]"
-        sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py \
+        sh "python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
         model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
         model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
         model.peft.restore_from_ckpt_name=null \
@@ -4176,7 +4176,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
       }
       failFast true
       steps{
-        sh "python examples/nlp/language_modeling/tuning/megatron_gpt_peft_eval.py \
+        sh "python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
             model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \
             model.peft.restore_from_path=null \
             model.data.test_ds.file_names=['/home/TestData/nlp/megatron_gpt_sft/sample.jsonl'] \
@@ -4995,7 +4995,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
       failFast true
       steps {
         sh "rm -rf /home/TestData/nlp/t5_lora_tuning_tp2"
-        sh "python examples/nlp/language_modeling/tuning/megatron_t5_peft_tuning.py \
+        sh "python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \
         trainer.devices=2 \
         trainer.log_every_n_steps=1 \
         trainer.max_epochs=9999 \
@@ -5017,7 +5017,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
         model.data.validation_ds.num_workers=0 \
         model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
         model.data.validation_ds.names=[quarel]"
-        sh "python examples/nlp/language_modeling/tuning/megatron_t5_peft_eval.py \
+        sh "python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \
         model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \
         model.peft.restore_from_path=/home/TestData/nlp/t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \
         model.peft.restore_from_ckpt_name=null \

diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst
@@ -265,8 +265,11 @@ You can easily convert your existing NeMo-compatible ASR datasets using the
     --num_shards=<number of tarfiles that will contain the audio> \
     --max_duration=<float representing maximum duration of audio samples> \
     --min_duration=<float representing minimum duration of audio samples> \
+    --force_codec=flac \
     --shuffle --shuffle_seed=0
 
+.. note:: For extra reduction of storage space at the cost of lossy (but high-quality) compression, you may use ``--force_codec=opus`` instead.
+
 This script shuffles the entries in the given manifest (if ``--shuffle`` is set, which we recommend), filters
 audio files according to ``min_duration`` and ``max_duration``, and tars the remaining audio files to the directory
 ``--target_dir`` in ``n`` shards, along with separate manifest and metadata files.

diff --git a/docs/source/core/exp_manager.rst b/docs/source/core/exp_manager.rst
@@ -73,7 +73,7 @@ via YAML or CLI:
 Experiment Loggers
 ------------------
 
-Alongside Tensorboard, NeMo also supports Weights and Biases, MLFlow and DLLogger. To use these loggers, simply set the following
+Alongside Tensorboard, NeMo also supports Weights and Biases, MLFlow, DLLogger, ClearML and NeptuneLogger. To use these loggers, simply set the following
 via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`.
 
 
@@ -153,6 +153,26 @@ ClearML
             log_cfg: False  # log config to clearml server
             log_metrics: False  # log metrics to clearml server
 
+Neptune
+~~~~~~~
+
+.. _exp_manager_neptune-label:
+
+.. code-block:: yaml
+
+    exp_manager:
+        ...
+        create_checkpoint_callback: True
+        create_neptune_logger: false
+        neptune_logger_kwargs:
+            project: ${project}
+            name: ${name}
+            prefix: train
+            log_model_checkpoints: false # set to True if checkpoints need to be pushed to Neptune
+            tags: null # can specify as an array of strings in yaml array format
+            description: null
+            <Add any other arguments supported by Neptune logger here>
+
 Exponential Moving Average
 --------------------------
 

diff --git a/docs/source/multimodal/api.rst b/docs/source/multimodal/api.rst
@@ -10,7 +10,7 @@ Model Classes
     :members: __init__, configure_optimizers
 
 
-.. autoclass:: nemo.collections.multimodal.models.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion
+.. autoclass:: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion
     :show-inheritance:
     :no-members:
     :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets
@@ -49,7 +49,7 @@ Modules
     :show-inheritance:
     :no-members:
 
-.. autoclass:: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL
+.. autoclass:: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKL
     :show-inheritance:
     :no-members:
     :members: __init__, encode, decode

diff --git a/docs/source/multimodal/mllm/checkpoint.rst b/docs/source/multimodal/mllm/checkpoint.rst
@@ -108,7 +108,7 @@ Adjust model parallelism with:
     --target_tensor_model_parallel_size=??? \
     --pipeline_model_parallel_size=??? \
     --target_pipeline_model_parallel_size=??? \
-    --model_class="nemo.collections.multimodal.models.neva.neva_model.MegatronNevaModel" \
+    --model_class="nemo.collections.multimodal.models.multimodal_llm.neva.neva_model.MegatronNevaModel" \
     --precision=32 \
     --tokenizer_model_path=/path/to/tokenizer.model \
     --tp_conversion_only
diff --git a/docs/source/multimodal/text2img/insp2p.rst b/docs/source/multimodal/text2img/insp2p.rst
@@ -6,7 +6,7 @@ Model Introduction
 
 InstructPix2Pix [InstructPix2Pix]_ :cite:`mm-models-insp2p` offers a unique approach to image editing using human-written instructions. Given an input image and a textual directive, the model adjusts the image according to the provided instructions. NeMo Multimodal presents a training pipeline for this conditional diffusion model, utilizing a dataset generated by harnessing the strengths of two prominent pretrained models: a language model (GPT-3) and a text-to-image model (Stable Diffusion). The InstructPix2Pix model operates swiftly, editing images within seconds, eliminating the need for per-example fine-tuning or inversion. It has demonstrated remarkable results across a wide variety of input images and written instructions.
 
-Built upon the Stable Diffusion framework, NeMo's InstructPix2Pix shares a similar architecture with Stable Diffusion (refer to :doc:`Stable Diffusion <./sd>`). What sets it apart is its unique training dataset and the combined guidance from both image and text prompts. Specifically, InstructPix2pix ::class::``nemo.collections.multimodal.models.instruct_pix2pix.ldm.ddpm_edit.MegatronLatentDiffusionEdit`` is derived directly from Stable Diffusion's ::class::``nemo.collections.multimodal.models.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion``, with alterations to accommodate the dataset and provide support for dual guidance.
+Built upon the Stable Diffusion framework, NeMo's InstructPix2Pix shares a similar architecture with Stable Diffusion (refer to :doc:`Stable Diffusion <./sd>`). What sets it apart is its unique training dataset and the combined guidance from both image and text prompts. Specifically, InstructPix2Pix :class:`nemo.collections.multimodal.models.instruct_pix2pix.ldm.ddpm_edit.MegatronLatentDiffusionEdit` is derived directly from Stable Diffusion's :class:`nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion`, with alterations to accommodate the dataset and provide support for dual guidance.
 
 Training Dataset
 --------------------

diff --git a/docs/source/multimodal/text2img/sd.rst b/docs/source/multimodal/text2img/sd.rst
@@ -33,7 +33,7 @@ The VAE configuration is defined under **first_stage_config**.
 .. code-block:: yaml
 
     first_stage_config:
-        _target_: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL
+        _target_: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKL
         from_pretrained: /path/to/vae.bin
         embed_dim: 4
         monitor: val/rec_loss

diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_inference.yaml
@@ -11,6 +11,7 @@ inference:
   compute_logprob: False  # a flag used to compute logprob of all the input text, a very special case of running inference, default False
   end_strings: ["<extra_id_1>","<extra_id_7>",]  # generation will stop when one of these tokens is generated
   images_base_path: /pwd/images
+  insert_image_token: null # `left` or `right` or `null`
 
 trainer:
   devices: 8
@@ -24,7 +25,7 @@ tensor_model_parallel_size: 8
 pipeline_model_parallel_size: 1
 pipeline_model_parallel_split_rank: 0 # used for encoder and decoder model (0 for others)
 neva_model_file: /pwd/nemo_experiments/nemo_llava.nemo #neva_22b_tp8_finetuned_v1.nemo neva_8b_tp4_finetuned_v1.nemo
-llm_model_file: null
+base_model_file: null
 checkpoint_dir: null #/pwd/nemo_multimodal/nemo_experiments/nemo_llava_finetune/checkpoints # checkpoint file dir. This is used to load the PTL checkpoint generated during the Kosmos training
 checkpoint_name: null #megatron_clip--val_loss=0.41-step=13499-consumed_samples=431904.0.ckpt # PTL checkpoint file name, only used for PTL checkpoint loading
 hparams_file: null #/pwd/nemo_multimodal/nemo_experiments/nemo_llava_finetune/version_0/hparams.yaml # model configuration file, only used for PTL checkpoint loading

diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_peft.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_peft.yaml
@@ -209,7 +209,7 @@ model:
 
   optim:
     name: fused_adam
-    lr: 2e-5
+    lr: 2e-4
     weight_decay: 0.
     betas:
       - 0.9

diff --git a/examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py b/examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py
@@ -18,7 +18,8 @@
     python convert_hf_llava_to_neva.py \
      --in-file <path_to_hf_checkpoints_folder> \
      --out-file <path_to_output_nemo_file> \
-     --tokenizer-model <path_to_sp_tokenizer_model>
+     --tokenizer-model <path_to_sp_tokenizer_model> \
+     --conv-template llama_2 # nvgpt, llama_2, v1 (vicuna)
 """
 
 import os
@@ -49,6 +50,13 @@ def get_args():
         "--in-file", type=str, default=None, required=True, help="Path to Huggingface LLaMA checkpoints",
     )
     parser.add_argument("--out-file", type=str, default=None, required=True, help="Path to output .nemo file.")
+    parser.add_argument(
+        "--conv-template",
+        type=str,
+        default="llama_2",
+        required=False,
+        help="Conversation template: nvgpt, llama_2, v1 (vicuna)",
+    )
     parser.add_argument(
         "--tokenizer-model", type=str, default=None, required=False, help="Path to sentencepiece tokenizer model."
     )
@@ -121,6 +129,8 @@ def load_config(args, llava_config):
         nemo_config.num_query_groups = llava_config['num_key_value_heads']
     nemo_config.use_cpu_initialization = True
     nemo_config.activation = 'fast-swiglu'
+    nemo_config.data.conv_template = args.conv_template
+    nemo_config.mm_cfg.model_type = args.conv_template
     if args.tokenizer_model is None:
         nemo_config.tokenizer.model = llava_config['tokenizer_model']
     else:

diff --git a/examples/multimodal/multimodal_llm/neva/eval/gradio_cli.py b/examples/multimodal/multimodal_llm/neva/eval/gradio_cli.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+
+import requests
+
+# URL of the Gradio server's prediction endpoint
+url = 'http://localhost:8890/api/predict/'
+
+# Prepare the text data (the question, prefixed with an `<image>` tag)
+text_data = '<image>Describe this image please.'
+
+# Read the image and base64-encode it so it can be carried inside the JSON payload
+with open("/path/to/images/001.jpg", "rb") as image_file:
+    encoded_string = base64.b64encode(image_file.read()).decode()
+
+# Data to send: the text prompt followed by the encoded image
+data = {'data': [text_data, encoded_string]}
+
+# Send a POST request to the Gradio server; the timeout keeps the script from hanging forever on an unresponsive server
+response = requests.post(url, json=data, timeout=120)
+
+# Checking if the request was successful
+if response.status_code == 200:
+    # Parsing the response
+    response_data = response.json()
+    print("Response from server:", response_data)
+else:
+    print("Failed to get a response from the server, status code:", response.status_code)