
Commit

Merge branch 'NVIDIA:main' into main
JimmyZhang12 committed Feb 2, 2024
2 parents 788ed1b + 5fdd12e commit a0d1685
Showing 19 changed files with 1,212 additions and 15 deletions.
2 changes: 1 addition & 1 deletion README.rst
@@ -101,7 +101,7 @@ Key Features
* Hybrid Transducer/CTC
* NeMo Original `Multi-blank Transducers <https://arxiv.org/abs/2211.03541>`_ and `Token-and-Duration Transducers (TDT) <https://arxiv.org/abs/2304.06795>`_
* Streaming/Buffered ASR (CTC/Transducer) - `Chunked Inference Examples <https://github.com/NVIDIA/NeMo/tree/stable/examples/asr/asr_chunked_inference>`_
* `Cache-aware Streaming Conformer <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer>`_ with multiple lookaheads.
* `Cache-aware Streaming Conformer <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer>`_ with multiple lookaheads (including microphone streaming `tutorial <https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_ASR_Microphone_Demo_Cache_Aware_Streaming.ipynb>`_).
* Beam Search decoding
* `Language Modelling for ASR (CTC and RNNT) <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/asr_language_modeling.html>`_: N-gram LM in fusion with Beam Search decoding, Neural Rescoring with Transformer
* `Support of long audios for Conformer with memory efficient local attention <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/results.html#inference-on-long-audio>`_
4 changes: 1 addition & 3 deletions docs/source/asr/intro.rst
@@ -108,9 +108,7 @@ See more information about LM decoding :doc:`here <./asr_language_modeling>`.
Use real-time transcription
---------------------------

It is possible to use NeMo to transcribe speech in real-time. You can find an example of how to do
this in the following `notebook tutorial <https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_ASR_Microphone_Demo.ipynb>`_.

It is possible to use NeMo to transcribe speech in real-time. We provide tutorial notebooks for `Cache Aware Streaming <https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_ASR_Microphone_Demo_Cache_Aware_Streaming.ipynb>`_ and `Buffered Streaming <https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_ASR_Microphone_Demo_Buffered_Streaming.ipynb>`_.

Try different ASR models
------------------------
2 changes: 2 additions & 0 deletions docs/source/asr/models.rst
@@ -159,6 +159,8 @@ You may find more examples under ``<NeMo_git_root>/examples/asr/conf/fastconform
Cache-aware Streaming Conformer
-------------------------------

Try real-time ASR with the Cache-aware Streaming Conformer `tutorial notebook <https://github.com/NVIDIA/NeMo/blob/main/tutorials/asr/Online_ASR_Microphone_Demo_Cache_Aware_Streaming.ipynb>`_.

Buffered streaming uses overlapping chunks so that an offline ASR model can be used for streaming with reasonable accuracy. However, the overlapping chunks lead to a significant amount of duplicated computation.
There is also an accuracy gap between the offline model and the streaming one, since the model is trained differently from how it is used for streaming inference.
The Cache-aware Streaming Conformer models address these disadvantages. These streaming Conformers are trained with limited right context, which makes it possible to match how the model is used during training and inference.
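
As a quick sketch (the checkpoint name and lookahead values below are illustrative, not prescriptive), loading a pretrained cache-aware streaming model and transcribing a file might look like::

    import nemo.collections.asr as nemo_asr

    # Any cache-aware streaming checkpoint can be substituted here.
    asr_model = nemo_asr.models.ASRModel.from_pretrained(
        model_name="stt_en_fastconformer_hybrid_large_streaming_multi"
    )
    # Optionally pick the lookahead used at inference: [left_context, right_context] frames.
    asr_model.encoder.set_default_att_context_size([70, 1])
    transcripts = asr_model.transcribe(["sample.wav"])  # placeholder audio path
    print(transcripts[0])
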
2 changes: 2 additions & 0 deletions docs/source/conf.py
@@ -64,6 +64,8 @@
'PIL',
'boto3',
'taming',
'cytoolz', # for adapters
'megatron', # for nlp
]

_skipped_autodoc_mock_imports = ['wrapt', 'numpy']
7 changes: 5 additions & 2 deletions docs/source/starthere/tutorials.rst
@@ -47,8 +47,11 @@ To run a tutorial:
- Offline ASR Inference with Beam Search and External Language Model Rescoring
- `Offline ASR <https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/asr/Offline_ASR.ipynb>`_
* - ASR
- Online ASR inference with Microphone
- `Online ASR Microphone <https://github.com/NVIDIA/NeMo/blob/stable/tutorials/asr/Online_ASR_Microphone_Demo.ipynb>`_
- Online ASR inference with Microphone (Cache-Aware Streaming)
- `Online ASR Microphone Cache Aware Streaming <https://github.com/NVIDIA/NeMo/blob/stable/tutorials/asr/Online_ASR_Microphone_Demo_Cache_Aware_Streaming.ipynb>`_
* - ASR
- Online ASR inference with Microphone (Buffered Streaming)
- `Online ASR Microphone Buffered Streaming <https://github.com/NVIDIA/NeMo/blob/stable/tutorials/asr/Online_ASR_Microphone_Demo_Buffered_Streaming.ipynb>`_
* - ASR
- Fine-tuning CTC Models on New Languages
- `ASR CTC Language Fine-Tuning <https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb>`_
2 changes: 1 addition & 1 deletion docs/update_docs_docker.sh
@@ -1,5 +1,5 @@
cd ../
docker run --rm -v $PWD:/workspace python:3.8 /bin/bash -c "cd /workspace && \
docker run --rm -v $PWD:/workspace python:3.10 /bin/bash -c "cd /workspace && \
pip install -r requirements/requirements_docs.txt && cd docs/ && rm -rf build && make clean && make html && make html"
echo "To start web server just run in docs directory:"
echo "python3 -m http.server 8000 --directory ./build/html/"
8 changes: 6 additions & 2 deletions examples/nlp/language_modeling/conf/megatron_bert_config.yaml
@@ -2,7 +2,7 @@ name: megatron_bert
restore_from_path: null # used when starting from a .nemo file

trainer:
devices: 2
devices: 1
num_nodes: 1
accelerator: gpu
precision: 16
@@ -56,15 +56,19 @@ model:
hidden_size: 768
ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size.
num_attention_heads: 12
skip_head: False
transformer_block_type: post_ln
init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
normalization: layernorm
layernorm_epsilon: 1e-5
make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
pre_process: True # add embedding
post_process: True # add pooler
bert_binary_head: True # BERT binary head
megatron_legacy: False

tokenizer:
library: 'megatron'
@@ -128,7 +132,7 @@ model:
# - /raid/data/pile/my-gpt3_00_text_document
# - .5
# - /raid/data/pile/my-gpt3_01_text_document
data_prefix: ???
data_prefix: [1.0, /path/to/data]
index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
data_impl: mmap
splits_string: 900,50,50
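
For reference, data_prefix accepts either a single dataset path or an alternating weight/path list; a hedged sketch of setting a blended dataset from Python (paths reuse the commented example above, and the model.data key layout is assumed):

    from omegaconf import OmegaConf

    cfg = OmegaConf.load("megatron_bert_config.yaml")
    # Alternating relative weight / path entries describe a blended dataset.
    cfg.model.data.data_prefix = [
        0.5, "/raid/data/pile/my-gpt3_00_text_document",
        0.5, "/raid/data/pile/my-gpt3_01_text_document",
    ]
    print(OmegaConf.to_yaml(cfg.model.data.data_prefix))
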
2 changes: 2 additions & 0 deletions nemo/collections/asr/modules/conformer_encoder.py
@@ -786,6 +786,8 @@ def set_default_att_context_size(self, att_context_size):
if att_context_size is not None:
self.att_context_size = att_context_size

self.setup_streaming_params()

def setup_streaming_params(
self,
chunk_size: int = None,
@@ -65,6 +65,9 @@ def bert_extended_attention_mask(attention_mask):
# [b, 1, s, s]
extended_attention_mask = attention_mask_bss.unsqueeze(1)

# HF Masking is equivalent to the one below
# extended_attention_mask = (attention_mask.unsqueeze(1) * torch.ones_like(attention_mask).unsqueeze(2)).unsqueeze(1)

# Convert attention mask to binary:
extended_attention_mask = extended_attention_mask < 0.5

@@ -182,12 +185,15 @@ def __init__(
activations_checkpoint_num_layers=1,
activations_checkpoint_layers_per_pipeline=None,
layernorm_epsilon=1e-5,
normalization='layernorm',
transformer_block_type='pre_ln',
masked_softmax_fusion=False,
bias_gelu_fusion=True,
bias_dropout_add_fusion=True,
openai_gelu=False,
onnx_safe=False,
add_binary_head=True,
skip_head=False,
megatron_legacy=False,
sequence_parallel=False,
position_embedding_type='learned_absolute',
@@ -229,6 +235,8 @@ def __init__(
activations_checkpoint_num_layers=activations_checkpoint_num_layers,
activations_checkpoint_layers_per_pipeline=activations_checkpoint_layers_per_pipeline,
layernorm_epsilon=layernorm_epsilon,
normalization=normalization,
transformer_block_type=transformer_block_type,
masked_softmax_fusion=masked_softmax_fusion,
bias_activation_fusion=bias_gelu_fusion,
bias_dropout_add_fusion=bias_dropout_add_fusion,
@@ -242,6 +250,8 @@ def __init__(
init_method=init_method_normal(init_method_std), vocab_size=vocab_size, hidden_size=hidden_size
)

if skip_head:
self.post_process = False
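# With skip_head set, post_process is forced off so neither the LM head nor the
# binary head below is constructed, presumably so the bare encoder outputs can
# be consumed directly (e.g., by the text-embedding model added in this change).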
if self.post_process:
self.lm_head = BertLMHead(
config,
@@ -184,10 +184,13 @@ def model_provider_func(self, pre_process, post_process):
),
layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
masked_softmax_fusion=cfg.get('masked_softmax_fusion', True),
normalization=cfg.get('normalization', 'layernorm'),
transformer_block_type=cfg.get('transformer_block_type', 'pre_ln'),
bias_gelu_fusion=cfg.get('bias_gelu_fusion', True),
bias_dropout_add_fusion=cfg.get("bias_dropout_add_fusion", True),
onnx_safe=cfg.get('onnx_safe', False),
add_binary_head=cfg.bert_binary_head,
skip_head=cfg.get('skip_head', False),
megatron_legacy=cfg.get('megatron_legacy', False),
position_embedding_type=self.cfg.get("position_embedding_type", "learned_absolute"),
)
@@ -1034,5 +1037,65 @@ def build_transformer_config(self) -> TransformerConfig:
"""
activation = self.cfg.get('activation', 'gelu')
assert activation == 'gelu', "Only gelu activation is supported for BERT at the moment."

normalization = self.cfg.get('normalization', 'layernorm')

layernorm_zero_centered_gamma = self.cfg.get('normalization', 'layernorm') == 'layernorm1p'
if normalization == 'layernorm':
normalization = 'LayerNorm'
elif normalization == 'rmsnorm':
normalization = 'RMSNorm'
elif normalization == 'layernorm1p':
normalization = 'LayerNorm'
layernorm_zero_centered_gamma = True
else:
logging.warning(
f"The normalization type: {normalization} might not be supported in megatron core."
f"Supported types are LayerNorm and RMSNorm."
)

# any configs that are not in the nemo model config will be added here
model_specific_configs = {
'layernorm_zero_centered_gamma': layernorm_zero_centered_gamma,
'normalization': normalization,
}

transformer_config = super().build_transformer_config()

for key, value in model_specific_configs.items():
setattr(transformer_config, key, value)

# pass mcore customization configs directly to mcore
mcore_customization_config_dict = self.cfg.get('mcore_customization_config', {})
for key, value in mcore_customization_config_dict.items():
setattr(transformer_config, key, value)

return transformer_config


class MegatronBertTextEmbeddingModel(MegatronBertModel):
"""
Megatron Bert Text Embedding.
Model returns [batch, hidden] shape
"""

def average_pool(self, last_hidden_states, attention_mask):
last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

def forward(
self,
input_ids,
attention_mask,
token_type_ids,
lm_labels=None,
checkpoint_activations_all_layers=None,
model=None,
):
outputs = super().forward(
input_ids, attention_mask, token_type_ids, lm_labels, checkpoint_activations_all_layers, model
)
embeddings = self.average_pool(outputs[0], attention_mask)
embeddings = F.normalize(embeddings, p=2, dim=1)

return embeddings
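
# A standalone check (shapes assumed) of the masked mean pooling plus L2
# normalization used by the embedding model above.
import torch
import torch.nn.functional as F

def average_pool(last_hidden_states, attention_mask):
    # Zero out padded positions, then average over the valid tokens only.
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

hidden = torch.randn(2, 5, 8)                    # [batch, seq, hidden]
mask = torch.tensor([[1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1]])
embeddings = F.normalize(average_pool(hidden, mask), p=2, dim=1)
print(embeddings.shape, embeddings.norm(dim=1))  # [2, 8]; norms are all 1.0
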
27 changes: 25 additions & 2 deletions nemo/collections/nlp/modules/common/megatron/transformer.py
@@ -625,7 +625,6 @@ def forward(
)

output = bias_dropout_add_func(mlp_output, mlp_bias, residual, self.hidden_dropout)
# print(f"Layer: {self.layer_number} MLP + Dropout + Residual checksum {output.sum()}")

if self.transformer_block_type == 'post_ln':
output = self.post_attention_layernorm(output)
@@ -1158,6 +1157,27 @@ def build_layer(layer_number):
offset = parallel_state.get_pipeline_model_parallel_rank() * self.num_layers

self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)])
if self.pre_process and self.transformer_block_type == 'post_ln':
# Final layer norm before output.
if normalization == 'layernorm':
self.initial_layernorm = get_layer_norm(
hidden_size, layernorm_epsilon, persist_layer_norm, sequence_parallel=config.sequence_parallel
)

elif normalization == 'layernorm1p':
self.initial_layernorm = LayerNorm1P(
hidden_size, layernorm_epsilon, sequence_parallel_enabled=config.sequence_parallel
)
elif normalization == 'low_precision_layernorm':
self.initial_layernorm = LPLayerNorm(hidden_size, layernorm_epsilon)
else:
self.initial_layernorm = MixedFusedRMSNorm(hidden_size, layernorm_epsilon)
# for architectures such as MPT, there is no bias term even on the layernorms
# this code allows us to remove the bias terms from the layernorm module
# so that we can support MPT. However, certain apex-based LNs don't support
# removing bias, so we also have to check for that
if not bias and normalization not in ['layernorm', 'layernorm1p']:
remove_bias_from_layernorm(self.initial_layernorm)

if self.post_process and self.transformer_block_type != 'post_ln':
# Final layer norm before output.
@@ -1435,7 +1455,10 @@ def forward(
'get_key_value does not work with ' 'activation checkpointing'
)

if not self.pre_process:
if self.pre_process:
if self.transformer_block_type == 'post_ln':
hidden_states = self.initial_layernorm(hidden_states)
else:
# See set_input_tensor()
hidden_states = self.input_tensor

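# A minimal sketch (plain PyTorch, not the NeMo layer) of the post-LN block
# ordering referenced above: normalization follows each residual sum, so raw
# embeddings would otherwise reach the first sublayer unnormalized. This is
# presumably why an initial layer norm is now applied to hidden_states when
# pre_process (the embedding stage) and transformer_block_type == 'post_ln'
# are both set, matching the original post-LN BERT, which normalizes its
# embeddings before the first transformer layer.
import torch
import torch.nn as nn

class PostLNBlock(nn.Module):
    def __init__(self, d):
        super().__init__()
        self.mixer = nn.Linear(d, d)  # stand-in for self-attention
        self.mlp = nn.Sequential(nn.Linear(d, 4 * d), nn.GELU(), nn.Linear(4 * d, d))
        self.ln1, self.ln2 = nn.LayerNorm(d), nn.LayerNorm(d)

    def forward(self, x):
        x = self.ln1(x + self.mixer(x))   # post-LN: LN(x + sublayer(x))
        return self.ln2(x + self.mlp(x))  # pre-LN would compute x + sublayer(LN(x))

print(PostLNBlock(16)(torch.randn(2, 5, 16)).shape)  # torch.Size([2, 5, 16])
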
4 changes: 4 additions & 0 deletions nemo/core/neural_types/elements.py
@@ -96,6 +96,10 @@ def fields(self):
return None

def compare(self, second) -> NeuralTypeComparisonResult:
if torch.jit.is_scripting():
# Neural types for TorchScript are suppressed
# This is a stub to make TorchScript happy
return NeuralTypeComparisonResult.SAME
# First, check general compatibility
first_t = type(self)
second_t = type(second)
18 changes: 15 additions & 3 deletions nemo/core/neural_types/neural_type.py
@@ -51,8 +51,21 @@ def __str__(self):
else:
return f"axes: None; elements_type: {self.elements_type.__class__.__name__}"

def __init__(self, axes: Optional[Any] = None, elements_type: Optional[Any] = None, optional: bool = False):
"""
Args:
axes: a tuple of AxisTypes objects representing the semantics of what varying each axis means
elements_type: None or ElementType; we need Any annotation here to avoid problems with TorchScript (it is checked in _init_internal)
optional: If input to the port of this type can be optional (False by default).
"""
if not torch.jit.is_scripting():
self._init_internal(axes=axes, elements_type=elements_type, optional=optional)

@torch.jit.unused
def __init__(self, axes: Optional[Any] = None, elements_type: Optional[ElementType] = None, optional=False):
def _init_internal(
self, axes: Optional[Any] = None, elements_type: Optional[ElementType] = None, optional: bool = False
):
"""Internals of __init__, separated to make TorchScript and autodoc work"""
if elements_type is None:
elements_type = VoidType()
if not isinstance(elements_type, ElementType):
@@ -62,8 +75,7 @@ def __init__(self, axes: Optional[Any] = None, elements_type: Optional[ElementTy
)
self.elements_type = elements_type
if axes is not None:
if not torch.jit.is_scripting():
NeuralType.__check_sanity(axes)
NeuralType.__check_sanity(axes)
axes_list = []
for axis in axes:
if isinstance(axis, str):
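# A minimal sketch (not NeMo code) of the TorchScript-friendly __init__ pattern
# used above: __init__ stays trivially scriptable, and the real initialization
# lives in a helper marked @torch.jit.unused that only runs in eager mode.
import torch

class Tagged:
    def __init__(self, value: int = 0):
        if not torch.jit.is_scripting():
            self._init_internal(value)

    @torch.jit.unused
    def _init_internal(self, value: int):
        # Anything TorchScript cannot compile (isinstance checks on custom
        # classes, dynamic typing, and so on) can safely live here.
        self.value = value

print(Tagged(3).value)  # 3
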
2 changes: 1 addition & 1 deletion requirements/requirements_asr.txt
@@ -5,7 +5,7 @@ ipywidgets
jiwer
kaldi-python-io
kaldiio
lhotse>=1.19.2
lhotse==1.19.2
librosa>=0.10.0
marshmallow
matplotlib
