Commit ed46ff3

Merge branch 'main' into akoumparouli/nemo_ux_mistral_nemo
Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com>
akoumpa authored Sep 26, 2024
2 parents a02dbfc + 38e5e09 commit ed46ff3
Showing 19 changed files with 1,027 additions and 86 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cherry-pick-release-commit.yml
@@ -120,7 +120,7 @@ jobs:
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":alert: Cherrypick bot 🤖: Hey @'$USERNAME': Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: @okoenig"
"text": ":alert: Cherrypick bot 🤖: Hey <@'$USERNAME'>: Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: <@${{ secrets.SLACK_WEBHOOK_ADMIN }}>"
}
}
]
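
The fix above switches to Slack's <@MEMBER_ID> mention syntax, which renders as a real mention instead of plain text. Purely as an illustration (the webhook URL and member ID below are placeholders, and the requests package is assumed to be available), posting a comparable Block Kit payload from Python looks roughly like this:

    import json
    import requests

    # Placeholders only; a real Slack incoming-webhook URL and member ID are needed.
    WEBHOOK_URL = "https://hooks.slack.com/services/T000/B000/XXXX"
    USER_ID = "U0123456789"

    payload = {
        "blocks": [
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    # <@U...> renders as a clickable mention; a bare @name stays plain text.
                    "text": f":alert: Cherrypick bot 🤖: Hey <@{USER_ID}>: cherry-pick failed, please resolve manually.",
                },
            }
        ]
    }

    requests.post(WEBHOOK_URL, data=json.dumps(payload), headers={"Content-Type": "application/json"})
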
37 changes: 36 additions & 1 deletion .github/workflows/cicd-main.yml
@@ -5223,6 +5223,24 @@ jobs:
      AFTER_SCRIPT: |
        rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft/${{ github.run_id }}

  L2_NeMo_2_T5_Pretraining:
    needs: [cicd-test-container-setup]
    uses: ./.github/workflows/_test_template.yml
    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_T5_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true'
    with:
      RUNNER: self-hosted-azure
      SCRIPT: |
        NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \
          --devices=2 \
          --max-steps=3 \
          --experiment-dir=tests/collections/llm/t5_pretrain_results \
          --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document \
          --index-mapping-dir=tests/collections/llm/t5_index_mappings
      AFTER_SCRIPT: |
        rm -rf tests/collections/llm/t5_pretrain_results
        rm -rf tests/collections/llm/t5_index_mappings
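
For reference, the same smoke test can be launched from Python; this is only a sketch mirroring the SCRIPT block above (the data path exists only on the CI runners; NVTE_FUSED_ATTN=0 and NVTE_FLASH_ATTN=0 disable Transformer Engine's fused and flash attention backends):

    import os
    import subprocess

    # Same command as the CI step above, launched via subprocess.
    env = dict(os.environ, NVTE_FUSED_ATTN="0", NVTE_FLASH_ATTN="0")
    subprocess.run(
        [
            "python", "tests/collections/llm/megatron_t5_pretraining.py",
            "--devices=2",
            "--max-steps=3",
            "--experiment-dir=tests/collections/llm/t5_pretrain_results",
            "--data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document",
            "--index-mapping-dir=tests/collections/llm/t5_index_mappings",
        ],
        env=env,
        check=True,
    )
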
  Nemo_CICD_Test:
    needs:
      - pre-flight
@@ -5359,6 +5377,7 @@ jobs:
      - L2_NeMo_2_GPT_DDP_Param_Parity_check
      - L2_NeMo_2_SSM_Pretraining
      - L2_NeMo_2_SSM_Finetuning
      - L2_NeMo_2_T5_Pretraining
    if: always()
    runs-on: ubuntu-latest
    steps:
@@ -5377,6 +5396,23 @@ jobs:
      - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' }}
        run: exit 0

      - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' && github.event_name == 'pull_request' }}
        uses: peter-evans/create-or-update-comment@v4
        with:
          issue-number: ${{ github.event.number }}
          body: |
            [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋,
            I just wanted to let you know that a <https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}|CICD pipeline> for this PR just finished successfully ✨
            So it might be time to merge this PR or to get some approvals 🚀
            But I'm just a 🤖 so I'll leave it to you what to do next.
            Have a great day!
            //cc @ko3n1g
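
The success comment above is posted with peter-evans/create-or-update-comment (which defaults to the built-in GITHUB_TOKEN). Purely as an illustration of what that boils down to (the token, repository, and PR number below are placeholders), the equivalent GitHub REST call is:

    import requests

    TOKEN = "ghp_placeholder"   # placeholder; not how the workflow authenticates
    REPO = "NVIDIA/NeMo"        # repository in owner/name form
    PR_NUMBER = 1               # placeholder pull request number

    requests.post(
        f"https://api.github.com/repos/{REPO}/issues/{PR_NUMBER}/comments",
        headers={"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"},
        json={"body": "[🤖]: CICD pipeline for this PR finished successfully ✨"},
    )
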
      - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }}
        name: Checkout repository
        uses: actions/checkout@v4
@@ -5452,4 +5488,3 @@ jobs:
      - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }}
        run: |
          exit 1
28 changes: 17 additions & 11 deletions nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py
@@ -27,7 +27,7 @@

class AutoTokenizer(TokenizerSpec):
    """
    Wrapper of HuggingFace AutoTokenizer https://huggingface.co/transformers/model_doc/auto.html#autotokenizer.
    """

@@ -46,15 +46,14 @@ def __init__(
        use_fast: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
    ):
        """
        Args:
            pretrained_model_name: corresponds to HuggingFace-AutoTokenizer's 'pretrained_model_name_or_path' input argument.
                For more details please refer to https://huggingface.co/transformers/_modules/transformers/tokenization_auto.html#AutoTokenizer.from_pretrained.
                The list of all supported models can be found here: ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
            vocab_file: path to file with vocabulary which consists
                of characters separated by newlines.
            mask_token: mask token
            bos_token: the beginning of sequence token
            eos_token: the end of sequence token. Usually equal to sep_token
            pad_token: token to use for padding
@@ -132,24 +131,24 @@ def __init__(

        if len(new_tokens_in_vocab) > 0:
            """
            Special tokens that were not previously included in the tokenizer's vocabulary file will be added to
            the vocabulary and, as a result, the model should be resized, for example:

            # define your model
            pretrained_model_name = 'roberta-base'
            model = nemo_nlp.modules.get_lm_model(pretrained_model_name=pretrained_model_name)
            # define pretrained tokenizer
            tokenizer_default = nemo_nlp.modules.get_tokenizer(tokenizer_name=pretrained_model_name)
            special_tokens = {'bos_token': '<BOS>',
                              'cls_token': '<CSL>',
                              'additional_special_tokens': ['<MY_NER_TOKEN>', '<ANOTHER_TOKEN>']}
            tokenizer_default.add_special_tokens(special_tokens_dict=special_tokens)
            # resize your model so that the embeddings for newly added tokens are updated during training/finetuning
            model.resize_token_embeddings(tokenizer_default.vocab_size)
            See NLP_Tokenizers.ipynb for more details.
            """
            logging.warning(
@@ -159,6 +158,7 @@ def __init__(
            )
            self.add_special_tokens(special_tokens_dict)
        self.space_sensitive = self.text_to_tokens('x y') != self.text_to_tokens('x') + self.text_to_tokens('y')
        self._inv_vocab_dict = {}

    @property
    def vocab_size(self):
@@ -226,6 +226,12 @@ def vocab(self):
        id2vocab = {v: k for k, v in self.tokenizer.vocab.items()}
        return [id2vocab[i] for i in range(len(id2vocab))]

    @property
    def inv_vocab(self):
        if self._inv_vocab_dict == {}:
            self._inv_vocab_dict = {v: k for k, v in self.tokenizer.vocab.items()}
        return self._inv_vocab_dict

    @property
    def pad_id(self):
        if getattr(self, 'pad_token') is None:
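
The new inv_vocab property builds the id-to-token mapping once and caches it in self._inv_vocab_dict instead of recomputing the inversion on every call. A tiny self-contained sketch of the same pattern (the dict below is just a stand-in for self.tokenizer.vocab):

    # Stand-in vocabulary; the real one comes from the HuggingFace tokenizer.
    vocab = {"hello": 0, "world": 1, "<pad>": 2}
    _inv_vocab_cache = {}

    def inv_vocab():
        global _inv_vocab_cache
        if _inv_vocab_cache == {}:  # build lazily on first access, then reuse
            _inv_vocab_cache = {v: k for k, v in vocab.items()}
        return _inv_vocab_cache

    print(inv_vocab()[1])  # -> 'world'
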
5 changes: 5 additions & 0 deletions nemo/collections/llm/__init__.py
@@ -102,13 +102,18 @@
    gpt_data_step,
    gpt_forward_step,
)
from nemo.collections.llm.t5.model import T5Config, T5Model, t5_data_step, t5_forward_step

__all__ = [
    "MockDataModule",
    "GPTModel",
    "GPTConfig",
    "gpt_data_step",
    "gpt_forward_step",
    "T5Model",
    "T5Config",
    "t5_data_step",
    "t5_forward_step",
    "MaskedTokenLossReduction",
    "MistralConfig7B",
    "MistralNeMoConfig12B",
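
With these additions the T5 entry points are importable straight from the collection root, mirroring the existing GPT exports. A minimal sketch, assuming a NeMo installation that includes this commit (constructor arguments are not shown because they are defined in nemo/collections/llm/t5/model):

    # Only verifies that the new symbols resolve from the package root.
    from nemo.collections.llm import T5Config, T5Model, t5_data_step, t5_forward_step

    print(T5Model, T5Config)
    print(t5_data_step, t5_forward_step)
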
2 changes: 1 addition & 1 deletion nemo/collections/llm/peft/lora.py
@@ -50,7 +50,7 @@ def forward(self, x):
            linear_output, bias, layernorm_output = linear_output
            x = layernorm_output

-        adapter_output = self.adapter(x)
+        adapter_output = self.adapter(x.contiguous())
        return linear_output + adapter_output, bias


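
The one-line change calls .contiguous() on the adapter input. A quick PyTorch illustration (assumes PyTorch is installed; not NeMo-specific) of why a non-contiguous activation, such as one produced by a transpose, can need this before being handed to another module:

    import torch

    x = torch.randn(4, 8).t()     # a transposed view is not contiguous in memory
    print(x.is_contiguous())      # False

    x_c = x.contiguous()          # materializes a contiguous copy with the same values
    print(x_c.is_contiguous())    # True
    print(torch.equal(x, x_c))    # True: only the memory layout changed
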
Empty file.
3 changes: 3 additions & 0 deletions nemo/collections/llm/t5/data/__init__.py
@@ -0,0 +1,3 @@
from nemo.collections.llm.t5.data.pre_training import PreTrainingDataModule

__all__ = ["PreTrainingDataModule"]