From fb4cbdc277a0d345e0343b0f7bb63176aeb41cf9 Mon Sep 17 00:00:00 2001 From: Raul Puri Date: Wed, 27 Mar 2019 13:55:24 -0700 Subject: [PATCH] initial commit --- LICENSE | 204 +++ README.md | 86 ++ arguments.py | 283 ++++ configure_data.py | 229 +++ data_utils/__init__.py | 115 ++ data_utils/corpora.py | 37 + data_utils/datasets.py | 676 +++++++++ data_utils/file_utils.py | 253 ++++ data_utils/lazy_loader.py | 195 +++ data_utils/samplers.py | 87 ++ data_utils/tf_dl.py | 87 ++ data_utils/tokenization.py | 788 ++++++++++ data_utils/wordpiece.py | 390 +++++ fp16/__init__.py | 30 + fp16/fp16.py | 627 ++++++++ fp16/fp16util.py | 200 +++ fp16/loss_scaler.py | 223 +++ learning_rates.py | 74 + model/__init__.py | 17 + model/distributed.py | 108 ++ model/model.py | 88 ++ model/modeling.py | 1314 +++++++++++++++++ optim/__init__.py | 26 + optim/adam.py | 124 ++ pretrain_bert.py | 490 ++++++ requirements.txt | 5 + scripts/pretrain_bert.sh | 37 + scripts/pretrain_bert_distributed.sh | 44 + scripts/pretrain_bert_sentencepiece.sh | 37 + .../pretrain_bert_tfrecords_distributed.sh | 43 + utils.py | 180 +++ 31 files changed, 7097 insertions(+) create mode 100644 LICENSE create mode 100644 README.md create mode 100644 arguments.py create mode 100644 configure_data.py create mode 100644 data_utils/__init__.py create mode 100755 data_utils/corpora.py create mode 100644 data_utils/datasets.py create mode 100755 data_utils/file_utils.py create mode 100644 data_utils/lazy_loader.py create mode 100644 data_utils/samplers.py create mode 100755 data_utils/tf_dl.py create mode 100755 data_utils/tokenization.py create mode 100755 data_utils/wordpiece.py create mode 100644 fp16/__init__.py create mode 100755 fp16/fp16.py create mode 100644 fp16/fp16util.py create mode 100755 fp16/loss_scaler.py create mode 100644 learning_rates.py create mode 100755 model/__init__.py create mode 100755 model/distributed.py create mode 100755 model/model.py create mode 100644 model/modeling.py create mode 100755 optim/__init__.py create mode 100755 optim/adam.py create mode 100755 pretrain_bert.py create mode 100644 requirements.txt create mode 100755 scripts/pretrain_bert.sh create mode 100755 scripts/pretrain_bert_distributed.sh create mode 100755 scripts/pretrain_bert_sentencepiece.sh create mode 100755 scripts/pretrain_bert_tfrecords_distributed.sh create mode 100644 utils.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..cb87378 --- /dev/null +++ b/LICENSE @@ -0,0 +1,204 @@ +------------- LICENSE FOR huggingface(transformer) repository -------------- + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..0804d67 --- /dev/null +++ b/README.md @@ -0,0 +1,86 @@ +Megatron is a large, powerful transformer. This repo is for ongoing research on training large, powerful transformer language models at scale. Currently, we support multinode training of [BERT](https://arxiv.org/pdf/1810.04805.pdf) in mixed precision. 
Our codebase is capable of training BERT Large on 64 V100 GPUs in 3 days. We achieved a final language modeling perplexity of 3.15 and a SQuAD F1-score of 90.7.
+
+# Setup
+We officially support only Python 3.6.
+
+To use this repo, please install the latest supported version of PyTorch with GPU support.
+
+Additionally, part of this codebase leverages tensorflow-cpu to perform dataloading of TFRecords. We recommend creating a virtual environment (to avoid breaking existing TensorFlow installations) and installing from our `requirements.txt`:
+
+```
+python -m pip install virtualenv
+virtualenv bert_env
+source bert_env/bin/activate
+pip install -r requirements.txt
+```
+
+
+# Usage
+We've provided 4 scripts that pretrain BERT. All saved checkpoints can be used for finetuning according to [existing implementations](https://github.com/huggingface). Save model checkpoints with `--save`.
+
+## BERT Pretraining
+`bash scripts/pretrain_bert.sh`
+
+This script runs single-GPU BERT pretraining and is mainly intended for debugging purposes.
+
+To use this script, place your `--train-data` in loose JSON format, with one JSON object per line. The text field of your JSON dictionaries should correspond to `--text-key`.
+
+```
+python pretrain_bert.py \
+    --batch-size 4 \
+    --tokenizer-type BertWordPieceTokenizer \
+    --cache-dir temp_cache_dir \
+    --tokenizer-model-type bert-large-uncased \
+    --vocab-size 30522 \
+    --train-data wikipedia \
+    --loose-json \
+    --text-key text \
+    --split 1000,1,1 \
+    --lazy-loader \
+    --max-preds-per-seq 80 \
+    --seq-length 512 \
+    --max-position-embeddings 512 \
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --intermediate-size 4096 \
+    --num-attention-heads 16 \
+    --hidden-dropout 0.1 \
+    --attention-dropout 0.1 \
+    --train-iters 1000000 \
+    --lr 0.0001 \
+    --lr-decay-style linear \
+    --lr-decay-iters 990000 \
+    --warmup .01 \
+    --weight-decay 1e-2 \
+    --clip-grad 1.0 \
+    --fp16 \
+    --fp32-layernorm \
+    --fp32-embedding \
+    --hysteresis 2 \
+    --num-workers 2
+```
+
+## Distributed BERT Pretraining
+`bash scripts/pretrain_bert_distributed.sh`
+
+To use this script, follow the same data preparation procedure as in the [earlier section](#bert-pretraining). This script uses the PyTorch distributed launcher to launch distributed training. As such, multinode training can be achieved by properly setting environment variables for the `env://` init method. See the official PyTorch [documentation](https://pytorch.org/docs/stable/distributed.html#launch-utility) for further description of these [environment variables](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization). By default, multinode training uses the nccl distributed backend.
+
+## Distributed BERT Pretraining with TFRecords
+`bash scripts/pretrain_bert_tfrecords_distributed.sh`
+
+This script takes advantage of TensorFlow BERT's [`create_pretraining_data.py`](https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/LanguageModeling/BERT/create_pretraining_data.py) script to pre-cache the dataset in the TFRecord format. To convert the data to PyTorch tensors, we use a `TFRecordDataset` and TensorFlow eager mode to turn the TFRecords into numpy matrices before loading them into PyTorch GPU tensors. This greatly reduces the overhead of data processing and speeds up training. Pass a whitespace-separated list of TFRecord paths to `--train-data` and enable the `--use-tfrecords` flag. Multinode training can be achieved as described in the [previous section](#distributed-bert-pretraining).
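As a rough illustration of the eager-mode conversion described above (a simplified stand-in for `data_utils/tf_dl.py`, not the actual implementation), a minimal sketch might look like the following. The feature names match those written by `create_pretraining_data.py`, while the file path, sequence lengths, and batch size are placeholders:

```python
# Sketch: TFRecords -> numpy (via TF 1.x eager mode) -> torch tensors.
import tensorflow as tf
import torch

tf.enable_eager_execution()

max_seq_len, max_preds_per_seq = 512, 80
features = {
    'input_ids': tf.FixedLenFeature([max_seq_len], tf.int64),
    'input_mask': tf.FixedLenFeature([max_seq_len], tf.int64),
    'segment_ids': tf.FixedLenFeature([max_seq_len], tf.int64),
    'masked_lm_positions': tf.FixedLenFeature([max_preds_per_seq], tf.int64),
    'masked_lm_ids': tf.FixedLenFeature([max_preds_per_seq], tf.int64),
    'next_sentence_labels': tf.FixedLenFeature([1], tf.int64),
}

dataset = tf.data.TFRecordDataset(['train.tfrecord'])  # placeholder path
dataset = dataset.map(lambda rec: tf.parse_single_example(rec, features)).batch(4)

for batch in dataset:
    # eager tensors expose .numpy(); torch.from_numpy avoids an extra copy
    input_ids = torch.from_numpy(batch['input_ids'].numpy())
    break
```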
+
+## Train Custom SentencePiece Tokenizer and Pretrain BERT
+`bash scripts/pretrain_bert_sentencepiece.sh`
+
+This script runs BERT pretraining with a `sentencepiece` tokenizer. If no sentencepiece tokenizer exists at `--tokenizer-path`, one will be trained automatically. The sentencepiece tokenizer can also be used with the previous scripts (NOTE: sentencepiece training can only happen during single-GPU pretraining). `<--tokenizer-path>.vocab` can be used with [`create_pretraining_data.py`](https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/LanguageModeling/BERT/create_pretraining_data.py) to make a TFRecord dataset with the given tokenization.
+
+
+# Collecting Wikipedia Training Data
+We recommend following the Wikipedia data extraction process specified by Google Research: "the recommended pre-processing is to download [the latest dump](https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2), extract the text with [WikiExtractor.py](https://github.com/attardi/wikiextractor), and then apply any necessary cleanup to convert it into plain text."
+
+We recommend using the `--json` argument when using WikiExtractor, which will dump the Wikipedia data into loose JSON format (one JSON object per line), making it more manageable and readily consumable by our codebase.
+
+Once the JSON dataset is ready, make sure to set the path in line 27 of `data_utils/corpora.py`.
+
+If your system is memory limited, we also recommend running pretraining with the `--lazy-loader` argument, as we've done. After preprocessing the dataset once, this allows the dataset to be lazily loaded from disk, as opposed to being stored entirely in memory.
diff --git a/arguments.py b/arguments.py
new file mode 100644
index 0000000..d7d554e
--- /dev/null
+++ b/arguments.py
@@ -0,0 +1,283 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""argparser configuration"""
+
+import argparse
+import os
+import torch
+
+
+def add_model_config_args(parser):
+    """Model arguments"""
+
+    group = parser.add_argument_group('model', 'model configuration')
+
+    group.add_argument('--pretrained-bert', action='store_true',
+                       help='use a pretrained bert-large-uncased model instead '
+                       'of initializing from scratch. See '
+                       '--tokenizer-model-type to specify which pretrained '
+                       'BERT model to use')
+    group.add_argument('--attention-dropout', type=float, default=0.1,
+                       help='dropout probability for attention weights')
+    group.add_argument('--num-attention-heads', type=int, default=16,
+                       help='num of transformer attention heads')
+    group.add_argument('--hidden-size', type=int, default=1024,
+                       help='transformer hidden size')
+    group.add_argument('--intermediate-size', type=int, default=None,
+                       help='transformer embedding dimension for FFN; '
+                       'set to 4*`--hidden-size` if it is None')
+    group.add_argument('--num-layers', type=int, default=24,
+                       help='number of transformer layers')
+    group.add_argument('--layernorm-epsilon', type=float, default=1e-12,
+                       help='layer norm epsilon')
+    group.add_argument('--hidden-dropout', type=float, default=0.0,
+                       help='dropout probability for hidden state transformer')
+    group.add_argument('--max-position-embeddings', type=int, default=512,
+                       help='maximum number of position embeddings to use')
+    group.add_argument('--vocab-size', type=int, default=30522,
+                       help='vocab size to use for non-character-level '
+                       'tokenization. This value will only be used when '
+                       'creating a tokenizer')
+
+    return parser
+
+
+def add_fp16_config_args(parser):
+    """Mixed precision arguments."""
+
+    group = parser.add_argument_group('fp16', 'fp16 configurations')
+
+    group.add_argument('--fp16', action='store_true',
+                       help='Run model in fp16 mode')
+    group.add_argument('--fp32-embedding', action='store_true',
+                       help='embedding in fp32')
+    group.add_argument('--fp32-layernorm', action='store_true',
+                       help='layer norm in fp32')
+    group.add_argument('--fp32-tokentypes', action='store_true',
+                       help='embedding token types in fp32')
+    group.add_argument('--fp32-allreduce', action='store_true',
+                       help='all-reduce in fp32')
+    group.add_argument('--hysteresis', type=int, default=2,
+                       help='hysteresis for dynamic loss scaling')
+    group.add_argument('--loss-scale', type=float, default=None,
+                       help='Static loss scaling, positive power of 2 '
+                       'values can improve fp16 convergence. If None, dynamic '
+                       'loss scaling is used.')
+    group.add_argument('--loss-scale-window', type=float, default=1000,
+                       help='Window over which to raise/lower dynamic scale')
+    group.add_argument('--min-scale', type=float, default=1,
+                       help='Minimum loss scale for dynamic loss scale')
+
+    return parser
+
+
+def add_training_args(parser):
+    """Training arguments."""
+
+    group = parser.add_argument_group('train', 'training configurations')
+
+    group.add_argument('--batch-size', type=int, default=4,
+                       help='Data Loader batch size')
+    group.add_argument('--weight-decay', type=float, default=0.01,
+                       help='weight decay coefficient for L2 regularization')
+    group.add_argument('--checkpoint-activations', action='store_true',
+                       help='checkpoint activations to allow for training '
+                       'with larger models and sequences')
+    group.add_argument('--clip-grad', type=float, default=1.0,
+                       help='gradient clipping')
+    group.add_argument('--epochs', type=int, default=1,
+                       help='upper epoch limit')
+    group.add_argument('--log-interval', type=int, default=100,
+                       help='report interval')
+    group.add_argument('--train-iters', type=int, default=1000000,
+                       help='number of iterations per epoch')
+    group.add_argument('--seed', type=int, default=1234,
+                       help='random seed')
+    # Learning rate.
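As an aside on the fp16 group above: `--loss-scale`, `--loss-scale-window`, `--hysteresis`, and `--min-scale` configure dynamic loss scaling. A simplified sketch of the conventional policy these flags control follows; the class and its starting scale are illustrative only, and the real implementation lives in `fp16/loss_scaler.py`:

```python
# Simplified dynamic loss-scaling policy (illustrative; not fp16/loss_scaler.py).
class SimpleDynamicLossScaler:
    def __init__(self, init_scale=2.**32, scale_window=1000,
                 hysteresis=2, min_scale=1.):
        self.loss_scale = init_scale
        self.scale_window = scale_window  # --loss-scale-window
        self.hysteresis = hysteresis      # --hysteresis
        self.cur_hysteresis = hysteresis
        self.min_scale = min_scale        # --min-scale
        self.good_steps = 0               # iterations since last overflow

    def update(self, has_overflow):
        if has_overflow:
            # tolerate `hysteresis` consecutive overflows before halving
            if self.cur_hysteresis <= 1:
                self.loss_scale = max(self.loss_scale / 2., self.min_scale)
                self.cur_hysteresis = self.hysteresis
            else:
                self.cur_hysteresis -= 1
            self.good_steps = 0
        else:
            self.good_steps += 1
            # double the scale after `scale_window` overflow-free iterations
            if self.good_steps % self.scale_window == 0:
                self.loss_scale *= 2.
```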
+ group.add_argument('--lr-decay-iters', type=int, default=None, + help='number of iterations to decay LR over,' + ' If None defaults to `--train-iters`*`--epochs`') + group.add_argument('--lr-decay-style', type=str, default='linear', + choices=['constant', 'linear', 'cosine', 'exponential'], + help='learning rate decay function') + group.add_argument('--lr', type=float, default=1.0e-4, + help='initial learning rate') + group.add_argument('--warmup', type=float, default=0.01, + help='percentage of data to warmup on (.01 = 1% of all ' + 'training iters). Default 0.01') + # model checkpointing + group.add_argument('--save', type=str, default=None, + help='Output directory to save checkpoints to.') + group.add_argument('--save-iters', type=int, default=None, + help='Save every so often iterations.') + group.add_argument('--save-optim', action='store_true', + help='Save current optimizer.') + group.add_argument('--save-rng', action='store_true', + help='Save current rng state.') + group.add_argument('--save-all-rng', action='store_true', + help='Save current rng state of each rank in ' + 'distributed training.') + group.add_argument('--load', type=str, default=None, + help='Path to a particular model checkpoint. \ + (ex. `savedir/model.1000.pt`)') + group.add_argument('--load-optim', action='store_true', + help='Load most recent optimizer corresponding ' + 'to `--load`.') + group.add_argument('--load-rng', action='store_true', + help='Load most recent rng state corresponding ' + 'to `--load`.') + group.add_argument('--load-all-rng', action='store_true', + help='Load most recent rng state of each rank in ' + 'distributed training corresponding to `--load`(' + 'complementary to `--save-all-rng`).') + group.add_argument('--resume-dataloader', action='store_true', + help='Resume the dataloader when resuming training. ' + 'Does not apply to tfrecords dataloader, try resuming' + 'with a different seed in this case.') + # distributed training args + group.add_argument('--distributed-backend', default='nccl', + help='which backend to use for distributed ' + 'training. One of [gloo, nccl]') + group.add_argument('--local_rank', type=int, default=None, + help='local rank passed from distributed launcher') + + return parser + + +def add_evaluation_args(parser): + """Evaluation arguments.""" + + group = parser.add_argument_group('validation', 'validation configurations') + + group.add_argument('--eval-batch-size', type=int, default=None, + help='Data Loader batch size for evaluation datasets.' + 'Defaults to `--batch-size`') + group.add_argument('--eval-iters', type=int, default=2000, + help='number of iterations per epoch to run ' + 'validation/test for') + group.add_argument('--eval-seq-length', type=int, default=None, + help='Maximum sequence length to process for ' + 'evaluation. Defaults to `--seq-length`') + group.add_argument('--eval-max-preds-per-seq', type=int, default=None, + help='Maximum number of predictions to use for ' + 'evaluation. 
Defaults to ' + 'math.ceil(`--eval-seq-length`*.15/10)*10') + + return parser + + +def add_data_args(parser): + """Train/valid/test data arguments.""" + + group = parser.add_argument_group('data', 'data configurations') + + group.add_argument('--train-data', nargs='+', required=True, + help='Filename (or whitespace separated filenames) ' + 'for training.') + group.add_argument('--delim', default=',', + help='delimiter used to parse csv data files') + group.add_argument('--text-key', default='sentence', + help='key to use to extract text from json/csv') + group.add_argument('--eval-text-key', default=None, + help='key to use to extract text from ' + 'json/csv evaluation datasets') + group.add_argument('--valid-data', nargs='*', default=None, + help="""Filename for validation data.""") + group.add_argument('--split', default='1000,1,1', + help='comma-separated list of proportions for training,' + ' validation, and test split') + group.add_argument('--test-data', nargs='*', default=None, + help="""Filename for testing""") + + group.add_argument('--lazy-loader', action='store_true', + help='whether to lazy read the data set') + group.add_argument('--loose-json', action='store_true', + help='Use loose json (one json-formatted string per ' + 'newline), instead of tight json (data file is one ' + 'json string)') + group.add_argument('--num-workers', type=int, default=2, + help="""Number of workers to use for dataloading""") + group.add_argument('--tokenizer-model-type', type=str, + default='bert-large-uncased', + help="Model type to use for sentencepiece tokenization \ + (one of ['bpe', 'char', 'unigram', 'word']) or \ + bert vocab to use for BertWordPieceTokenizer (one of \ + ['bert-large-uncased', 'bert-large-cased', etc.])") + group.add_argument('--tokenizer-path', type=str, default='tokenizer.model', + help='path used to save/load sentencepiece tokenization ' + 'models') + group.add_argument('--tokenizer-type', type=str, + default='BertWordPieceTokenizer', + choices=['CharacterLevelTokenizer', + 'SentencePieceTokenizer', + 'BertWordPieceTokenizer'], + help='what type of tokenizer to use') + group.add_argument("--cache-dir", default=None, type=str, + help="Where to store pre-trained BERT downloads") + group.add_argument('--use-tfrecords', action='store_true', + help='load `--train-data`, `--valid-data`, ' + '`--test-data` from BERT tf records instead of ' + 'normal data pipeline') + group.add_argument('--seq-length', type=int, default=512, + help="Maximum sequence length to process") + group.add_argument('--max-preds-per-seq', type=int, default=None, + help='Maximum number of predictions to use per sequence.' + 'Defaults to math.ceil(`--seq-length`*.15/10)*10.' + 'MUST BE SPECIFIED IF `--use-tfrecords` is True.') + + return parser + + +def print_args(args): + """Print arguments.""" + + print('arguments:', flush=True) + for arg in vars(args): + dots = '.' 
* (29 - len(arg)) + print(' {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True) + + +def get_args(): + """Parse all the args.""" + + parser = argparse.ArgumentParser(description='PyTorch BERT Model') + parser = add_model_config_args(parser) + parser = add_fp16_config_args(parser) + parser = add_training_args(parser) + parser = add_evaluation_args(parser) + parser = add_data_args(parser) + + args = parser.parse_args() + + args.cuda = torch.cuda.is_available() + args.rank = int(os.getenv('RANK', '0')) + args.world_size = int(os.getenv("WORLD_SIZE", '1')) + + args.dynamic_loss_scale = False + if args.loss_scale is None: + args.dynamic_loss_scale = True + print(' > using dynamic loss scaling') + + # The args fp32_* or fp16_* meant to be active when the + # args fp16 is set. So the default behaviour should all + # be false. + if not args.fp16: + args.fp32_embedding = False + args.fp32_tokentypes = False + args.fp32_layernorm = False + + print_args(args) + return args diff --git a/configure_data.py b/configure_data.py new file mode 100644 index 0000000..fa1dd92 --- /dev/null +++ b/configure_data.py @@ -0,0 +1,229 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""parses arguments and preps data loader""" + +import copy +import torch +import data_utils + + +class DataConfig: + + def __init__(self, defaults={}): + super(DataConfig, self).__init__() + self.defaults = defaults + + def apply(self, args): + print('configuring data') + self.apply_defaults(args) + return make_loaders(args) + + def set_defaults(self, **kwargs): + for k, v in kwargs.items(): + self.defaults[k] = v + + def apply_defaults(self, args): + for k, v in self.defaults.items(): + k = k.replace('-', '_') + if not hasattr(args, k): + setattr(args, k, v) + + +def make_data_loader(dataset, batch_size, args): + + shuffle = args.shuffle + if shuffle: + sampler = torch.utils.data.RandomSampler(dataset) + else: + sampler = torch.utils.data.SequentialSampler(dataset) + world_size = args.world_size + rank = args.rank + distributed = world_size > 1 + drop_last = distributed + + if distributed: + batch_sampler = data_utils.samplers.DistributedBatchSampler(sampler, + batch_size, + drop_last, + rank, + world_size) + else: + batch_sampler = torch.utils.data.BatchSampler(sampler, + batch_size, + drop_last) + + data_loader = torch.utils.data.DataLoader(dataset, + batch_sampler=batch_sampler, + num_workers=args.num_workers, + pin_memory=True) + + return data_loader + + +def make_tfrecord_loaders(args): + """Load train/val/test dataset from shuffled TFRecords""" + + import data_utils.tf_dl + data_set_args = {'batch_size': args.batch_size, + 'max_seq_len': args.seq_length, + 'max_preds_per_seq': args.max_preds_per_seq, + 'train': True, + 'num_workers': args.num_workers, + 'seed': args.seed+args.rank+1} + train = data_utils.tf_dl.TFRecordDataLoader(args.train_data, + **data_set_args) + data_set_args['train'] = False + if args.eval_seq_length is not None: + 
data_set_args['max_seq_len'] = args.eval_seq_length + if args.eval_max_preds_per_seq is not None: + data_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq + valid = None + if args.valid_data is not None: + valid = data_utils.tf_dl.TFRecordDataLoader(args.valid_data, + **data_set_args) + test = None + if args.test_data is not None: + test = data_utils.tf_dl.TFRecordDataLoader(args.test_data, + **data_set_args) + tokenizer = data_utils.make_tokenizer(args.tokenizer_type, + train, + args.tokenizer_path, + args.vocab_size, + args.tokenizer_model_type, + cache_dir=args.cache_dir) + + return (train, valid, test), tokenizer + + +def make_loaders(args): + """makes training/val/test""" + + if args.use_tfrecords: + return make_tfrecord_loaders(args) + batch_size = args.batch_size * args.world_size + eval_batch_size = batch_size + if args.eval_batch_size is not None: + eval_batch_size = args.eval_batch_size * args.world_size + seq_length = args.seq_length + if seq_length < 0: + seq_length = seq_length * args.world_size + eval_seq_length = args.eval_seq_length + if eval_seq_length is not None and eval_seq_length < 0: + eval_seq_length = eval_seq_length * args.world_size + split = get_split(args) + data_set_args = { + 'path': args.train_data, + 'seq_length': seq_length, + 'lazy': args.lazy_loader, + 'delim': args.delim, + 'text_key': args.text_key, + 'label_key': 'label', + 'non_binary_cols': None, + 'ds_type': args.data_set_type, + 'split': split, + 'loose': args.loose_json, + 'tokenizer_type': args.tokenizer_type, + 'tokenizer_model_path': args.tokenizer_path, + 'vocab_size': args.vocab_size, + 'model_type': args.tokenizer_model_type, + 'cache_dir': args.cache_dir, + 'max_preds_per_seq': args.max_preds_per_seq} + + eval_set_args = copy.copy(data_set_args) + eval_set_args['split'] = [1.] 
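As a brief aside, the sampler wiring that `make_data_loader` above performs can be sketched in isolation as follows. The toy dataset, rank, and world size are placeholders; note that `make_loaders` hands it the global batch size (`args.batch_size * args.world_size`), which `DistributedBatchSampler` then shards across ranks:

```python
# Minimal sketch of the make_data_loader wiring (placeholder rank/world_size).
import torch
from data_utils.samplers import DistributedBatchSampler

dataset = torch.utils.data.TensorDataset(torch.arange(100))
sampler = torch.utils.data.SequentialSampler(dataset)
# a global batch of 8, sharded so each of the 2 ranks sees its own portion
batch_sampler = DistributedBatchSampler(sampler, 8, True,  # drop_last=True
                                        0, 2)              # rank, world_size
loader = torch.utils.data.DataLoader(dataset,
                                     batch_sampler=batch_sampler,
                                     num_workers=2, pin_memory=True)
for batch in loader:
    pass  # each element is this rank's shard of the global batch
```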
+
+    # if optional eval args were set then replace their
+    # equivalent values in the arg dict
+    if eval_seq_length:
+        eval_set_args['seq_length'] = eval_seq_length
+    if args.eval_max_preds_per_seq:
+        eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
+    if args.eval_text_key is not None:
+        eval_set_args['text_key'] = args.eval_text_key
+
+    # make dataset splits and tokenizer
+    train = None
+    valid = None
+    test = None
+
+    if args.train_data is not None:
+        train, tokenizer = data_utils.make_dataset(**data_set_args)
+        if data_utils.should_split(split):
+            train, valid, test = train
+        eval_set_args['tokenizer'] = tokenizer
+
+    # make validation and test datasets if necessary
+    if valid is None and args.valid_data is not None:
+        eval_set_args['path'] = args.valid_data
+        valid, _ = data_utils.make_dataset(**eval_set_args)
+    if test is None and args.test_data is not None:
+        eval_set_args['path'] = args.test_data
+        test, _ = data_utils.make_dataset(**eval_set_args)
+
+    # wrap datasets with data loader
+    if train is not None and args.batch_size > 0:
+        train = make_data_loader(train, batch_size, args)
+    eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
+    if valid is not None:
+        valid = make_data_loader(valid, eval_batch_size, args)
+    if test is not None:
+        test = make_data_loader(test, eval_batch_size, args)
+
+    return (train, valid, test), tokenizer
+
+
+def get_split(args):
+    """
+    Get dataset splits from a comma-separated string list
+    """
+    splits = []
+    if args.split.find(',') != -1:
+        splits = [float(s) for s in args.split.split(',')]
+    elif args.split.find('/') != -1:
+        splits = [float(s) for s in args.split.split('/')]
+    else:
+        splits = [float(args.split)]
+    split_total = sum(splits)
+    if split_total < 1.:
+        splits.append(1-split_total)
+    while len(splits) < 3:
+        splits.append(0.)
+    splits = splits[:3]
+    if args.valid_data is not None:
+        splits[1] = 0.
+    if args.test_data is not None:
+        splits[2] = 0.
+    final_sum = sum(splits)
+    return [s/final_sum for s in splits]
+
+
+def configure_data():
+    """add cmdline flags for configuring datasets"""
+    # These are options that are used by data_utils, but are either
+    # deprecated or not meant to be exposed to the command line user.
+    # These options are intended to be set in code by specific scripts.
+    defaults = {
+        'world_size': 1,
+        'rank': -1,
+        'persist_state': 0,
+        'lazy': False,
+        'shuffle': False,
+        'transpose': False,
+        'data_set_type': 'supervised',
+        'seq_length': 256,
+        'eval_seq_length': 256,
+        'samples_per_shard': 100
+    }
+
+    return DataConfig(defaults=defaults)
diff --git a/data_utils/__init__.py b/data_utils/__init__.py
new file mode 100644
index 0000000..7a60f97
--- /dev/null
+++ b/data_utils/__init__.py
@@ -0,0 +1,115 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
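Stepping back to `configure_data.py` for a moment, a quick illustration of how `get_split` above normalizes the `--split` string may help; the printed values are rounded for readability:

```python
# get_split normalizes '--split' proportions into fractions that sum to 1.
from argparse import Namespace
from configure_data import get_split

args = Namespace(split='1000,1,1', valid_data=None, test_data=None)
print(get_split(args))  # -> approximately [0.998, 0.000998, 0.000998]

# Passing --valid-data zeroes the validation proportion and renormalizes,
# since validation then comes from its own files rather than the split.
args = Namespace(split='1000,1,1', valid_data=['val.json'], test_data=None)
print(get_split(args))  # -> approximately [0.999, 0.0, 0.000999]
```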
+"""utils for creating datasets""" +import os +import math + +from .samplers import DistributedBatchSampler +from .datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, bert_sentencepair_dataset +from .lazy_loader import exists_lazy, make_lazy, lazy_array_loader +from .tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, make_tokenizer +from . import corpora + +TRAIN_DATA = 0 +VAL_DATA = 1 +TEST_DATA = 2 + +def should_split(split): + """ + given split proportions checks if should split + Examples: + >>> should_split([10,0,0]) + False + >>> should_split([1,.1,.2]) + True + """ + return max(split)/sum(split) != 1. + +def get_ext(path): + """gets path extension""" + return os.path.splitext(path)[1] + +def get_dataset(path, **kwargs): + """gets dataset object based on keyword args and file at `path`""" + if supported_corpus(path): + return corpora.NAMED_CORPORA[path](**kwargs) + ext = get_ext(path) + if ext =='.json': + text = json_dataset(path, **kwargs) + elif ext in ['.csv', '.tsv']: + text = csv_dataset(path, **kwargs) + else: + raise NotImplementedError('data file type %s is not supported'%(ext)) + return text + +def supported_corpus(corpus_name): + """checks if corpus name is defined in `corpora.py`""" + return corpus_name in corpora.NAMED_CORPORA + +def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.], + delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None, + tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None, + model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None, **kwargs): + """function to create datasets+tokenizers for common options""" + if isinstance(process_fn, str): + process_fn = eval(process_fn) + if non_binary_cols is not None: + # multilabel dataset support (only for csvs) + label_key = non_binary_cols + def get_dataset_from_path(path_): + if lazy: + # get lazily loaded dataset + named_corpora = False + if supported_corpus(path_): + named_corpora = True + name = path_ + path_ = corpora.NAMED_CORPORA[path_].PATH + if not exists_lazy(path_, data_type='data'): + # create cached version of dataset for lazy loading if it doesn't exist + text = get_dataset(name if named_corpora else path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent, + delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose) + make_lazy(path_, text.X, data_type='data') + text = lazy_array_loader(path_, data_type='data', map_fn=process_fn) + else: + # get dataset + text = get_dataset(path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent, + delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose, preprocess_fn=process_fn) + return text + # get one or multiple datasets and concatenate + if isinstance(path, str): + path = [path] + datasets = [get_dataset_from_path(p) for p in path] + if len(datasets) == 1: + ds = datasets[0] + else: + ds = ConcatDataset(datasets) + # make tokenizer for dataset + if tokenizer is None: + tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path, vocab_size, model_type, + pad_token, character_converage, **kwargs) + + ds_type = '' + if 'ds_type' in kwargs: + ds_type = kwargs['ds_type'] + ds.SetTokenizer(tokenizer) + # Split dataset into train/val/test (and wrap bert dataset) + if should_split(split): + ds = split_ds(ds, split) + if ds_type.lower() == 'bert': + ds = [bert_sentencepair_dataset(d, 
max_seq_len=seq_length) for d in ds] + else: + if ds_type.lower() == 'bert': + ds = bert_sentencepair_dataset(ds, max_seq_len=seq_length) + return ds, tokenizer diff --git a/data_utils/corpora.py b/data_utils/corpora.py new file mode 100755 index 0000000..334f351 --- /dev/null +++ b/data_utils/corpora.py @@ -0,0 +1,37 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""several datasets with preset arguments""" +from .datasets import json_dataset, csv_dataset + +class wikipedia(json_dataset): + """ + dataset for wikipedia with arguments configured for convenience + + command line usage: `--train-data wikipedia` + """ + PATH = '' + assert_str = "make sure to set PATH at line 27 of data_utils/corpora.py" + def __init__(self, **kwargs): + assert wikipedia.PATH != '', \ + wikipedia.assert_str + if not kwargs: + kwargs = {} + kwargs['text_key'] = 'text' + kwargs['loose_json'] = True + super(wikipedia, self).__init__(wikipedia.PATH, **kwargs) + +NAMED_CORPORA = { + 'wikipedia': wikipedia, +} diff --git a/data_utils/datasets.py b/data_utils/datasets.py new file mode 100644 index 0000000..88c2a1c --- /dev/null +++ b/data_utils/datasets.py @@ -0,0 +1,676 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""dataset objects for jsons, csvs, and BERT datasets""" + +import os +import time +from operator import itemgetter +from bisect import bisect_right +import json +import csv +import math +import random + +from torch.utils import data +import pandas as pd +import numpy as np + +import nltk +nltk.download('punkt') +from nltk import tokenize + +from .lazy_loader import lazy_array_loader, exists_lazy, make_lazy +from .tokenization import Tokenization + +class ConcatDataset(data.Dataset): + """ + Dataset to concatenate multiple datasets. + Purpose: useful to assemble different existing datasets, possibly + large-scale datasets as the concatenation operation is done in an + on-the-fly manner. + Arguments: + datasets (sequence): List of datasets to be concatenated. 
+ """ + + @staticmethod + def cumsum(sequence): + r, s = [], 0 + for e in sequence: + l = len(e) + r.append(l + s) + s += l + return r + + def __init__(self, datasets, **kwargs): + super(ConcatDataset, self).__init__() + assert len(datasets) > 0, 'datasets should not be an empty iterable' + self.datasets = list(datasets) + self.cumulative_sizes = self.cumsum(self.datasets) + self._X = None + self._Y = None + + def SetTokenizer(self, tokenizer): + for ds in self.datasets: + ds.SetTokenizer(tokenizer) + + def GetTokenizer(self): + return self.datasets[0].GetTokenizer() + + def __len__(self): + return self.cumulative_sizes[-1] + + def __getitem__(self, idx): + dataset_idx = bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx][sample_idx] + + @property + def X(self): + if self._X is None: + self._X = [] + for data in self.datasets: + self._X.extend(data.X) + return self._X + + @property + def Y(self): + if self._Y is None: + self._Y = [] + for data in self.datasets: + self._Y.extend(list(data.Y)) + self._Y = np.array(self._Y) + return self._Y + + @property + def cummulative_sizes(self): + warnings.warn("cummulative_sizes attribute is renamed to " + "cumulative_sizes", DeprecationWarning, stacklevel=2) + return self.cumulative_sizes + +class SplitDataset(data.Dataset): + """ + Dataset wrapper to access a subset of another dataset. + Purpose: useful to index into existing datasets, possibly + large-scale datasets as the subindexing operation is done in an + on-the-fly manner. + Arguments: + ds (Dataset or array-like): List of datasets to be subindexed + split_inds (1D array-like): List of indices part of subset + """ + def __init__(self, ds, split_inds, **kwargs): + self.split_inds = list(split_inds) + self.wrapped_data = ds + self.is_lazy = isinstance(ds, lazy_array_loader) + if self.is_lazy: + self.lens = itemgetter(*self.split_inds)(list(self.wrapped_data.lens)) + self._X = None + self._Y = None + + def __len__(self): + return len(self.split_inds) + + def __getitem__(self, index): + return self.wrapped_data[self.split_inds[index]] + + def SetTokenizer(self, tokenizer): + self.wrapped_data.SetTokenizer(tokenizer) + + def GetTokenizer(self): + return self.wrapped_data.GetTokenizer() + + @property + def X(self): + if self._X is None: + self._X = itemgetter(*self.split_inds)(self.wrapped_data.X) + return self._X + + @property + def Y(self): + if self._Y is None: + self._Y = np.array(itemgetter(*self.split_inds)(self.wrapped_data.Y)) + return self._Y + + def __iter__(self): + for idx in self.split_inds: + yield self.wrapped_data[idx] + +def split_ds(ds, split=[.8,.2,.0], shuffle=True): + """ + Split a dataset into subsets given proportions of how + much to allocate per split. If a split is 0% returns None for that split. + Purpose: Useful for creating train/val/test splits + Arguments: + ds (Dataset or array-like): Data to be split. + split (1D array-like): proportions to split `ds`. `sum(splits) != 0` + shuffle (boolean): Randomly split dataset. 
Default: True + """ + split_sum = sum(split) + if split_sum == 0: + raise Exception('Split cannot sum to 0.') + split = np.array(split) + split /= split_sum + ds_len = len(ds) + inds = np.arange(ds_len) + if shuffle: + np.random.shuffle(inds) + start_idx = 0 + residual_idx = 0 + rtn_ds = [None]*len(split) + for i, f in enumerate(split): + if f != 0: + proportion = ds_len*split[i] + residual_idx += proportion % 1 + split_ = int(int(proportion) + residual_idx) + split_inds = inds[start_idx:start_idx+max(split_, 1)] + rtn_ds[i] = SplitDataset(ds, split_inds) + start_idx += split_ + residual_idx %= 1 + return rtn_ds + +class csv_dataset(data.Dataset): + """ + Class for loading datasets from csv files. + Purpose: Useful for loading data for unsupervised modeling or transfer tasks + Arguments: + path (str): Path to csv file with dataset. + tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None + preprocess_fn (callable): Callable that process a string into desired format. + delim (str): delimiter for csv. Default: ',' + binarize_sent (bool): binarize label values to 0 or 1 if they\'re on a different scale. Default: False + drop_unlabeled (bool): drop rows with unlabelled values. Always fills remaining empty + columns with -1 (regardless if rows are dropped based on value) Default: False + text_key (str): key to get text from csv. Default: 'sentence' + label_key (str): key to get label from json dictionary. Default: 'label' + Attributes: + X (list): all strings from the csv file + Y (np.ndarray): labels to train with + """ + def __init__(self, path, tokenizer=None, preprocess_fn=None, delim=',', + binarize_sent=False, drop_unlabeled=False, text_key='sentence', label_key='label', + **kwargs): + self.preprocess_fn = preprocess_fn + self.SetTokenizer(tokenizer) + self.path = path + self.delim = delim + self.text_key = text_key + self.label_key = label_key + self.drop_unlabeled = drop_unlabeled + + if '.tsv' in self.path: + self.delim = '\t' + + + self.X = [] + self.Y = [] + try: + cols = [text_key] + if isinstance(label_key, list): + cols += label_key + else: + cols += [label_key] + data = pd.read_csv(self.path, sep=self.delim, usecols=cols, encoding='latin-1') + except: + data = pd.read_csv(self.path, sep=self.delim, usecols=[text_key], encoding='latin-1') + + data = data.dropna(axis=0) + + self.X = data[text_key].values.tolist() + try: + self.Y = data[label_key].values + except Exception as e: + self.Y = np.ones(len(self.X))*-1 + + if binarize_sent: + self.Y = binarize_labels(self.Y, hard=binarize_sent) + + def SetTokenizer(self, tokenizer): + if tokenizer is None: + self.using_tokenizer = False + if not hasattr(self, '_tokenizer'): + self._tokenizer = tokenizer + else: + self.using_tokenizer = True + self._tokenizer = tokenizer + + def GetTokenizer(self): + return self._tokenizer + + @property + def tokenizer(self): + if self.using_tokenizer: + return self._tokenizer + return None + + def __len__(self): + return len(self.X) + + def __getitem__(self, index): + """process+tokenize string and return string,label,and stringlen""" + x = self.X[index] + if self.tokenizer is not None: + x = self.tokenizer.EncodeAsIds(x, self.preprocess_fn) + elif self.preprocess_fn is not None: + x = self.preprocess_fn(x) + y = self.Y[index] + if isinstance(y, str): + if self.tokenizer is not None: + y = self.tokenizer.EncodeAsIds(y, self.preprocess_fn) + elif self.preprocess_fn is not None: + y = self.preprocess_fn(y) + return {'text': x, 'length': len(x), 'label': y} + + def write(self, 
writer_gen=None, path=None, skip_header=False): + """ + given a generator of metrics for each of the data points X_i, + write the metrics, text, and labels to a csv file + """ + if path is None: + path = self.path+'.results' + print('generating csv at ' + path) + with open(path, 'w') as csvfile: + c = csv.writer(csvfile, delimiter=self.delim) + if writer_gen is not None: + #if first item of generator is a header of what the metrics mean then write header to csv file + if not skip_header: + header = (self.label_key,)+tuple(next(writer_gen))+(self.text_key,) + c.writerow(header) + for i, row in enumerate(writer_gen): + row = (self.Y[i],)+tuple(row)+(self.X[i],) + c.writerow(row) + else: + c.writerow([self.label_key, self.text_key]) + for row in zip(self.Y, self.X): + c.writerow(row) + +class json_dataset(data.Dataset): + """ + Class for loading datasets from a json dump. + Purpose: Useful for loading data for unsupervised modeling or transfer tasks + Arguments: + path (str): path to json file with dataset. + tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None + preprocess_fn (callable): callable function that process a string into desired format. + Takes string, maxlen=None, encode=None as arguments. Default: process_str + text_key (str): key to get text from json dictionary. Default: 'sentence' + label_key (str): key to get label from json dictionary. Default: 'label' + Attributes: + all_strs (list): list of all strings from the dataset + all_labels (list): list of all labels from the dataset (if they have it) + """ + def __init__(self, path, tokenizer=None, preprocess_fn=None, binarize_sent=False, + text_key='sentence', label_key='label', loose_json=False, **kwargs): + self.preprocess_fn = preprocess_fn + self.path = path + self.SetTokenizer(tokenizer) + self.X = [] + self.Y = [] + self.text_key = text_key + self.label_key = label_key + self.loose_json = loose_json + + for j in self.load_json_stream(self.path): + s = j[text_key] + self.X.append(s) + self.Y.append(j[label_key]) + + if binarize_sent: + self.Y = binarize_labels(self.Y, hard=binarize_sent) + + def SetTokenizer(self, tokenizer): + if tokenizer is None: + self.using_tokenizer = False + if not hasattr(self, '_tokenizer'): + self._tokenizer = tokenizer + else: + self.using_tokenizer = True + self._tokenizer = tokenizer + + def GetTokenizer(self): + return self._tokenizer + + @property + def tokenizer(self): + if self.using_tokenizer: + return self._tokenizer + return None + + def __getitem__(self, index): + """gets the index'th string from the dataset""" + x = self.X[index] + if self.tokenizer is not None: + x = self.tokenizer.EncodeAsIds(x, self.preprocess_fn) + elif self.preprocess_fn is not None: + x = self.preprocess_fn(x) + y = self.Y[index] + if isinstance(y, str): + if self.tokenizer is not None: + y = self.tokenizer.EncodeAsIds(y, self.preprocess_fn) + elif self.preprocess_fn is not None: + y = self.preprocess_fn(y) + return {'text': x, 'length': len(x), 'label': y} + + def __len__(self): + return len(self.X) + + def write(self, writer_gen=None, path=None, skip_header=False): + """ + given a generator of metrics for each of the data points X_i, + write the metrics, text, and labels to a json file + """ + if path is None: + path = self.path+'.results' + + jsons = [] + + if writer_gen is not None: + #if first item of generator is a header of what the metrics mean then write header to csv file + def gen_helper(): + keys = {} + keys[0] = self.label_key + if not skip_header: + for idx, k in 
enumerate(tuple(next(writer_gen))): + keys[idx+1] = k + for i, row in enumerate(writer_gen): + if i == 0 and skip_header: + for idx, _ in enumerate(row): + keys[idx+1] = 'metric_%d'%(idx,) + j = {} + for idx, v in enumerate((self.Y[i],)+tuple(row)): + k = keys[idx] + j[k] = v + yield j + else: + def gen_helper(): + for y in self.Y: + j = {} + j[self.label_key] = y + yield j + + def out_stream(): + for i, j in enumerate(gen_helper()): + j[self.text_key] = self.X[i] + yield j + + self.save_json_stream(path, out_stream()) + + def save_json_stream(self, save_path, json_stream): + if self.loose_json: + with open(save_path, 'w') as f: + for i, j in enumerate(json_stream): + write_string = '' + if i != 0: + write_string = '\n' + write_string += json.dumps(j) + f.write(write_string) + else: + jsons = [j for j in json_stream] + json.dump(jsons, open(save_path, 'w'), separators=(',', ':')) + + def load_json_stream(self, load_path): + if not self.loose_json: + jsons = json.load(open(load_path, 'r')) + generator = iter(jsons) + else: + def gen_helper(): + with open(load_path, 'r') as f: + for row in f: + yield json.loads(row) + generator = gen_helper() + + for j in generator: + if self.label_key not in j: + j[self.label_key] = -1 + yield j + +class bert_sentencepair_dataset(data.Dataset): + """ + Dataset containing sentencepairs for BERT training. Each index corresponds to a randomly generated sentence pair. + Arguments: + ds (Dataset or array-like): data corpus to use for training + max_seq_len (int): maximum sequence length to use for a sentence pair + mask_lm_prob (float): proportion of tokens to mask for masked LM + max_preds_per_seq (int): Maximum number of masked tokens per sentence pair. Default: math.ceil(max_seq_len*mask_lm_prob/10)*10 + short_seq_prob (float): Proportion of sentence pairs purposefully shorter than max_seq_len + dataset_size (int): number of random sentencepairs in the dataset. 
Default: len(ds)*(len(ds)-1) + + """ + def __init__(self, ds, max_seq_len=512, mask_lm_prob=.15, max_preds_per_seq=None, short_seq_prob=.01, dataset_size=None, **kwargs): + self.ds = ds + self.ds_len = len(self.ds) + self.tokenizer = self.ds.GetTokenizer() + self.vocab_words = list(self.tokenizer.text_token_vocab.values()) + self.ds.SetTokenizer(None) + self.max_seq_len = max_seq_len + self.mask_lm_prob = mask_lm_prob + if max_preds_per_seq is None: + max_preds_per_seq = math.ceil(max_seq_len*mask_lm_prob /10)*10 + self.max_preds_per_seq = max_preds_per_seq + self.short_seq_prob = short_seq_prob + self.dataset_size = dataset_size + if self.dataset_size is None: + self.dataset_size = self.ds_len * (self.ds_len-1) + + def __len__(self): + return self.dataset_size + + def __getitem__(self, idx): + # get rng state corresponding to index (allows deterministic random pair) + rng = random.Random(idx) + # get seq length + target_seq_length = self.max_seq_len + short_seq = False + if rng.random() < self.short_seq_prob: + target_seq_length = rng.randint(2, target_seq_length) + short_seq = True + # get sentence pair and label + is_random_next = None + lena = 0 + lenb = 0 + while (is_random_next is None) or (lena < 1) or (lenb < 1): + tokensa, tokensb, is_random_next = self.create_random_sentencepair(target_seq_length, rng) + lena = len(tokensa[0]) + lenb = len(tokensb[0]) + # truncate sentence pair to max_seq_len + tokensa, tokensb = self.truncate_seq_pair(tokensa, tokensb, self.max_seq_len, rng) + # join sentence pair, mask, and pad + tokens, mask, mask_labels, pad_mask = self.create_masked_lm_predictions(tokensa, tokensb, self.mask_lm_prob, self.max_preds_per_seq, self.vocab_words, rng) + sample = {'text': np.array(tokens[0]), 'types': np.array(tokens[1]), 'is_random': int(is_random_next), 'mask': np.array(mask), 'mask_labels': np.array(mask_labels), 'pad_mask': np.array(pad_mask)} + return sample + + def sentence_split(self, document): + """split document into sentences""" + return tokenize.sent_tokenize(document) + + def sentence_tokenize(self, sent, sentence_num=0, beginning=False, ending=False): + """tokenize sentence and get token types""" + tokens = self.tokenizer.EncodeAsIds(sent).tokenization + str_type = 'str' + str(sentence_num) + token_types = [self.tokenizer.get_type(str_type).Id]*len(tokens) + return tokens, token_types + + def get_doc(self, idx): + """gets text of document corresponding to idx""" + rtn = self.ds[idx] + if isinstance(rtn, dict): + rtn = rtn['text'] + return rtn + + def create_random_sentencepair(self, target_seq_length, rng): + """ + fetches a random sentencepair corresponding to rng state similar to + https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L248-L294 + """ + is_random_next = None + + curr_strs = [] + curr_str_types = [] + curr_len = 0 + + while curr_len < 1: + curr_len = 0 + doc_a = None + while doc_a is None: + doc_a_idx = rng.randint(0, self.ds_len-1) + doc_a = self.sentence_split(self.get_doc(doc_a_idx)) + if not doc_a: + doc_a = None + + random_start_a = rng.randint(0, len(doc_a)-1) + while random_start_a < len(doc_a): + sentence = doc_a[random_start_a] + sentence, sentence_types = self.sentence_tokenize(sentence, 0, random_start_a == 0, random_start_a == len(doc_a)) + curr_strs.append(sentence) + curr_str_types.append(sentence_types) + curr_len += len(sentence) + if random_start_a == len(doc_a) - 1 or curr_len >= target_seq_length: + break + random_start_a = (random_start_a+1) + + if curr_strs: + num_a = 1 + if 
len(curr_strs) >= 2: + num_a = rng.randint(0, len(curr_strs)) + + tokens_a = [] + token_types_a = [] + for j in range(num_a): + tokens_a.extend(curr_strs[j]) + token_types_a.extend(curr_str_types[j]) + + tokens_b = [] + token_types_b = [] + is_random_next = False + if len(curr_strs) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + b_len = 0 + while b_len < 1: + doc_b = None + while doc_b is None: + doc_b_idx = rng.randint(0, self.ds_len - 2) + doc_b_idx += int(doc_b_idx >= doc_a_idx) + + doc_b = self.sentence_split(self.get_doc(doc_b_idx)) + if not doc_b: + doc_b = None + + random_start_b = rng.randint(0, len(doc_b)-1) + while random_start_b < len(doc_b): + sentence_b = doc_b[random_start_b] + new_b_tokens, new_b_types = self.sentence_tokenize(sentence_b, 1, random_start_b == 0, random_start_b == len(doc_b)) + b_len += len(new_b_tokens) + tokens_b.extend(new_b_tokens) + token_types_b.extend(new_b_types) + if len(tokens_b) >= target_b_length: + break + random_start_b = (random_start_b+1) + else: + is_random_next = False + for j in range(num_a, len(curr_strs)): + tokens_b.extend(curr_strs[j]) + token_types_b.extend(curr_str_types[j]) + + return (tokens_a, token_types_a), (tokens_b, token_types_b), is_random_next + + def truncate_seq_pair(self, a, b, max_seq_len, rng): + """ + Truncate sequence pair according to original BERT implementation: + https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L391 + """ + tokens_a, token_types_a = a + tokens_b, token_types_b = b + max_num_tokens = max_seq_len - 3 + while True: + len_a = len(tokens_a) + len_b = len(tokens_b) + total_length = len_a + len_b + if total_length <= max_num_tokens: + break + if len(tokens_a) > len(tokens_b): + trunc_tokens = tokens_a + trunc_types = token_types_a + else: + trunc_tokens = tokens_b + trunc_types = token_types_b + + assert len(trunc_tokens) >= 1 + + if rng.random() < 0.5: + trunc_tokens.pop(0) + trunc_types.pop(0) + else: + trunc_tokens.pop() + trunc_types.pop() + return (tokens_a, token_types_a), (tokens_b, token_types_b) + + def mask_token(self, idx, tokens, types, vocab_words, rng): + """ + helper function to mask `idx` token from `tokens` according to + section 3.3.1 of https://arxiv.org/pdf/1810.04805.pdf + """ + label = tokens[idx] + if rng.random() < 0.8: + new_label = self.tokenizer.get_command('MASK').Id + else: + if rng.random() < 0.5: + new_label = label + else: + new_label = rng.choice(vocab_words) + + tokens[idx] = new_label + + return label + + def pad_seq(self, seq): + """helper function to pad sequence pair""" + num_pad = max(0, self.max_seq_len - len(seq)) + pad_mask = [0] * len(seq) + [1] * num_pad + seq += [self.tokenizer.get_command('pad').Id] * num_pad + return seq, pad_mask + + def create_masked_lm_predictions(self, a, b, mask_lm_prob, max_preds_per_seq, vocab_words, rng): + """ + Mask sequence pair for BERT training according to: + https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L338 + """ + tokens_a, token_types_a = a + tokens_b, token_types_b = b + tokens = [self.tokenizer.get_command('ENC').Id] + tokens_a + [self.tokenizer.get_command('sep').Id] + tokens_b + [self.tokenizer.get_command('sep').Id] + token_types = [token_types_a[0]] + token_types_a + [token_types_a[0]] + token_types_b + [token_types_b[0]] + + len_a = len(tokens_a) + len_b = len(tokens_b) + + cand_indices = [idx+1 for idx in range(len_a)] + [idx+2+len_a for idx in range(len_b)] + + rng.shuffle(cand_indices) + + 
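+        # Note on the index math above: `tokens` is laid out as
+        # [CLS] a_1 ... a_len_a [SEP] b_1 ... b_len_b [SEP], so cand_indices
+        # covers positions 1..len_a and len_a+2..len_a+len_b+1 -- every real
+        # token, skipping [CLS] and both [SEP]s. Shuffling and then taking
+        # the first num_to_predict indices below selects a uniform random
+        # subset of maskable positions.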
output_tokens, pad_mask = self.pad_seq(list(tokens)) + output_types, _ = self.pad_seq(list(token_types)) + + num_to_predict = min(max_preds_per_seq, max(1, int(round(len(tokens) * mask_lm_prob)))) + + mask = [0] * len(output_tokens) + mask_labels = [-1] * len(output_tokens) + + for idx in sorted(cand_indices[:num_to_predict]): + mask[idx] = 1 + label = self.mask_token(idx, output_tokens, output_types, vocab_words, rng) + mask_labels[idx] = label + + return (output_tokens, output_types), mask, mask_labels, pad_mask diff --git a/data_utils/file_utils.py b/data_utils/file_utils.py new file mode 100755 index 0000000..44ecc41 --- /dev/null +++ b/data_utils/file_utils.py @@ -0,0 +1,253 @@ +# This file is provided as is from: +# https://github.com/huggingface/pytorch-pretrained-BERT +# Please refer to their repository for copyright. + +""" +Utilities for working with the local dataset cache. +This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp +Copyright by the AllenNLP authors. +""" +from __future__ import (absolute_import, division, print_function, unicode_literals) + +import json +import logging +import os +import shutil +import tempfile +from functools import wraps +from hashlib import sha256 +import sys +from io import open + +import boto3 +import requests +from botocore.exceptions import ClientError +from tqdm import tqdm + +try: + from urllib.parse import urlparse +except ImportError: + from urlparse import urlparse + +try: + from pathlib import Path + PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', + Path.home() / '.pytorch_pretrained_bert')) +except (AttributeError, ImportError): + PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', + os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + + +def url_to_filename(url, etag=None): + """ + Convert `url` into a hashed filename in a repeatable way. + If `etag` is specified, append its hash to the url's, delimited + by a period. + """ + url_bytes = url.encode('utf-8') + url_hash = sha256(url_bytes) + filename = url_hash.hexdigest() + + if etag: + etag_bytes = etag.encode('utf-8') + etag_hash = sha256(etag_bytes) + filename += '.' + etag_hash.hexdigest() + + return filename + + +def filename_to_url(filename, cache_dir=None): + """ + Return the url and etag (which may be ``None``) stored for `filename`. + Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + cache_path = os.path.join(cache_dir, filename) + if not os.path.exists(cache_path): + raise EnvironmentError("file {} not found".format(cache_path)) + + meta_path = cache_path + '.json' + if not os.path.exists(meta_path): + raise EnvironmentError("file {} not found".format(meta_path)) + + with open(meta_path, encoding="utf-8") as meta_file: + metadata = json.load(meta_file) + url = metadata['url'] + etag = metadata['etag'] + + return url, etag + + +def cached_path(url_or_filename, cache_dir=None): + """ + Given something that might be a URL (or might be a local path), + determine which. If it's a URL, download the file and cache it, and + return the path to the cached file. If it's already a local path, + make sure the file exists and then return the path. 
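+
+    Example (illustrative):
+        url = 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt'
+        local_path = cached_path(url)
+        # -> <cache_dir>/<sha256(url)>.<sha256(etag)>, downloaded at most once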
+ """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): + url_or_filename = str(url_or_filename) + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + parsed = urlparse(url_or_filename) + + if parsed.scheme in ('http', 'https', 's3'): + # URL, so get it from the cache (downloading if necessary) + return get_from_cache(url_or_filename, cache_dir) + elif os.path.exists(url_or_filename): + # File, and it exists. + return url_or_filename + elif parsed.scheme == '': + # File, but it doesn't exist. + raise EnvironmentError("file {} not found".format(url_or_filename)) + else: + # Something unknown + raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) + + +def split_s3_path(url): + """Split a full s3 path into the bucket name and path.""" + parsed = urlparse(url) + if not parsed.netloc or not parsed.path: + raise ValueError("bad s3 path {}".format(url)) + bucket_name = parsed.netloc + s3_path = parsed.path + # Remove '/' at beginning of path. + if s3_path.startswith("/"): + s3_path = s3_path[1:] + return bucket_name, s3_path + + +def s3_request(func): + """ + Wrapper function for s3 requests in order to create more helpful error + messages. + """ + + @wraps(func) + def wrapper(url, *args, **kwargs): + try: + return func(url, *args, **kwargs) + except ClientError as exc: + if int(exc.response["Error"]["Code"]) == 404: + raise EnvironmentError("file {} not found".format(url)) + else: + raise + + return wrapper + + +@s3_request +def s3_etag(url): + """Check ETag on S3 object.""" + s3_resource = boto3.resource("s3") + bucket_name, s3_path = split_s3_path(url) + s3_object = s3_resource.Object(bucket_name, s3_path) + return s3_object.e_tag + + +@s3_request +def s3_get(url, temp_file): + """Pull a file directly from S3.""" + s3_resource = boto3.resource("s3") + bucket_name, s3_path = split_s3_path(url) + s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) + + +def http_get(url, temp_file): + req = requests.get(url, stream=True) + content_length = req.headers.get('Content-Length') + total = int(content_length) if content_length is not None else None + progress = tqdm(unit="B", total=total) + for chunk in req.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + + +def get_from_cache(url, cache_dir=None): + """ + Given a URL, look for the corresponding dataset in the local cache. + If it's not there, download it. Then return the path to the cached file. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + + # Get eTag to add to filename, if it exists. + if url.startswith("s3://"): + etag = s3_etag(url) + else: + response = requests.head(url, allow_redirects=True) + if response.status_code != 200: + raise IOError("HEAD request failed for url {} with status code {}" + .format(url, response.status_code)) + etag = response.headers.get("ETag") + + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + if not os.path.exists(cache_path): + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. 
+ with tempfile.NamedTemporaryFile() as temp_file: + logger.info("%s not found in cache, downloading to %s", url, temp_file.name) + + # GET file object + if url.startswith("s3://"): + s3_get(url, temp_file) + else: + http_get(url, temp_file) + + # we are copying the file before closing it, so flush to avoid truncation + temp_file.flush() + # shutil.copyfileobj() starts at the current position, so go to the start + temp_file.seek(0) + + logger.info("copying %s to cache at %s", temp_file.name, cache_path) + with open(cache_path, 'wb') as cache_file: + shutil.copyfileobj(temp_file, cache_file) + + logger.info("creating metadata file for %s", cache_path) + meta = {'url': url, 'etag': etag} + meta_path = cache_path + '.json' + with open(meta_path, 'w', encoding="utf-8") as meta_file: + json.dump(meta, meta_file) + + logger.info("removing temp file %s", temp_file.name) + + return cache_path + + +def read_set_from_file(filename): + ''' + Extract a de-duped collection (set) of text from a file. + Expected file format is one item per line. + ''' + collection = set() + with open(filename, 'r', encoding='utf-8') as file_: + for line in file_: + collection.add(line.rstrip()) + return collection + + +def get_file_extension(path, dot=True, lower=True): + ext = os.path.splitext(path)[1] + ext = ext if dot else ext[1:] + return ext.lower() if lower else ext diff --git a/data_utils/lazy_loader.py b/data_utils/lazy_loader.py new file mode 100644 index 0000000..3645e7c --- /dev/null +++ b/data_utils/lazy_loader.py @@ -0,0 +1,195 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""utils for loading text from disk""" +import os +import mmap +import pickle as pkl +import time +from itertools import accumulate +from threading import Lock + +import torch + +def get_lazy_path(path): + """ + Gets directory path where lazy files are stored. + """ + return os.path.splitext(path)[0]+'.lazy' + +def exists_lazy(path, data_type='data'): + """ + Check if we've already made a lazy version of this file for the `data_type` field. + """ + if not os.path.exists(get_lazy_path(path)): + return False + contents = os.listdir(get_lazy_path(path)) + if data_type not in contents: + return False + if data_type+'.len.pkl' not in contents: + return False + return True + +def make_lazy(path, strs, data_type='data'): + """ + Make lazy version of `data_type` field of the file. Byte offsets + corresponding to data indices are stored in a `.len.pkl` data file. 
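+
+    Example layout (illustrative), for path='corpus.json' and data_type='data':
+        corpus.lazy/data          -- all strings utf-8 encoded and concatenated
+        corpus.lazy/data.len.pkl  -- pickled list of per-string byte lengths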
+ """ + lazypath = get_lazy_path(path) + if not os.path.exists(lazypath): + os.makedirs(lazypath) + datapath = os.path.join(lazypath, data_type) + lenpath = os.path.join(lazypath, data_type+'.len.pkl') + if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: + with open(datapath, 'wb') as f: + str_lens = [] + str_cnt = 0 + for s in strs: + if isinstance(s, dict): + s = s['text'] + encoded = s.encode('utf-8') + f.write(encoded) + str_cnt = len(encoded) + str_lens.append(str_cnt) + pkl.dump(str_lens, open(lenpath, 'wb')) + else: + while not os.path.exists(lenpath): + time.sleep(1) + +def split_strings(strings, start, chr_lens): + """ + Split strings based on string lengths and given start. + """ + return [strings[i-start:j-start] for i, j in zip([start]+chr_lens[:-1], chr_lens)] + +class ProcessorTokenizer: + """ + callable class that runs a preprocessing, as well as tokenization step, + on input text. + """ + def __init__(self, tokenizer, process_fn=None): + self.tokenizer = tokenizer + self.process_fn = process_fn + + def __call__(self, string): + if self.tokenizer is not None: + string = self.tokenizer(string, process_fn=self.process_fn) + elif self.process_fn is not None: + string = self.process_fn(string) + return string + +class lazy_array_loader(object): + """ + Arguments: + path: path to directory where array entries are concatenated into one big string file + and the .len file are located + data_type (str): Some datsets have multiple fields that are stored in different paths. + `data_type` specifies which of these fields to load in this class + mem_map (boolean): Specifies whether to memory map file `path` + map_fn (callable): Fetched strings are passed through map_fn before being returned. + + Example of lazy loader directory structure: + file.json + file.lazy/ + data_type1 + data_type1.len.pkl + data_type2 + data_type2.len.pkl + """ + def __init__(self, path, data_type='data', mem_map=False, map_fn=None): + lazypath = get_lazy_path(path) + datapath = os.path.join(lazypath, data_type) + #get file where array entries are concatenated into one big string + self._file = open(datapath, 'rb') + self.file = self._file + #memory map file if necessary + self.mem_map = mem_map + if self.mem_map: + self.file = mmap.mmap(self.file.fileno(), 0, prot=mmap.PROT_READ) + lenpath = os.path.join(lazypath, data_type+'.len.pkl') + self.lens = pkl.load(open(lenpath, 'rb')) + self.ends = list(accumulate(self.lens)) + self.dumb_ends = list(self.ends) + self.read_lock = Lock() + self.process_fn = map_fn + self.map_fn = map_fn + self._tokenizer = None + + def SetTokenizer(self, tokenizer): + """ + logic to set and remove (set to None) tokenizer. + combines preprocessing/tokenization into one callable. 
+ """ + if tokenizer is None: + if not hasattr(self, '_tokenizer'): + self._tokenizer = tokenizer + else: + self._tokenizer = tokenizer + self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn) + + def GetTokenizer(self): + return self._tokenizer + + def __getitem__(self, index): + """ + read file and splice strings based on string ending array `self.ends` + """ + if not isinstance(index, slice): + if index == 0: + start = 0 + else: + start = self.ends[index-1] + end = self.ends[index] + rtn = self.file_read(start, end) + if self.map_fn is not None: + return self.map_fn(rtn) + else: + # if slice, fetch strings with 1 diskread and then splice in memory + chr_lens = self.ends[index] + if index.start == 0 or index.start is None: + start = 0 + else: + start = self.ends[index.start-1] + stop = chr_lens[-1] + strings = self.file_read(start, stop) + rtn = split_strings(strings, start, chr_lens) + if self.map_fn is not None: + return self.map_fn([s for s in rtn]) + return rtn + + def __len__(self): + return len(self.ends) + + def file_read(self, start=0, end=None): + """read specified portion of file""" + + # atomic reads to avoid race conditions with multiprocess dataloader + self.read_lock.acquire() + # seek to start of file read + self.file.seek(start) + # read to end of file if no end point provided + if end is None: + rtn = self.file.read() + #else read amount needed to reach end point + else: + rtn = self.file.read(end-start) + self.read_lock.release() + #TODO: @raulp figure out mem map byte string bug + #if mem map'd need to decode byte string to string + rtn = rtn.decode('utf-8') + # rtn = str(rtn) + if self.mem_map: + rtn = rtn.decode('unicode_escape') + return rtn + diff --git a/data_utils/samplers.py b/data_utils/samplers.py new file mode 100644 index 0000000..4e08690 --- /dev/null +++ b/data_utils/samplers.py @@ -0,0 +1,87 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""batch samplers that work with either random or sequential data samplers""" +import math +import os +import sys + +import torch +from torch.utils import data +import numpy as np + +class DistributedBatchSampler(data.sampler.BatchSampler): + """ + similar to normal implementation of distributed sampler, except implementation is at the + batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary + data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler. 
+ """ + def __init__(self, sampler, batch_size, drop_last, rank=-1, world_size=2, wrap_last=False): + super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last) + if rank == -1: + rank = torch.distributed.get_rank() + self.rank = rank + self.world_size = world_size + self.sampler.wrap_around = 0 + self.wrap_around = 0 + self.wrap_last = wrap_last + self.start_iter = 0 + + def __iter__(self): + batch = [] + last_batch = None + i = 0 + for idx in self.data_iterator(self.sampler, wrap_around=False): + batch.append(idx) + if len(batch) == self.batch_size: + tbatch = self._batch(batch) + if i >= self.start_iter: + yield tbatch + self.start_iter = 0 + i += 1 + last_batch = np.array(list(tbatch)) + batch = [] + batch_len = len(batch) + if batch_len > 0 and not self.drop_last: + if self.wrap_last: + self.sampler.wrap_around -= (self.batch_size) + self.wrap_around += (len(batch)) + self.wrap_around %= self.batch_size + if isinstance(self.sampler, TransposedSampler): + for i, idx in enumerate(self.data_iterator(self.sampler, wrap_around=True)): + if i == 0: + continue + batch.append(idx) + new_batch_len = len(batch) + if len(batch) == self.batch_size: + break + yield self._batch(batch) + if self.wrap_last: + self.sampler.wrap_around += self.batch_size + + def data_iterator(self, _iter, wrap_around=False): + """iterates through data and handles wrap around""" + for i, idx in enumerate(_iter): + if i < self.wrap_around%self.batch_size: + continue + if wrap_around: + self.wrap_around += 1 + self.wrap_around %= self.batch_size + yield idx + + def _batch(self, batch): + """extracts samples only pertaining to this worker's batch""" + start = self.rank*self.batch_size//self.world_size + end = (self.rank+1)*self.batch_size//self.world_size + return batch[start:end] \ No newline at end of file diff --git a/data_utils/tf_dl.py b/data_utils/tf_dl.py new file mode 100755 index 0000000..a29376f --- /dev/null +++ b/data_utils/tf_dl.py @@ -0,0 +1,87 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch DataLoader for TFRecords""" + +import tensorflow as tf +tf.enable_eager_execution() +import torch + +class TFRecordDataLoader(object): + def __init__(self, records, batch_size, max_seq_len, max_preds_per_seq, train, num_workers=2, seed=1): + assert max_preds_per_seq is not None, "--max-preds-per-seq MUST BE SPECIFIED when using tfrecords" + tf.set_random_seed(seed) + if isinstance(records, str): + records = [records] + + self.record_converter = Record2Example({"input_ids": tf.FixedLenFeature([max_seq_len], tf.int64), + "input_mask": tf.FixedLenFeature([max_seq_len], tf.int64), + "segment_ids": tf.FixedLenFeature([max_seq_len], tf.int64), + "masked_lm_positions": tf.FixedLenFeature([max_preds_per_seq], tf.int64), + "masked_lm_ids": tf.FixedLenFeature([max_preds_per_seq], tf.int64), + "masked_lm_weights": tf.FixedLenFeature([max_preds_per_seq], tf.float32), + "next_sentence_labels": tf.FixedLenFeature([1], tf.int64)}) + + #Instantiate dataset according to original BERT implementation + if train: + self.dataset = tf.data.Dataset.from_tensor_slices(tf.constant(records)) + self.dataset = self.dataset.repeat() + self.dataset = self.dataset.shuffle(buffer_size=len(records)) + + # use sloppy tfrecord dataset + self.dataset = self.dataset.apply( + tf.contrib.data.parallel_interleave( + tf.data.TFRecordDataset, + sloppy=train, + cycle_length=min(num_workers, len(records)))) + self.dataset = self.dataset.shuffle(buffer_size=100) + else: + self.dataset = tf.data.TFRecordDataset(records) + self.dataset = self.dataset.repeat() + + # Instantiate dataloader (do not drop remainder for eval) + loader_args = {'batch_size': batch_size, + 'num_parallel_batches': num_workers, + 'drop_remainder': train} + self.dataloader = self.dataset.apply(tf.contrib.data.map_and_batch(self.record_converter, **loader_args)) + + def __iter__(self): + data_iter = iter(self.dataloader) + for item in data_iter: + yield convert_tf_example_to_torch_tensors(item) + +class Record2Example(object): + def __init__(self, feature_map): + self.feature_map = feature_map + + def __call__(self, record): + """Decodes a BERT TF record to a TF example.""" + example = tf.parse_single_example(record, self.feature_map) + for k, v in list(example.items()): + if v.dtype == tf.int64: + example[k] = tf.to_int32(v) + return example + +def convert_tf_example_to_torch_tensors(example): + item = {k: torch.from_numpy(v.numpy()) for k,v in example.items()} + mask = torch.zeros_like(item['input_ids']) + mask_labels = torch.ones_like(item['input_ids'])*-1 + for b, row in enumerate(item['masked_lm_positions'].long()): + for i, idx in enumerate(row): + if item['masked_lm_weights'][b, i] != 0: + mask[b, idx] = 1 + mask_labels[b, idx] = item['masked_lm_ids'][b, i] + return {'text': item['input_ids'], 'types': item['segment_ids'],'is_random': item['next_sentence_labels'], + 'pad_mask': 1-item['input_mask'], 'mask': mask, 'mask_labels': mask_labels} + diff --git a/data_utils/tokenization.py b/data_utils/tokenization.py new file mode 100755 index 0000000..87f7f9c --- /dev/null +++ b/data_utils/tokenization.py @@ -0,0 +1,788 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities for using and training tokenizers (char, wordpiece, sentencepiece)""" +from collections import namedtuple +import random +import os +import csv + +import nltk +nltk.download('punkt') +from nltk import tokenize as nltk_tokenize +import sentencepiece as spm + +from .wordpiece import BertTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP + +def make_tokenizer(tokenizer_type, corpus, model_path=None, vocab_size=None, model_type='bpe', pad_token=0, character_coverage=1.0, command_tokens=None, type_tokens=None, **kwargs): + """ + Helper function to instantiate a tokenizer given common combinations of options. + """ + tokenizer_class = tokenizer_type + if isinstance(tokenizer_class, str): + tokenizer_class = eval(tokenizer_class) + if tokenizer_class is BertWordPieceTokenizer: + return BertWordPieceTokenizer(model_type, **kwargs) + text_tokenizer = tokenizer_class(corpus=corpus, vocab_size=vocab_size, model_path=model_path, model_type=model_type, + pad_token=pad_token, character_coverage=character_coverage) + return Tokenizer(text_tokenizer, command_tokens, type_tokens) + +class Tokenization(object): + """ + Tokenization object to hold tokenization, (processed text),and original + text. Can hold tokenization as Ids or tokens. + + It also holds command tokens (pad, unk, etc.) for the tokenization. + This allows functions to pad/operate on tokenizations without having + access to the full tokenizer, just the tokenization. + + Several standard array operations are implemented (insert, append, extend). 
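+
+    Example (illustrative):
+        tok = tokenizer.EncodeAsIds('hello world')  # a Tokenization of Ids
+        tok.append(tokenizer.get_command('eos'))    # append the <eos> Id
+        ids = tok.tokenization                      # plain list of Ids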
+ """ + def __init__(self, tokenization, text=None, original_text=None, command_tokens=None, asIds=True): + self.tokenization = tokenization + self.text = text + if self.text is None: + self.text = self.tokenization + self.original_text = original_text + if self.original_text is None: + self.original_text = self.text + self.command_tokens = command_tokens + self.asIds = asIds + self.parse_command_tokens() + + def set_command_tokens(self, command_tokens): + self.command_tokens = command_tokens + return self.parse_command_tokens() + + def parse_command_tokens(self): + if self.command_tokens is None: + return + for command_token in self.command_tokens: + if self.asIds: + setattr(self, command_token.name, command_token.Id) + else: + setattr(self, command_token.name, command_token.token) + + def __getitem__(self, index): + return self.tokenization[index] + + def __len__(self): + return len(self.tokenization) + + def insert(self, idx, other): + if isinstance(other, (CommandToken, TypeToken)): + self.tokenization.insert(idx, other.Id) + if idx == 0: + self.text.insert(0, other.token) + self.original_text.insert(0, other.token) + elif idx == len(self.tokenization)-1: + self.text.insert(-1, other.token) + self.original_text.insert(-1, other.token) + elif isinstance(other, Tokenization): + self.tokenization = self.tokenization[:idx] + other.tokenization + self.tokenization[idx:] + else: + self.tokenization = self.tokenization[:idx] + other.tokenization + self.tokenization[idx:] + + def append(self, other): + if isinstance(other, (CommandToken, TypeToken)): + self.tokenization.append(other.Id) + self.text.append(other.token) + self.original_text.append(other.token) + elif isinstance(other, Tokenization): + self.tokenization.extend(other.tokenization) + self.text += other.text + self.original_text += other.original_text + else: + self.tokenization.append(other) + return self + + def extend(self, other): + if isinstance(other, (CommandToken, TypeToken)): + self.tokenization.append(other.Id) + self.text.append(other.token) + self.original_text.append(other.token) + elif isinstance(other, list) and isinstance(other[0], (CommandToken, TypeToken)): + self.tokenization.extend([o.Id for o in other]) + self.text += [o.token for o in other] + self.original_text += [o.token for o in other] + elif isinstance(other, Tokenization): + self.tokenization.extend(other.tokenization) + self.text += other.text + self.original_text += other.original_text + else: + self.tokenization.extend(other) + return self + +"""define some default command tokens for the tokenizer to use""" +token_format = "<{0}>" + +COMMAND_TUPLE = namedtuple('CommandToken', ('name', 'token', 'Id')) + +def prep_command_tokens(tokenlist, token_format=token_format): + return [CommandToken(tok[0], token_format.format(tok[0]), tok[1]) for tok in tokenlist] + +class CommandToken(object): + def __init__(self, name, token, Id): + self.name = name + self.token = token + self.Id = Id + + def __str__(self): + return str(COMMAND_TUPLE(self.name, self.token, self.Id)) + +DEFAULT_COMMAND_TOKENS = [ + ('pad', 0), + ('eos', 1), + ('bos', 2), + ('unk', 3), + ('sep', 4), + ('L2R', 5), + ('ENC', 6), + ('MASK', 7), +] +DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS) + +"""define some default type tokens for bert training""" + +TYPE_TUPLE = namedtuple('TypeToken', ('name', 'token', 'Id')) + +def prep_type_tokens(tokenlist, token_format=token_format): + return [TypeToken(tok[0], token_format.format(tok[0]), tok[1]) for tok in tokenlist] + +class 
TypeToken(object): + def __init__(self, name, token, Id): + self.name = name + self.token = token + self.Id = Id + + def __str__(self): + return str(TYPE_TUPLE(self.name, self.token, self.Id)) + +DEFAULT_TYPE_TOKENS = [ + ('function', 0), + ('command', 1), + ('str0', 2), + ('str1', 3), + ('str2', 4), + ('embedding0', 5), + ('embedding1', 6), + ('embedding2', 7), + ('arg0', 8), + ('arg1', 9), + ('arg2', 10), +] +DEFAULT_TYPE_TOKENS = prep_type_tokens(DEFAULT_TYPE_TOKENS) + +class Tokenizer(object): + """ + Tokenizer object that handles text tokenization, command tokens, and type tokens. + + Command tokens and text tokens are stored together in one mapping of size + `len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as first + `len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`. + + Token types are stored in a separate mapping of size `len(type_tokens)`. + """ + def __init__(self, text_tokenizer, command_tokens=None, type_tokens=None): + # set text tokenizer + self.text_tokenizer = text_tokenizer + if not hasattr(self, 'num_text_tokens'): + self.num_text_tokens = len(self.text_tokenizer) + + # set command tokens + if command_tokens is None: + command_tokens = DEFAULT_COMMAND_TOKENS + self._command_tokens = command_tokens + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = {tok.token: tok for tok in self._command_tokens} + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + if not hasattr(self, 'num_command_tokens'): + self.num_command_tokens = len(self._command_tokens) + if not hasattr(self, 'num_tokens'): + self.num_tokens = self.num_command_tokens + self.num_text_tokens + + # set type tokens + if type_tokens is None: + type_tokens = DEFAULT_TYPE_TOKENS + self.type_tokens = type_tokens + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + if not hasattr(self, 'num_type_tokens'): + self.num_type_tokens = len(self.type_tokens) + + # parse tokens and vocabs from tokenizer + self._tokens = list(self.command_token_map.keys()) + list(self.text_tokenizer.tokens) + self._vocab = {t:Id for Id,t in self.command_id_map.items()} + self._vocab.update({t:Id+self.num_command_tokens for t,Id in self.text_tokenizer.vocab.items()}) + + self._text_tokens = list(self.text_tokenizer.tokens) + self._text_token_vocab = {t:Id+self.num_command_tokens for t,Id in self.text_tokenizer.vocab.items()} + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = {t:Id for Id,t in self.command_id_map.items()} + + self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t:Id for Id, t in self.type_id_map.items()} + + + def __call__(self, text, process_fn=None): + """run preprocessing and encode text as Ids""" + return self.EncodeAsIds(text, process_fn=process_fn) + + def __len__(self): + """total number of tokens""" + return self.num_tokens + + def get_command(self, name): + """get command token corresponding to `name`""" + return self.command_name_map[name] + + def get_type(self, name): + """get type token corresponding to `name`""" + return self.type_name_map[name] + + @property + def tokens(self): + """list (or iterable) of all tokens for tokenizer""" + return self._tokens + + @property + def vocab(self): + """dictionary mapping tokens to ids for tokenizer""" + return self._vocab + + @property + def 
token_types(self):
+        """list (or iterable) of all token types for tokenizer"""
+        return self._token_types
+
+    @property
+    def token_type_vocab(self):
+        """dictionary mapping token types to ids for tokenizer"""
+        return self._token_type_vocab
+
+    @property
+    def command_tokens(self):
+        """list (or iterable) of all command tokens for tokenizer"""
+        return self._command_token_tokens
+
+    @property
+    def command_token_vocab(self):
+        """dictionary mapping command tokens to ids for tokenizer"""
+        return self._command_token_vocab
+
+    @property
+    def text_tokens(self):
+        """list (or iterable) of text tokens for text tokenizer"""
+        return self._text_tokens
+
+    @property
+    def text_token_vocab(self):
+        """dictionary mapping text tokens to ids for text tokenizer"""
+        return self._text_token_vocab
+
+    def EncodeAsIds(self, text, process_fn=None):
+        """
+        encode text using text tokenizer and shift Id values for command tokens
+        """
+        tokenization = self.text_tokenizer.EncodeAsIds(text, process_fn=process_fn)
+        tokenization.tokenization = [t+self.num_command_tokens for t in tokenization.tokenization]
+        tokenization.set_command_tokens(self._command_tokens)
+        return tokenization
+
+    def EncodeAsTokens(self, text, process_fn=None):
+        """
+        encode text as tokens using text tokenizer
+        """
+        tokenization = self.text_tokenizer.EncodeAsTokens(text, process_fn=process_fn)
+        tokenization.set_command_tokens(self._command_tokens)
+        return tokenization
+
+    def IdToToken(self, Id, type_token=False):
+        """convert Id to token accounting for command and type tokens"""
+        if isinstance(Id, (TypeToken, CommandToken)):
+            return Id.token
+        if type_token:
+            return self.type_id_map[Id].token
+        if Id < self.num_command_tokens:
+            return self.command_id_map[Id].token
+        return self.text_tokenizer.IdToToken(Id-self.num_command_tokens)
+
+    def TokenToId(self, token, type_token=False):
+        """convert token to Id accounting for command and type tokens"""
+        if isinstance(token, (TypeToken, CommandToken)):
+            return token.Id
+        if type_token:
+            return self.type_token_map[token].Id
+        if token in self.command_token_map:
+            return self.command_token_map[token].Id
+        return self.text_tokenizer.TokenToId(token)+self.num_command_tokens
+
+    def DecodeIds(self, Ids, type_token=False):
+        """
+        convert Ids to tokens accounting for command and type tokens, tokens
+        are joined and returned as a string.
+        """
+        if type_token:
+            return ' '.join(Id.token if isinstance(Id, TypeToken) else self.type_id_map[Id].token for Id in Ids)
+        rtn_strs = []
+        current_str = []
+        if isinstance(Ids, Tokenization):
+            Ids = Ids.tokenization
+        for Id in Ids:
+            if isinstance(Id, CommandToken):
+                rtn_strs.append(self.text_tokenizer.DecodeIds(current_str))
+                current_str = []
+                rtn_strs.append(Id.token)
+            elif Id < self.num_command_tokens:
+                rtn_strs.append(self.text_tokenizer.DecodeIds(current_str))
+                current_str = []
+                rtn_strs.append(self.command_id_map[Id].token)
+            else:
+                current_str.append(Id - self.num_command_tokens)
+        if current_str != []:
+            rtn_strs.append(self.text_tokenizer.DecodeIds(current_str))
+        return ' '.join(rtn_strs)
+
+    def DecodeTokens(self, Tokens, type_token=False):
+        """
+        convert tokens to a string accounting for command and type tokens.
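+
+        Example (illustrative):
+            toks = tokenizer.EncodeAsTokens('hello world')
+            tokenizer.DecodeTokens(toks)  # -> 'hello world'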
+ """ + if type_token: + return ' '.join(t.token if isinstance(t, TypeToken) else t for t in Tokens) + rtn_strs = [] + current_str = [] + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + for t in Tokens: + if isinstance(t, CommandToken): + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + current_str = [] + rtn_strs.append(t.token) + elif t in self.command_token_map: + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + current_str = [] + rtn_strs.append(t) + else: + current_str.append(t) + if current_str != []: + rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str)) + return ' '.join(rtn_strs) + +class TextTokenizer(object): + """ + Interface for text tokenizer + """ + def __init__(self): + if not hasattr(self, 'num_text_tokens'): + self.num_text_tokens = 0 + if not hasattr(self, 'num_tokens'): + self.num_tokens = self.num_text_tokens + + def __call__(self, text, process_fn=None): + return self.EncodeAsIds(text, process_fn) + + def __len__(self): + return self.num_text_tokens + + @property + def tokens(self): + """list (or iterable) of text tokens for text tokenizer""" + raise NotImplementedError('TextTokenizer tokens property not implemented') + + @property + def vocab(self): + """dictionary mapping tokens to ids""" + raise NotImplementedError('TextTokenizer vocab property not implemented') + + @staticmethod + def exists(model_path): + """check if the filepath for a text tokenizer exists""" + raise NotImplementedError('TextTokenizer exists method not implemented') + + def Train(self, corpus): + """train a tokenizer on a data corpus and save model for future use""" + raise NotImplementedError('TextTokenizer Train not implemented') + + def EncodeAsIds(self, text, process_fn=None): + """ + Preprocess text and encode as ids. Return a tokenization object with + original text, processed text, and id tokenization. + """ + raise NotImplementedError('TextTokenizer EncodeAsIds not implemented') + + def EncodeAsTokens(self, text, process_fn=None): + """ + Preprocess text and encode as tokens. Return a tokenization object with + original text, processed text, and token tokenization. + """ + raise NotImplementedError('TextTokenizer EncodeAsTokens not implemented') + + def IdToToken(self, Id): + """Convert an Id to Token. Reverse lookup of self.vocab""" + raise NotImplementedError('TextTokenizer IdToToken not implemented') + + def TokenToId(self, token): + """Convert a Token to Id. Lookup of self.vocab""" + raise NotImplementedError('TextTokenizer TokenToId not implemented') + + def DecodeIds(self, Ids): + """Convert a list or tokenization object of Ids to a text string""" + raise NotImplementedError('TextTokenizer DecodeIds not implemented') + + def DecodeTokens(self, Tokens): + """Convert a list or tokenization object of tokens to a text string""" + raise NotImplementedError('TextTokenizer DecodeTokens not implemented') + + +class CharacterLevelTokenizer(TextTokenizer): + """ + Text tokenizer for ASCII-256 Character Level Tokenization. 
+ """ + def __init__(self, **kwargs): + self.num_text_tokens = 256 + super(CharacterLevelTokenizer, self).__init__() + self._tokens = [self.IdToToken(Id) for Id in range(self.num_text_tokens)] + self._vocab = {t: i for i,t in enumerate(self._tokens)} + + def __len__(self): + return 256 + + @staticmethod + def exists(model_path): + return True + + def Train(self, corpus): + pass + + @property + def tokens(self): + return self._tokens + + @property + def vocab(self): + return self._vocab + + def EncodeAsIds(self, text, process_fn=None): + """convert text to ascii 256 Ids""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + processed_text = str(processed_text) + tokens = [self.TokenToId(c) for c in processed_text] + return Tokenization(tokens, processed_text, text) + + def EncodeAsTokens(self, text, process_fn=None): + """convert text to ascii 256 characters""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + processed_text = str(processed_text) + tokens = [c for c in processed_text] + return Tokenization(tokens, processed_text, text, asIds=False) + + def IdToToken(self, Id): + """ascii index to character""" + return chr(Id) + + def TokenToId(self, token): + """ascii character to index""" + return ord(token) + + def DecodeIds(self, Ids): + """converts ascii ids to tokens before joining them into text""" + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + return ''.join([self.IdToToken(tok) for tok in Ids]) + + def DecodeTokens(self, Tokens): + """just concatenates ascii tokens into text""" + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return ''.join(Tokens) + + +MAX_SENTENCEPIECE_SENTENCES = 100000000 + +def get_corpus_freq(dataset, filepath, filetype='tsv'): + """ + Take corpus, split it into sentences, and extract word frequencies. + Write frequencies to `filepath` as a tsv. Only write the first + MAX_SENTENCEPIECE_SENTENCES most common words to the file. 
+ """ + if filetype == 'tsv': + delimiter = '\t' + else: + delimiter = ',' + + print("compute corpus frequency\n", flush=True) + + total_sentence_count = 0 + maxlen = 0 + freqs = {} + for entry in dataset: + if isinstance(entry, dict): + entry = entry['text'] + lines = entry.strip().split('\n') + for line in lines: + sentences = nltk_tokenize.sent_tokenize(line) + total_sentence_count += len(sentences) + for sentence in sentences: + maxlen = max(len(line), maxlen) + for word in sentence.split(): + if word not in freqs: + freqs[word] = 0 + freqs[word] += 1 + + print("length of freqs before truncating " + str(len(freqs)), flush=True) + print("file path for freq " + str(filepath), flush=True) + + freqs_sorted = {} + counter=0 + for word, count in sorted(freqs.items(), key=lambda x: x[1], reverse=True): + if counter >= MAX_SENTENCEPIECE_SENTENCES: + break + counter+=1 + freqs_sorted[word] = count + + + print("length of freqs after trancating " + str(len(freqs_sorted)), flush=True) + + with open(filepath, 'w') as f: + writer = csv.writer(f, delimiter=delimiter) + for k, v in freqs_sorted.items(): + writer.writerow([str(k), str(v)]) + + return total_sentence_count, maxlen + +class SentencePieceTokenizer(TextTokenizer): + """Trains and uses sentencepiece for text tokenization""" + def __init__(self, model_type='bpe', vocab_size=None, corpus=None, model_path=None, character_coverage=1.0, **kwargs): + self.character_coverage = character_coverage + self.model_type = model_type.lower() + self.spm_model = model_path + self.num_text_tokens = vocab_size + make_train = not SentencePieceTokenizer.exists(self.spm_model) + if make_train: + assert corpus is not None and self.num_text_tokens is not None + self.Train(corpus, self.num_text_tokens) + self._tokens = [] + self._vocab = {} + self.load_spm_model() + super(SentencePieceTokenizer, self).__init__() + + def __len__(self): + return self.num_text_tokens + + @property + def tokens(self): + return self._tokens + + @property + def vocab(self): + return self._vocab + + @staticmethod + def exists(model_path): + if model_path is None: + return False + # check if path exists + dne = not os.path.exists(model_path) + # check if path.model exists + if dne and not model_path.endswith('.model'): + dne = not os.path.exists(model_path+'.model') + return not dne + + def load_spm_model(self): + """load sentencepiece model and parse vocab""" + if not os.path.exists(self.spm_model) and not self.spm_model.endswith('.model'): + self.spm_model = self.spm_model+'.model' + self.sp = spm.SentencePieceProcessor() + self.sp.Load(self.spm_model) + self.vocab_size = self.num_text_tokens = len(self.sp) + self._tokens = [self.IdToToken(t) for t in range(self.vocab_size)] + self._vocab = {t: i for i,t in enumerate(self._tokens)} + + def Train(self, corpus, num_text_tokens): + """train sentencepiece model on corpus using word frequencies""" + self.num_text_tokens = num_text_tokens + use_model_path = self.spm_model + random_hash = str(random.randint(0, 2147483647)) + if use_model_path is None: + use_model_path = random_hash + if use_model_path.endswith('.model'): + use_model_path = use_model_path[:use_model_path.rfind('.model')] + input_path = use_model_path+'.tsv.'+random_hash + line_count, maxlenline = get_corpus_freq(corpus, input_path) + line_count = min(line_count, MAX_SENTENCEPIECE_SENTENCES) + print('line count used as input_sentence_size ', line_count, flush=True) + print('training sentencepiece model', flush=True) + train_string = '--input={file_path} 
--model_prefix={model_prefix} --vocab_size={vocab_size}' \ + + ' --model_type={model_type} --character_coverage={character_coverage} ' \ + + '--input_sentence_size={input_sentence_size} ' \ + + '--input_format=tsv' + train_string = train_string.format(file_path=input_path, model_prefix=use_model_path, vocab_size=num_text_tokens, + model_type=self.model_type, character_coverage=self.character_coverage, + input_sentence_size=int(line_count)) #, #)#, + print("calling spm.SentencePieceTrainer.Train(%s)"%(train_string), flush=True) + spm.SentencePieceTrainer.Train(train_string) + os.remove(input_path) + self.spm_model = use_model_path+'.model' + print('sentencepiece model written to '+self.spm_model, flush=True) + + def EncodeAsIds(self, text, process_fn=None): + """convert text to sentencepiece Ids""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + tokens = self.sp.EncodeAsIds(processed_text) + return Tokenization(tokens, processed_text, text) + + def EncodeAsTokens(self, text, process_fn=None): + """convert text to sentencepiece tokens""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + tokens = self.sp.EncodeAsTokens(processed_text) + return Tokenization(tokens, processed_text, text, asIds=False) + + def IdToToken(self, Id): + """convert Id to sentencpiece token""" + return self.sp.IdToPiece(Id) + + def TokenToId(self, token): + """convert sentencpiece token to Id""" + return self.sp.PieceToId(token) + + def DecodeIds(self, Ids): + """converts ids to a text string""" + if isinstance(Ids, Tokenization): + Ids = Ids.tokenization + return self.sp.DecodeIds(Ids) + + def DecodeTokens(self, Tokens): + """converts sentencepiece tokens to a text string""" + if isinstance(Tokens, Tokenization): + Tokens = Tokens.tokenization + return self.sp.DecodeTokens(Tokens) + +class BertWordPieceTokenizer(Tokenizer): + """ + Loads a pretrained WordPiece tokenizer from `cache_dir` for tokenization + in BERT training. Default to bert-large-uncased tokenizer. 
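+
+    Example (illustrative):
+        tok = BertWordPieceTokenizer('bert-base-uncased', cache_dir='cache')
+        tok.EncodeAsIds('Hello world!').tokenization  # -> wordpiece Ids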
+ """ + def __init__(self, tokenizer_model_type=None, cache_dir=None, **kwargs): + # default to bert-large-uncased tokenizer + if tokenizer_model_type not in PRETRAINED_VOCAB_ARCHIVE_MAP: + tokenizer_model_type = 'bert-large-uncased' + print('loading BertWordPieceTokenizer (', tokenizer_model_type, ') from cache_dir ', cache_dir) + do_lower_case = not ('-cased' in tokenizer_model_type or 'chinese' in tokenizer_model_type) + self.text_tokenizer = BertTokenizer.from_pretrained(tokenizer_model_type, do_lower_case=do_lower_case, cache_dir=cache_dir) + print('loaded', tokenizer_model_type) + # disable max len warnings by increasing max len + self.text_tokenizer.max_len = int(1e12) + + # set command tokens from wordpiece tokenizer values + self.num_command_tokens = 5 + self.num_tokens = len(self.text_tokenizer.vocab) + self.num_text_tokens = self.num_tokens-5 + self.num_type_tokens = 2 + + self._command_tokens = [ + CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']), + CommandToken('ENC', '[CLS]', self.text_tokenizer.vocab['[CLS]']), + CommandToken('MASK', '[MASK]', self.text_tokenizer.vocab['[MASK]']), + CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']), + CommandToken('sep', '[SEP]', self.text_tokenizer.vocab['[SEP]']), + ] + self.command_name_map = {tok.name: tok for tok in self._command_tokens} + self.command_token_map = {tok.token: tok for tok in self._command_tokens} + self.command_id_map = {tok.Id: tok for tok in self._command_tokens} + + # set type tokens + self.type_tokens = [ + TypeToken('str0', '', 0), + TypeToken('str1', '', 1), + ] + self.type_name_map = {tok.name: tok for tok in self.type_tokens} + self.type_token_map = {tok.token: tok for tok in self.type_tokens} + self.type_id_map = {tok.Id: tok for tok in self.type_tokens} + + # parse tokens and vocabs from tokenizer + + self._tokens = list(self.text_tokenizer.vocab.keys()) + self._vocab = {k:v for k,v in self.text_tokenizer.vocab.items()} + + self._text_tokens = list(self._tokens) + self._text_token_vocab = {k:v for k,v in self.text_tokenizer.vocab.items()} + + self._command_token_tokens = list(self.command_token_map.keys()) + self._command_token_vocab = {t:Id for Id,t in self.command_id_map.items()} + + self._token_types = list(self.type_token_map.keys()) + self._token_type_vocab = {t:Id for Id, t in self.type_id_map.items()} + + def EncodeAsIds(self, text, process_fn=None): + """convert text to wordpiece Ids""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + tokens = self.text_tokenizer.tokenize(processed_text) + Ids = self.text_tokenizer.convert_tokens_to_ids(tokens) + return Tokenization(Ids, processed_text, text) + + def EncodeAsTokens(self, text, process_fn=None): + """convert wordpiece token to Id""" + processed_text = text + if process_fn is not None: + processed_text = process_fn(processed_text) + tokens = self.text_tokenizer.tokenize(processed_text) + return Tokenization(tokens, processed_text, text, asIds=False) + + def IdToToken(self, Id, type_token=False): + """convert Id to sentencpiece token""" + if isinstance(Id, (TypeToken, CommandToken)): + return Id.token + if type_token: + return self.type_id_map[Id].token + return self.text_tokenizer.ids_to_tokens[Id] + + def TokenToId(self, token, type_token=False): + """convert sentencpiece token to Id""" + if isinstance(token, (TypeToken, CommandToken)): + return token.Id + if type_token: + return self.type_token_map[token].Id + return self.text_tokenizer.vocab[token] + + def 
DecodeIds(self, Ids, type_token=False):
+        """converts ids to wordpiece tokens and joins them as a text string"""
+        if type_token:
+            return ' '.join(Id.token if isinstance(Id, TypeToken) else self.type_id_map[Id].token for Id in Ids)
+        if isinstance(Ids, Tokenization):
+            Ids = Ids.tokenization
+        Tokens = []
+        for Id in Ids:
+            Tokens.append(self.text_tokenizer.ids_to_tokens[Id] if Id != -1 else '-1')
+        return ' '.join(Tokens)
+
+    def DecodeTokens(self, Tokens, type_token=False):
+        """converts wordpiece tokens to a text string"""
+        if type_token:
+            return ' '.join(t.token if isinstance(t, TypeToken) else t for t in Tokens)
+        if isinstance(Tokens, Tokenization):
+            Tokens = Tokens.tokenization
+        return ' '.join(Tokens)
diff --git a/data_utils/wordpiece.py b/data_utils/wordpiece.py
new file mode 100755
index 0000000..81121e4
--- /dev/null
+++ b/data_utils/wordpiece.py
@@ -0,0 +1,390 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes. Provided as is from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py"""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import collections
+import logging
+import os
+import unicodedata
+from io import open
+
+from .file_utils import cached_path
+
+logger = logging.getLogger(__name__)
+
+PRETRAINED_VOCAB_ARCHIVE_MAP = {
+    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
+    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
+    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
+    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
+    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
+    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
+}
+PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
+    'bert-base-uncased': 512,
+    'bert-large-uncased': 512,
+    'bert-base-cased': 512,
+    'bert-large-cased': 512,
+    'bert-base-multilingual-uncased': 512,
+    'bert-base-multilingual-cased': 512,
+    'bert-base-chinese': 512,
+}
+VOCAB_NAME = 'vocab.txt'
+
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    index = 0
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        while True:
+            token = reader.readline()
+            if not token:
+                break
+            token = token.strip()
+            vocab[token] = index
+            index += 1
+    return vocab
+
+
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and
splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertTokenizer(object): + """Runs end-to-end tokenization: punctuation splitting + wordpiece""" + + def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True, + never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): + """Constructs a BertTokenizer. + + Args: + vocab_file: Path to a one-wordpiece-per-line vocabulary file + do_lower_case: Whether to lower case the input + Only has an effect when do_wordpiece_only=False + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + never_split: List of tokens which will never be split during tokenization. + Only has an effect when do_wordpiece_only=False + """ + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, + never_split=never_split) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + self.max_len = max_len if max_len is not None else int(1e12) + + def tokenize(self, text): + if self.do_basic_tokenize: + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """Converts a sequence of tokens into ids using the vocab.""" + ids = [] + for token in tokens: + ids.append(self.vocab[token]) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this BERT model ({} > {}). Running this" + " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) + ) + return ids + + def convert_ids_to_tokens(self, ids): + """Converts a sequence of ids in wordpiece tokens using the vocab.""" + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + vocab_file = pretrained_model_name_or_path + if os.path.isdir(vocab_file): + vocab_file = os.path.join(vocab_file, VOCAB_NAME) + # redirect to the cache, if necessary + try: + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + vocab_file)) + return None + if resolved_vocab_file == vocab_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + else: + logger.info("loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) + return tokenizer + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, + do_lower_case=True, + never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
+        text = self._tokenize_chinese_chars(text)
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if self.do_lower_case and token not in self.never_split:
+                token = token.lower()
+                token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text):
+        """Splits punctuation on a piece of text."""
+        if text in self.never_split:
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like all of the other languages.
+        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+                (cp >= 0x3400 and cp <= 0x4DBF) or  #
+                (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+                (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+                (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+                (cp >= 0x2B820 and cp <= 0x2CEAF) or
+                (cp >= 0xF900 and cp <= 0xFAFF) or  #
+                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xfffd or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
+
+        This uses a greedy longest-match-first algorithm to perform tokenization
+        using the given vocabulary.
+
+        For example:
+            input = "unaffable"
+            output = ["un", "##aff", "##able"]
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through `BasicTokenizer`.
+
+        Returns:
+            A list of wordpiece tokens.
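+
+            (Also, per the implementation below: a token longer than
+            max_input_chars_per_word, or one that cannot be fully covered by
+            vocabulary pieces, is returned as the single unknown token, e.g.
+            ["[UNK]"].)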
+ """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/fp16/__init__.py b/fp16/__init__.py new file mode 100644 index 0000000..a2c68a1 --- /dev/null +++ b/fp16/__init__.py @@ -0,0 +1,30 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .fp16util import ( + BN_convert_float, + network_to_half, + prep_param_lists, + model_grads_to_master_grads, + master_params_to_model_params, + tofp16, + to_python_float, + clip_grad_norm, + convert_module, + convert_network, + FP16Model, +) + +from .fp16 import * +from .loss_scaler import * diff --git a/fp16/fp16.py b/fp16/fp16.py new file mode 100755 index 0000000..f292de0 --- /dev/null +++ b/fp16/fp16.py @@ -0,0 +1,627 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Stable version of apex FP16 Optimizer""" +import torch +from torch import nn +from torch.autograd import Variable +from torch.nn.parameter import Parameter +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + +from .loss_scaler import DynamicLossScaler, LossScaler +from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm + +FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) + +def conversion_helper(val, conversion): + """Apply conversion to val. Recursively apply conversion if `val` is a nested tuple/list structure.""" + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + +def fp32_to_fp16(val): + """Convert fp32 `val` to fp16""" + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, FLOAT_TYPES): + val = val.half() + return val + return conversion_helper(val, half_conversion) + +def fp16_to_fp32(val): + """Convert fp16 `val` to fp32""" + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, HALF_TYPES): + val = val.float() + return val + return conversion_helper(val, float_conversion) + +class FP16_Module(nn.Module): + def __init__(self, module): + super(FP16_Module, self).__init__() + self.add_module('module', module.half()) + + def forward(self, *inputs, **kwargs): + return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs)) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + return self.module.state_dict(destination, prefix, keep_vars) + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) + +# TODO: Update overflow check + downscale to use Carl's fused kernel. +class FP16_Optimizer(object): + """ + :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer, + and manage static or dynamic loss scaling and master weights in a manner transparent to the user. + For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance, + and changing the call to ``backward``. + + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + # Name the FP16_Optimizer instance to replace the existing optimizer + # (recommended but not required): + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + # loss.backward() becomes: + optimizer.backward(loss) + ... + + Example with dynamic loss scaling:: + + ... + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + # optional arg to control dynamic loss scaling behavior + # dynamic_loss_args={'scale_window' : 500}) + # Usually, dynamic_loss_args is not necessary. 
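+
+    A minimal end-to-end sketch (``loader`` and ``loss_fn`` below are hypothetical
+    stand-ins for the user's data pipeline and loss criterion)::
+
+        model = torch.nn.Linear(D_in, D_out).cuda().half()
+        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
+        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+        for inputs, targets in loader:
+            optimizer.zero_grad()
+            loss = loss_fn(model(inputs), targets)
+            optimizer.backward(loss)  # replaces loss.backward()
+            optimizer.step()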
+ + Args: + init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`. + static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate. + dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option. + dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used. + verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling. + + ``init_optimizer`` is expected to have been constructed in the ordinary way. + It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be + named to replace ``init_optimizer``, for two reasons: + First, it means that references to the same name + later in the file will not have to change. + Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to + modify ``init_optimizer``. If you do choose a unique name for the new + :class:`FP16_Optimizer` instance, you should only work with this new instance, + because the preexisting optimizer might no longer behave as expected. + + ``init_optimizer`` may be any Pytorch optimizer. + It may contain a mixture of fp16 and fp32 parameters organized into any number of + ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will + ingest these ``param_groups`` and remember them. + + Calls to :: + + loss.backward() + + must be replaced with :: + + optimizer.backward(loss) + + because :class:`FP16_Optimizer` requires ownership of the backward pass to implement + loss scaling and copies to master gradients. + + .. note:: + Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients + are downscaled before being applied. This means that adjusting the loss scale, or using + dynamic loss scaling, should not require retuning the learning rate or any other + hyperparameters. + + + **Advanced options** + + **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure. + See docstring for :attr:`step`. + + **Gradient clipping**: Use :attr:`clip_master_grads`. + + **Multiple losses**: If your model accumulates gradients from multiple losses, + this can be made more efficient by supplying ``update_master_grads=False`` + to :attr:`backward`. See docstring for :attr:`backward`. 
+
+    **Manually adjusting loss scale**: The current loss scale can be retrieved or set via ::
+
+        print(optimizer.loss_scale)
+        optimizer.loss_scale = new_loss_scale
+
+    For static loss scaling, manually adjusting the loss scale over time is a reasonable
+    thing to do. During later epochs, gradients may become smaller, and a
+    higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss
+    scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting
+    the loss scale is not recommended.
+
+    **Multi-GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in
+    Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer`
+    should still work as intended.
+    """
+
+    def __init__(self,
+                 init_optimizer,
+                 static_loss_scale=1.0,
+                 dynamic_loss_scale=False,
+                 dynamic_loss_args=None,
+                 verbose=False):
+        if not torch.cuda.is_available():
+            raise SystemError("Cannot use fp16 without CUDA.")
+
+        self.verbose = verbose
+
+        self.optimizer = init_optimizer
+        # init_state_dict sets up an alternative way to cast per-param state tensors.
+        # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
+        # init_state_dict = init_optimizer.state_dict()
+
+        self.fp16_groups = []
+        self.fp32_from_fp16_groups = []
+        self.fp32_from_fp32_groups = []
+        for i, param_group in enumerate(self.optimizer.param_groups):
+            self.maybe_print("FP16_Optimizer processing param group {}:".format(i))
+            fp16_params_this_group = []
+            fp32_params_this_group = []
+            fp32_from_fp16_params_this_group = []
+            for j, param in enumerate(param_group['params']):
+                if param.requires_grad:
+                    if param.type() == 'torch.cuda.HalfTensor':
+                        self.maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
+                                         .format(param.size()))
+                        fp16_params_this_group.append(param)
+                        master_param = param.detach().clone().float()
+                        master_param.requires_grad = True
+                        param_group['params'][j] = master_param
+                        fp32_from_fp16_params_this_group.append(master_param)
+                        # Reset existing state dict key to the new master param.
+                        # We still need to recast per-param state tensors, if any, to FP32.
+                        if param in self.optimizer.state:
+                            self.optimizer.state[master_param] = self.optimizer.state.pop(param)
+                    elif param.type() == 'torch.cuda.FloatTensor':
+                        self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
+                                         .format(param.size()))
+                        fp32_params_this_group.append(param)
+                        param_group['params'][j] = param
+                    else:
+                        raise TypeError("Wrapped parameters must be either "
+                                        "torch.cuda.FloatTensor or torch.cuda.HalfTensor. 
" + "Received {}".format(param.type())) + + self.fp16_groups.append(fp16_params_this_group) + self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) + self.fp32_from_fp32_groups.append(fp32_params_this_group) + + # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors + self.optimizer.load_state_dict(self.optimizer.state_dict()) + # alternative way to cast per-param state tensors: + # self.optimizer.load_state_dict(init_state_dict) + + if dynamic_loss_scale: + self.dynamic_loss_scale = True + if dynamic_loss_args is not None: + self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) + else: + self.loss_scaler = DynamicLossScaler() + else: + self.dynamic_loss_scale = False + self.loss_scaler = LossScaler(static_loss_scale) + + self.overflow = False + self.first_closure_call_this_step = True + + self.clip_grad_norm = clip_grad_norm + + def maybe_print(self, msg): + if self.verbose: + print(msg) + + def __getstate__(self): + raise RuntimeError("FP16_Optimizer should be serialized using state_dict().") + + def __setstate__(self, state): + raise RuntimeError("FP16_Optimizer should be deserialized using load_state_dict().") + + def zero_grad(self, set_grads_to_None=False): + """ + Zero fp32 and fp16 parameter grads. + """ + # In principle, only the .grad attributes of the model params need to be zeroed, + # because gradients are copied into the FP32 master params. However, we zero + # all gradients owned by the optimizer, just to be safe: + for group in self.optimizer.param_groups: + for p in group['params']: + if set_grads_to_None: + p.grad = None + else: + if p.grad is not None: + p.grad.detach_() + p.grad.zero_() + + # Zero fp16 gradients owned by the model: + for fp16_group in self.fp16_groups: + for param in fp16_group: + if set_grads_to_None: + param.grad = None + else: + if param.grad is not None: + param.grad.detach_() # as in torch.optim.optimizer.zero_grad() + param.grad.zero_() + + def _check_overflow(self): + params = [] + for group in self.fp16_groups: + for param in group: + params.append(param) + for group in self.fp32_from_fp32_groups: + for param in group: + params.append(param) + self.overflow = self.loss_scaler.has_overflow(params) + + def _update_scale(self, has_overflow=False): + self.loss_scaler.update_scale(has_overflow) + + def _master_params_to_model_params(self): + for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp16_group, fp32_from_fp16_group) + + def _model_params_to_master_params(self): + for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp32_from_fp16_group, fp16_group) + + # To consider: Integrate distributed with this wrapper by registering a hook on each variable + # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream. + def _model_grads_to_master_grads(self): + for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups): + model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) + + def _downscale_master(self): + if self.loss_scale != 1.0: + for group in self.optimizer.param_groups: + for param in group['params']: + if param.grad is not None: + param.grad.data.mul_(1./self.loss_scale) + + def clip_master_grads(self, max_norm, norm_type=2): + """ + Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. 
+ + Args: + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the current fp32 gradients (viewed as a single vector). + + .. warning:: + Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). + """ + if not self.overflow: + fp32_params = [] + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + fp32_params.append(param) + return self.clip_grad_norm(fp32_params, max_norm, norm_type) + else: + return -1 + + def state_dict(self): + """ + Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. + This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict + of the contained Pytorch optimizer. + Example:: + + checkpoint = {} + checkpoint['model'] = model.state_dict() + checkpoint['optimizer'] = optimizer.state_dict() + torch.save(checkpoint, "saved.pth") + """ + state_dict = {} + state_dict['loss_scaler'] = self.loss_scaler + state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale + state_dict['overflow'] = self.overflow + state_dict['first_closure_call_this_step'] = self.first_closure_call_this_step + state_dict['optimizer_state_dict'] = self.optimizer.state_dict() + state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups + return state_dict + + def load_state_dict(self, state_dict): + """ + Loads a state_dict created by an earlier call to state_dict(). + If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, + whose parameters in turn came from ``model``, it is expected that the user + will call ``model.load_state_dict()`` before + ``fp16_optimizer_instance.load_state_dict()`` is called. + + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + checkpoint = torch.load("saved.pth") + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + """ + # I think it should actually be ok to reload the optimizer before the model. + self.loss_scaler = state_dict['loss_scaler'] + self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] + self.overflow = state_dict['overflow'] + self.first_closure_call_this_step = state_dict['first_closure_call_this_step'] + self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) + # At this point, the optimizer's references to the model's fp32 parameters are up to date. + # The optimizer's hyperparameters and internal buffers are also up to date. + # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still + # out of date. There are two options. + # 1: Refresh the master params from the model's fp16 params. + # This requires less storage but incurs precision loss. + # 2: Save and restore the fp32 master copies separately. + # We choose option 2. + # + # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device + # of their associated parameters, because it's possible those buffers might not exist yet in + # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been + # constructed in the same way as the one whose state_dict we are loading, the same master params + # are guaranteed to exist, so we can just copy_() from the saved master params. 
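+        # Note: copy_() below writes into the existing master tensors in place,
+        # so the references already held by self.optimizer.param_groups remain
+        # valid; rebinding the Python names instead would break that link.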
+ for current_group, saved_group in zip(self.fp32_from_fp16_groups, state_dict['fp32_from_fp16']): + for current, saved in zip(current_group, saved_group): + current.data.copy_(saved.data) + + def step(self, closure=None): # could add clip option. + """ + If no closure is supplied, :attr:`step` should be called after + ``fp16_optimizer_obj.backward(loss)``. + :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to + :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params + originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run + another forward pass using their model. + + If a closure is supplied, :attr:`step` may be called without a prior call to + :attr:`backward(loss)`. + This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. + However, the user should take care that any ``loss.backward()`` call within the closure + has been replaced by ``fp16_optimizer_obj.backward(loss)``. + + Args: + closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. + + Example with closure:: + + # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an + # existing pytorch optimizer. + for input, target in dataset: + def closure(): + optimizer.zero_grad() + output = model(input) + loss = loss_fn(output, target) + # loss.backward() becomes: + optimizer.backward(loss) + return loss + optimizer.step(closure) + + .. warning:: + Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. + + .. _`ordinary Pytorch optimizer use`: + http://pytorch.org/docs/master/optim.html#optimizer-step-closure + """ + + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + + if self.overflow: + self.maybe_print("OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}" + .format(scale, self.loss_scale)) + return + + if closure is not None: + retval = self._step_with_closure(closure) + else: + retval = self.optimizer.step() + + self._master_params_to_model_params() + + return retval + + def _step_with_closure(self, closure): + def wrapped_closure(): + # helpful for debugging + # print("Calling wrapped_closure, first_closure_call_this_step = {}" + # .format(self.first_closure_call_this_step)) + if self.first_closure_call_this_step: + # We expect that the fp16 params are initially fresh on entering self.step(), + # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() + # is called within self.optimizer.step(). + self.first_closure_call_this_step = False + else: + # If self.optimizer.step() internally calls wrapped_closure more than once, + # it may update the fp32 params after each call. However, self.optimizer + # doesn't know about the fp16 params at all. If the fp32 params get updated, + # we can't rely on self.optimizer to refresh the fp16 params. We need + # to handle that manually: + self._master_params_to_model_params() + # Our API expects the user to give us ownership of the backward() call by + # replacing all calls to loss.backward() with optimizer.backward(loss). + # This requirement holds whether or not the call to backward() is made within a closure. 
+            # If the user is properly calling optimizer.backward(loss) within "closure,"
+            # calling closure() here will give the fp32 master params fresh gradients
+            # for the optimizer to play with, so all wrapped_closure needs to do is call
+            # closure() and return the loss.
+            temp_loss = closure()
+            while self.overflow:
+                scale = self.loss_scaler.loss_scale
+                self._update_scale(self.overflow)
+                self.maybe_print("OVERFLOW within closure! Skipping step. Attempted loss scale: {}, "
+                                 "reducing to {}".format(scale, self.loss_scale))
+                temp_loss = closure()
+            return temp_loss
+
+        retval = self.optimizer.step(wrapped_closure)
+
+        self.first_closure_call_this_step = True
+
+        return retval
+
+    def backward(self, loss, update_master_grads=True, retain_graph=False):
+        """
+        :attr:`backward` performs the following conceptual steps:
+
+        1. fp32_loss = loss.float() (see first Note below)
+        2. scaled_loss = fp32_loss*loss_scale
+        3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined).
+        4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32.
+        5. Finally, master grads are divided by loss_scale.
+
+        In this way, after :attr:`backward`, the master params have fresh gradients,
+        and :attr:`step` may be called.
+
+        .. note::
+            :attr:`backward` internally converts the loss to fp32 before applying the loss scale.
+            This provides some additional safety against overflow if the user has supplied an
+            fp16 loss value.
+            However, for maximum overflow safety, the user should
+            compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to
+            :attr:`backward`.
+
+        .. warning::
+            The gradients found in a model's leaves after the call to
+            :attr:`backward` should not be regarded as valid in general,
+            because it's possible
+            they have been scaled (and in the case of dynamic loss scaling,
+            the scale factor may change over time).
+            If the user wants to inspect gradients after a call to :attr:`backward`,
+            only the master gradients should be regarded as valid. These can be retrieved via
+            :attr:`inspect_master_grad_data()`.
+
+        Args:
+            loss: The loss output by the user's model. loss may be either float or half (but see first Note above).
+            update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`.
+            retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below).
+
+        Example::
+
+            # Ordinary operation:
+            optimizer.backward(loss)
+
+            # Naive operation with multiple losses (technically valid, but less efficient):
+            # fp32 grads will be correct after the second call, but
+            # the first call incurs an unnecessary fp16->fp32 grad copy.
+            optimizer.backward(loss1)
+            optimizer.backward(loss2)
+
+            # More efficient way to handle multiple losses:
+            # The fp16->fp32 grad copy is delayed until fp16 grads from all
+            # losses have been accumulated.
+            optimizer.backward(loss1, update_master_grads=False)
+            optimizer.backward(loss2, update_master_grads=False)
+            optimizer.update_master_grads()
+        """
+        # To consider: try multiple backward passes using retain_graph=True to find
+        # a loss scale that works. After you find a loss scale that works, do a final dummy
+        # backward pass with retain_graph=False to tear down the graph. Doing this would avoid
+        # discarding the iteration, but probably wouldn't improve overall efficiency.
+        self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
+        if update_master_grads:
+            self.update_master_grads()
+
+    def update_master_grads(self):
+        """
+        Copy the ``.grad`` attribute from stored references to fp16 parameters to
+        the ``.grad`` attribute of the fp32 master parameters that are directly
+        updated by the optimizer. :attr:`update_master_grads` only needs to be called if
+        ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``.
+        """
+        if self.dynamic_loss_scale:
+            self._check_overflow()
+            if self.overflow: return
+        self._model_grads_to_master_grads()
+        self._downscale_master()
+
+    def inspect_master_grad_data(self):
+        """
+        When running with :class:`FP16_Optimizer`,
+        ``.grad`` attributes of a model's fp16 leaves should not be
+        regarded as truthful, because they might be scaled.
+        After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered,
+        the fp32 master params' ``.grad``
+        attributes will contain valid gradients properly divided by the loss scale. However,
+        because :class:`FP16_Optimizer` flattens some parameters, accessing them may be
+        nonintuitive. :attr:`inspect_master_grad_data`
+        allows those gradients to be viewed with shapes corresponding to their associated model leaves.
+
+        Returns:
+            List of lists (one list for each parameter group). The list for each parameter group
+            is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group.
+        """
+        if self.overflow:
+            print("Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. "
+                  "Gradients are currently invalid (may be inf, nan, or stale). Returning None.")
+            return None
+        else:
+            # The optimizer owns only references to master params.
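+            # The .grad.data tensors gathered below are live references to the
+            # master gradients, not copies, so callers see the optimizer's
+            # current values.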
+ master_grads_data = [] + for param_group in self.optimizer.param_groups: + master_grads_this_group = [] + for param in param_group['params']: + if param.grad is not None: + master_grads_this_group.append(param.grad.data) + else: + master_grads_this_group.append(None) + master_grads_data.append(master_grads_this_group) + return master_grads_data + + + # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" + def _get_loss_scale(self): + return self.loss_scaler.loss_scale + + def _set_loss_scale(self, value): + self.loss_scaler.cur_scale = value + + loss_scale = property(_get_loss_scale, _set_loss_scale) + + # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" + def _get_state(self): + return self.optimizer.state + + def _set_state(self, value): + self.optimizer.state = value + + state = property(_get_state, _set_state) + + # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" + # (for example, to adjust the learning rate) + def _get_param_groups(self): + return self.optimizer.param_groups + + def _set_param_groups(self, value): + self.optimizer.param_groups = value + + param_groups = property(_get_param_groups, _set_param_groups) diff --git a/fp16/fp16util.py b/fp16/fp16util.py new file mode 100644 index 0000000..469ed04 --- /dev/null +++ b/fp16/fp16util.py @@ -0,0 +1,200 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +from torch.autograd import Variable +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + + +class tofp16(nn.Module): + """ + Utility module that implements:: + + def forward(self, input): + return input.half() + """ + + def __init__(self): + super(tofp16, self).__init__() + + def forward(self, input): + return input.half() + + +def BN_convert_float(module): + """ + Utility function for network_to_half(). + + Retained for legacy purposes. + """ + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: + module.float() + for child in module.children(): + BN_convert_float(child) + return module + + +def network_to_half(network): + """ + Convert model to half precision in a batchnorm-safe way. + + Retained for legacy purposes. It is recommended to use FP16Model. + """ + return nn.Sequential(tofp16(), BN_convert_float(network.half())) + + +def convert_module(module, dtype): + """ + Converts a module's immediate parameters and buffers to dtype. 
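+
+    (Hypothetical usage: ``convert_module(torch.nn.Linear(4, 4), torch.half)``
+    casts that layer's floating-point weight and bias to fp16 in place; child
+    modules are not touched because ``recurse=False`` is used below.)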
+ """ + for param in module.parameters(recurse=False): + if param is not None: + if param.data.dtype.is_floating_point: + param.data = param.data.to(dtype=dtype) + if param._grad is not None and param._grad.data.dtype.is_floating_point: + param._grad.data = param._grad.data.to(dtype=dtype) + + for buf in module.buffers(recurse=False): + if buf is not None and buf.data.dtype.is_floating_point: + buf.data = buf.data.to(dtype=dtype) + + +def convert_network(network, dtype): + """ + Converts a network's parameters and buffers to dtype. + """ + for module in network.modules(): + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: + continue + convert_module(module, dtype) + return network + + +class FP16Model(nn.Module): + """ + Convert model to half precision in a batchnorm-safe way. + """ + + def __init__(self, network): + super(FP16Model, self).__init__() + self.network = convert_network(network, dtype=torch.half) + + def forward(self, *inputs): + inputs = tuple(t.half() for t in inputs) + return self.network(*inputs) + + +def backwards_debug_hook(grad): + raise RuntimeError("master_params recieved a gradient in the backward pass!") + +def prep_param_lists(model, flat_master=False): + """ + Creates a list of FP32 master parameters for a given model, as in + `Training Neural Networks with Mixed Precision: Real Examples`_. + + Args: + model (torch.nn.Module): Existing Pytorch model + flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. + Returns: + A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. + + Example:: + + model_params, master_params = prep_param_lists(model) + + .. warning:: + Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. + + .. _`Training Neural Networks with Mixed Precision: Real Examples`: + http://on-demand.gputechconf.com/gtc/2018/video/S81012/ + """ + model_params = [param for param in model.parameters() if param.requires_grad] + + if flat_master: + # Give the user some more useful error messages + try: + # flatten_dense_tensors returns a contiguous flat array. + # http://pytorch.org/docs/master/_modules/torch/_utils.html + master_params = _flatten_dense_tensors([param.data for param in model_params]).float() + except: + print("Error in prep_param_lists: model may contain a mixture of parameters " + "of different types. Use flat_master=False, or use F16_Optimizer.") + raise + master_params = torch.nn.Parameter(master_params) + master_params.requires_grad = True + # master_params.register_hook(backwards_debug_hook) + if master_params.grad is None: + master_params.grad = master_params.new(*master_params.size()) + return model_params, [master_params] + else: + master_params = [param.clone().float().detach() for param in model_params] + for param in master_params: + param.requires_grad = True + return model_params, master_params + + +def model_grads_to_master_grads(model_params, master_params, flat_master=False): + """ + Copy model gradients to master gradients. + + Args: + model_params: List of model parameters created by :func:`prep_param_lists`. 
+ master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. + """ + if flat_master: + # The flattening may incur one more deep copy than is necessary. + master_params[0].grad.data.copy_( + _flatten_dense_tensors([p.grad.data for p in model_params])) + else: + for model, master in zip(model_params, master_params): + if model.grad is not None: + if master.grad is None: + master.grad = Variable(master.data.new(*master.data.size())) + master.grad.data.copy_(model.grad.data) + else: + master.grad = None + + +def master_params_to_model_params(model_params, master_params, flat_master=False): + """ + Copy master parameters to model parameters. + + Args: + model_params: List of model parameters created by :func:`prep_param_lists`. + master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. + """ + if flat_master: + for model, master in zip(model_params, + _unflatten_dense_tensors(master_params[0].data, model_params)): + model.data.copy_(master) + else: + for model, master in zip(model_params, master_params): + model.data.copy_(master.data) + +# Backward compatibility fixes + +def to_python_float(t): + if hasattr(t, 'item'): + return t.item() + else: + return t[0] + +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) +if TORCH_MAJOR == 0 and TORCH_MINOR <= 4: + clip_grad_norm = torch.nn.utils.clip_grad_norm +else: + clip_grad_norm = torch.nn.utils.clip_grad_norm_ \ No newline at end of file diff --git a/fp16/loss_scaler.py b/fp16/loss_scaler.py new file mode 100755 index 0000000..a9f8e0f --- /dev/null +++ b/fp16/loss_scaler.py @@ -0,0 +1,223 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +# item() is a recent addition, so this helps with backward compatibility. +def to_python_float(t): + if hasattr(t, 'item'): + return t.item() + else: + return t[0] + +class LossScaler: + """ + Class that manages a static loss scale. This class is intended to interact with + :class:`FP16_Optimizer`, and should not be directly manipulated by the user. + + Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to + :class:`FP16_Optimizer`'s constructor. + + Args: + scale (float, optional, default=1.0): The loss scale. 
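+
+    Example (a sketch; ``existing_optimizer`` is any previously constructed
+    Pytorch optimizer, and the scale is applied in ``backward`` and never changes)::
+
+        optimizer = FP16_Optimizer(existing_optimizer, static_loss_scale=128.0)
+        # FP16_Optimizer constructs LossScaler(128.0) internally.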
+ """ + + def __init__(self, scale=1): + self.cur_scale = scale + + # `params` is a list / generator of torch.Variable + def has_overflow(self, params): + return False + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + return False + + def update_scale(self, overflow): + pass + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss*self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + +class DynamicLossScaler: + """ + Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` + indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of + :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` + operates, because the default options can be changed using the + the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. + + Loss scaling is designed to combat the problem of underflowing gradients encountered at long + times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss + scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are + encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has + occurred. + :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, + and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. + If a certain number of iterations occur without overflowing gradients detected, + :class:`DynamicLossScaler` increases the loss scale once more. + In this way :class:`DynamicLossScaler` attempts to "ride the edge" of + always using the highest loss scale possible without incurring overflow. + + Args: + init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` + scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. + scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. + """ + + def __init__(self, + init_scale=2**32, + scale_factor=2., + scale_window=1000, + min_scale=1, + delayed_shift=1, + consecutive_hysteresis=False): + self.cur_scale = init_scale + self.cur_iter = 0 + self.last_overflow_iter = -1 + self.scale_factor = scale_factor + self.scale_window = scale_window + self.min_scale = min_scale + self.delayed_shift = delayed_shift + self.cur_hysteresis = delayed_shift + self.consecutive_hysteresis = consecutive_hysteresis + + # `params` is a list / generator of torch.Variable + def has_overflow(self, params): + for p in params: + if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): + return True + + return False + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + try: + # if x is half, the .float() incurs an additional deep copy, but it's necessary if + # Pytorch's .sum() creates a one-element tensor of the same type as x + # (which is true for some recent version of pytorch). 
+            cpu_sum = float(x.float().sum())
+            # More efficient version that can be used if .sum() returns a Python scalar
+            # cpu_sum = float(x.sum())
+        except RuntimeError as instance:
+            # We want to check if inst is actually an overflow exception.
+            # RuntimeError could come from a different error.
+            # If so, we still want the exception to propagate.
+            if "value cannot be converted" not in instance.args[0]:
+                raise
+            return True
+        else:
+            if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
+                return True
+            return False
+
+    # `overflow` is a boolean indicating whether the gradient overflowed
+    def update_scale(self, overflow):
+        if not hasattr(self, 'min_scale'):
+            self.min_scale = 1
+        if not hasattr(self, 'delayed_shift'):
+            self.delayed_shift = 1
+        if not hasattr(self, 'cur_hysteresis'):
+            self.cur_hysteresis = 1
+        if not hasattr(self, 'consecutive_hysteresis'):
+            self.consecutive_hysteresis = True
+        if overflow:
+            # self.cur_scale /= self.scale_factor
+            if self.delayed_shift == 1 or self.cur_hysteresis == 1:
+                self.cur_scale = max(self.cur_scale/self.scale_factor, self.min_scale)
+            else:
+                self.cur_hysteresis -= 1
+            self.last_overflow_iter = self.cur_iter
+        else:
+            if self.consecutive_hysteresis:
+                self.cur_hysteresis = self.delayed_shift
+            if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
+                if not self.consecutive_hysteresis:
+                    self.cur_hysteresis = self.delayed_shift
+                self.cur_scale *= self.scale_factor
+        self.cur_iter += 1
+
+    @property
+    def loss_scale(self):
+        return self.cur_scale
+
+    def scale_gradient(self, module, grad_in, grad_out):
+        return tuple(self.loss_scale * g for g in grad_in)
+
+    def backward(self, loss, retain_graph=False):
+        scaled_loss = loss*self.loss_scale
+        scaled_loss.backward(retain_graph=retain_graph)
+
+##############################################################
+# Example usage below here -- assuming it's in a separate file
+##############################################################
+"""
+TO-DO separate out into an example.
+if __name__ == "__main__":
+    import torch
+    from torch.autograd import Variable
+    from dynamic_loss_scaler import DynamicLossScaler
+
+    # N is batch size; D_in is input dimension;
+    # H is hidden dimension; D_out is output dimension.
+    N, D_in, H, D_out = 64, 1000, 100, 10
+
+    # Create random Tensors to hold inputs and outputs, and wrap them in Variables.
+    x = Variable(torch.randn(N, D_in), requires_grad=False)
+    y = Variable(torch.randn(N, D_out), requires_grad=False)
+
+    w1 = Variable(torch.randn(D_in, H), requires_grad=True)
+    w2 = Variable(torch.randn(H, D_out), requires_grad=True)
+    parameters = [w1, w2]
+
+    learning_rate = 1e-6
+    optimizer = torch.optim.SGD(parameters, lr=learning_rate)
+    loss_scaler = DynamicLossScaler()
+
+    for t in range(500):
+        y_pred = x.mm(w1).clamp(min=0).mm(w2)
+        loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
+        print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
+        print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
+        print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))
+
+        # Run backprop
+        optimizer.zero_grad()
+        loss.backward()
+
+        # Check for overflow (has_overflow is an instance method, so it is
+        # called on the loss_scaler instance)
+        has_overflow = loss_scaler.has_overflow(parameters)
+
+        # If no overflow, unscale grad and update as usual
+        if not has_overflow:
+            for param in parameters:
+                param.grad.data.mul_(1. / loss_scaler.loss_scale)
+            optimizer.step()
+        # Otherwise, don't do anything -- ie, skip iteration
+        else:
+            print('OVERFLOW!')
+
+        # Update loss scale for next iteration
+        loss_scaler.update_scale(has_overflow)
+
+"""
diff --git a/learning_rates.py b/learning_rates.py
new file mode 100644
index 0000000..9d9f7ed
--- /dev/null
+++ b/learning_rates.py
@@ -0,0 +1,74 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Learning rate decay (annealing) schedules for pretraining."""
+
+from torch.optim.lr_scheduler import _LRScheduler
+import math
+
+class AnnealingLR(_LRScheduler):
+    """Anneals the learning rate starting from start_lr, with an optional linear
+    warmup and a choice of decay styles (linear, cosine, exponential, constant)."""
+
+    DECAY_STYLES = ['linear', 'cosine', 'exponential', 'constant', 'None']
+
+    def __init__(self, optimizer, start_lr, warmup_iter, num_iters, decay_style=None, last_iter=-1):
+        self.optimizer = optimizer
+        self.start_lr = start_lr
+        self.warmup_iter = warmup_iter
+        self.num_iters = last_iter + 1
+        self.end_iter = num_iters
+        self.decay_style = decay_style.lower() if isinstance(decay_style, str) else None
+        self.step(self.num_iters)
+        print('learning rate decay style:', decay_style)
+
+    def get_lr(self):
+        # https://openreview.net/pdf?id=BJYwwY9ll pg. 4
+        if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter:
+            return float(self.start_lr) * self.num_iters / self.warmup_iter
+        else:
+            if self.decay_style == self.DECAY_STYLES[0]:
+                return self.start_lr*((self.end_iter-(self.num_iters-self.warmup_iter))/self.end_iter)
+            elif self.decay_style == self.DECAY_STYLES[1]:
+                return self.start_lr / 2.0 * (math.cos(math.pi * (self.num_iters - self.warmup_iter) / self.end_iter) + 1)
+            elif self.decay_style == self.DECAY_STYLES[2]:
+                #TODO: implement exponential decay
+                return self.start_lr
+            else:
+                return self.start_lr
+
+    def step(self, step_num=None):
+        if step_num is None:
+            step_num = self.num_iters + 1
+        self.num_iters = step_num
+        new_lr = self.get_lr()
+        for group in self.optimizer.param_groups:
+            group['lr'] = new_lr
+
+    def state_dict(self):
+        sd = {
+            'start_lr': self.start_lr,
+            'warmup_iter': self.warmup_iter,
+            'num_iters': self.num_iters,
+            'decay_style': self.decay_style,
+            'end_iter': self.end_iter
+        }
+        return sd
+
+    def load_state_dict(self, sd):
+        self.start_lr = sd['start_lr']
+        self.warmup_iter = sd['warmup_iter']
+        self.num_iters = sd['num_iters']
+        self.end_iter = sd['end_iter']
+        self.decay_style = sd['decay_style']
+        self.step(self.num_iters)
diff --git a/model/__init__.py b/model/__init__.py
new file mode 100755
index 0000000..62d5b21
--- /dev/null
+++ b/model/__init__.py
@@ -0,0 +1,17 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .distributed import *
+from .model import *
diff --git a/model/distributed.py b/model/distributed.py
new file mode 100755
index 0000000..d08c1e9
--- /dev/null
+++ b/model/distributed.py
@@ -0,0 +1,108 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+import torch.distributed as dist
+from torch.nn.modules import Module
+from torch.autograd import Variable
+
+
+class DistributedDataParallel(Module):
+
+    def __init__(self, module):
+        super(DistributedDataParallel, self).__init__()
+        self.warn_on_half = (dist._backend == dist.dist_backend.GLOO)
+
+        self.module = module
+
+        for p in self.module.parameters():
+            if torch.is_tensor(p):
+                dist.broadcast(p, 0)
+
+        def allreduce_params(reduce_after=True, no_scale=False, fp32_allreduce=False):
+            if self.needs_reduction:
+                self.needs_reduction = False
+                buckets = {}
+                for name, param in self.module.named_parameters():
+                    if param.requires_grad and param.grad is not None:
+                        # bucket by tensor class so the HalfTensor membership
+                        # check below matches the keys
+                        tp = type(param.data)
+                        if tp not in buckets:
+                            buckets[tp] = []
+                        buckets[tp].append(param)
+                if self.warn_on_half:
+                    if torch.cuda.HalfTensor in buckets:
+                        print("WARNING: gloo dist backend for half parameters may be extremely slow."
+ + " It is recommended to use the NCCL backend in this case.") + self.warn_on_half = False + for tp in buckets: + bucket = buckets[tp] + grads = [param.grad.data for param in bucket] + coalesced = _flatten_dense_tensors(grads) + if fp32_allreduce: + coalesced = coalesced.float() + if not no_scale and not reduce_after: + coalesced /= dist.get_world_size() + dist.all_reduce(coalesced) + torch.cuda.synchronize() + if not no_scale and reduce_after: + coalesced /= dist.get_world_size() + for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) + self.hook_handles = [] + self.hooks = [] + for param in list(self.module.parameters()): + def allreduce_hook(*unused): + Variable._execution_engine.queue_callback(allreduce_params) + # handle = param.register_hook(allreduce_hook) + #self.hooks.append(allreduce_hook) + #self.hook_handles.append(handle) + self.allreduce_params = allreduce_params + + def forward(self, *inputs, **kwargs): + self.needs_reduction = True + return self.module(*inputs, **kwargs) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + #[h.remove() for h in self.hook_handles] + sd = self.module.state_dict(destination, prefix, keep_vars) + # for handle, hook in zip(self.hook_handles, self.hooks): + # d = handle.hooks_dict_ref() + # d[handle.id] = hook + + return sd + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) + + ''' + def _sync_buffers(self): + buffers = list(self.module._all_buffers()) + if len(buffers) > 0: + # cross-node buffer sync + flat_buffers = _flatten_dense_tensors(buffers) + dist.broadcast(flat_buffers, 0) + for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): + buf.copy_(synced) + def train(self, mode=True): + # Clear NCCL communicator and CUDA event cache of the default group ID, + # These cache will be recreated at the later call. This is currently a + # work-around for a potential NCCL deadlock. + if dist._backend == dist.dist_backend.NCCL: + dist._clear_group_cache() + super(DistributedDataParallel, self).train(mode) + self.module.train(mode) + ''' + diff --git a/model/model.py b/model/model.py new file mode 100755 index 0000000..eaf00a3 --- /dev/null +++ b/model/model.py @@ -0,0 +1,88 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utilities for wrapping BertModel.""" + +import torch + +from .modeling import BertConfig +from .modeling import BertForPreTraining +from .modeling import BertLayerNorm + + +def get_params_for_weight_decay_optimization(module): + + weight_decay_params = {'params': []} + no_weight_decay_params = {'params': [], 'weight_decay': 0} + for module_ in module.modules(): + if isinstance(module_, (BertLayerNorm, torch.nn.LayerNorm)): + no_weight_decay_params['params'].extend( + [p for p in list(module_._parameters.values()) + if p is not None]) + else: + weight_decay_params['params'].extend( + [p for n, p in list(module_._parameters.items()) + if p is not None and n != 'bias']) + no_weight_decay_params['params'].extend( + [p for n, p in list(module_._parameters.items()) + if p is not None and n == 'bias']) + + return weight_decay_params, no_weight_decay_params + + +class BertModel(torch.nn.Module): + + def __init__(self, tokenizer, args): + super(BertModel, self).__init__() + if args.pretrained_bert: + self.model = BertForPreTraining.from_pretrained( + args.tokenizer_model_type, + cache_dir=args.cache_dir, + fp32_layernorm=args.fp32_layernorm, + fp32_embedding=args.fp32_embedding, + layernorm_epsilon=args.layernorm_epsilon) + else: + if args.intermediate_size is None: + intermediate_size = 4 * args.hidden_size + else: + intermediate_size = args.intermediate_size + self.config = BertConfig( + tokenizer.num_tokens, + hidden_size=args.hidden_size, + num_hidden_layers=args.num_layers, + num_attention_heads=args.num_attention_heads, + intermediate_size=intermediate_size, + hidden_dropout_prob=args.hidden_dropout, + attention_probs_dropout_prob=args.attention_dropout, + max_position_embeddings=args.max_position_embeddings, + type_vocab_size=tokenizer.num_type_tokens, + fp32_layernorm=args.fp32_layernorm, + fp32_embedding=args.fp32_embedding, + fp32_tokentypes=args.fp32_tokentypes, + layernorm_epsilon=args.layernorm_epsilon) + self.model = BertForPreTraining(self.config) + + def forward(self, input_tokens, token_type_ids=None, + attention_mask=None, checkpoint_activations=False): + return self.model( + input_tokens, token_type_ids, attention_mask, + checkpoint_activations=checkpoint_activations) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + return self.model.state_dict(destination=destination, prefix=prefix, + keep_vars=keep_vars) + + def load_state_dict(self, state_dict, strict=True): + return self.model.load_state_dict(state_dict, strict=strict) diff --git a/model/modeling.py b/model/modeling.py new file mode 100644 index 0000000..c78fc36 --- /dev/null +++ b/model/modeling.py @@ -0,0 +1,1314 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
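+
+# get_params_for_weight_decay_optimization() in model/model.py above returns
+# two parameter groups: biases and LayerNorm parameters go in the group with
+# weight_decay=0, everything else in the decayed group. A sketch of feeding
+# them to an optimizer (the Adam choice and the 0.01 decay value here are
+# illustrative, not prescribed by this patch):
+#
+#     decay, no_decay = get_params_for_weight_decay_optimization(model)
+#     decay['weight_decay'] = 0.01
+#     optimizer = torch.optim.Adam([decay, no_decay], lr=1e-4)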
+"""PyTorch BERT model.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import copy +import json +import math +import logging +import tarfile +import tempfile +import shutil + +import torch +from torch import nn +import torch.nn.functional as F +from torch.nn import CrossEntropyLoss + +from torch.utils.checkpoint import checkpoint + +from data_utils.file_utils import cached_path + +logger = logging.getLogger(__name__) + +PRETRAINED_MODEL_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", +} +CONFIG_NAME = 'bert_config.json' +WEIGHTS_NAME = 'pytorch_model.bin' +TF_WEIGHTS_NAME = 'model.ckpt' + +def load_tf_weights_in_bert(model, tf_checkpoint_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + tf_path = os.path.abspath(tf_checkpoint_path) + print("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + print("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ["adam_v", "adam_m"] for n in name): + print("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + """Implementation of the gelu activation function. 
+        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
+        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    """
+    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
+
+def swish(x):
+    return x * torch.sigmoid(x)
+
+
+ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
+
+class BertConfig(object):
+    """Configuration class to store the configuration of a `BertModel`.
+    """
+    def __init__(self,
+                 vocab_size_or_config_json_file,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act="gelu",
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02,
+                 fp32_layernorm=False,
+                 fp32_embedding=False,
+                 fp32_tokentypes=False,
+                 layernorm_epsilon=1e-12):
+        """Constructs BertConfig.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `BertModel`.
+            hidden_size: Size of the encoder layers and the pooler layer.
+            num_hidden_layers: Number of hidden layers in the Transformer encoder.
+            num_attention_heads: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+                layer in the Transformer encoder.
+            hidden_act: The non-linear activation function (function or string) in the
+                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+            hidden_dropout_prob: The dropout probability for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attention_probs_dropout_prob: The dropout ratio for the attention
+                probabilities.
+            max_position_embeddings: The maximum sequence length that this model might
+                ever be used with. Typically set this to something large just in case
+                (e.g., 512 or 1024 or 2048).
+            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+                `BertModel`.
+            initializer_range: The standard deviation of the truncated_normal_initializer
+                for initializing all weight matrices.
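+            fp32_layernorm: Whether to compute LayerNorm in fp32 when the rest
+                of the model runs in fp16 (inputs are upcast before the norm
+                and cast back afterwards).
+            fp32_embedding: Whether to keep embedding outputs in fp32 under
+                mixed precision.
+            fp32_tokentypes: Whether to sum the word/position/token-type
+                embeddings in fp32 before the LayerNorm.
+            layernorm_epsilon: The epsilon added inside LayerNorm's square
+                root for numerical stability.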
+ """ + if isinstance(vocab_size_or_config_json_file, str): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.fp32_layernorm = fp32_layernorm + self.fp32_embedding = fp32_embedding + self.layernorm_epsilon = layernorm_epsilon + self.fp32_tokentypes = fp32_tokentypes + else: + raise ValueError("First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)") + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, "r", encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + +# try: +# from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm +# except ImportError: +# print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") +# class BertLayerNorm(nn.Module): +# def __init__(self, hidden_size, eps=1e-12): +# """Construct a layernorm module in the TF style (epsilon inside the square root). +# """ +# super(BertLayerNorm, self).__init__() +# self.weight = nn.Parameter(torch.ones(hidden_size)) +# self.bias = nn.Parameter(torch.zeros(hidden_size)) +# self.variance_epsilon = eps + +# def forward(self, x): +# u = x.mean(-1, keepdim=True) +# s = (x - u).pow(2).mean(-1, keepdim=True) +# x = (x - u) / torch.sqrt(s + self.variance_epsilon) +# return self.weight * x + self.bias + +class BertLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square root). + """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. 
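+    Position embeddings here are learned (a plain nn.Embedding over positions,
+    not sinusoidal), and token_type embeddings distinguish sentence A from
+    sentence B.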
+ """ + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.fp32_layernorm = config.fp32_layernorm + self.fp32_embedding = config.fp32_embedding + self.fp32_tokentypes = config.fp32_tokentypes + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + seq_length = input_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + if not self.fp32_tokentypes: + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + if self.fp32_embedding and not self.fp32_layernorm: + embeddings = embeddings.half() + previous_type = embeddings.type() + if self.fp32_layernorm: + embeddings = embeddings.float() + embeddings = self.LayerNorm(embeddings) + if self.fp32_layernorm: + if self.fp32_embedding: + embeddings = embeddings.half() + else: + embeddings = embeddings.type(previous_type) + else: + embeddings = words_embeddings.float() + position_embeddings.float() + token_type_embeddings.float() + if self.fp32_tokentypes and not self.fp32_layernorm: + embeddings = embeddings.half() + previous_type = embeddings.type() + if self.fp32_layernorm: + embeddings = embeddings.float() + embeddings = self.LayerNorm(embeddings) + if self.fp32_layernorm: + if self.fp32_tokentypes: + embeddings = embeddings.half() + else: + embeddings = embeddings.type(previous_type) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask): + + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = 
self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + previous_type = attention_probs.type() + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.fp32_layernorm = config.fp32_layernorm + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + ln_input = hidden_states + input_tensor + previous_type = ln_input.type() + if self.fp32_layernorm: + ln_input = ln_input.float() + hidden_states = self.LayerNorm(ln_input) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask): + self_output = self.self(input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.fp32_layernorm = config.fp32_layernorm + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + ln_input = hidden_states + input_tensor + previous_type = ln_input.type() + if self.fp32_layernorm: + ln_input = ln_input.float() + hidden_states = self.LayerNorm(ln_input) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + return hidden_states + + +class 
BertLayer(nn.Module): + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask): + attention_output = self.attention(hidden_states, attention_mask) + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config): + super(BertEncoder, self).__init__() + layer = BertLayer(config) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + + # def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): + # all_encoder_layers = [] + # for layer_module in self.layer: + # hidden_states = layer_module(hidden_states, attention_mask) + # if output_all_encoded_layers: + # all_encoder_layers.append(hidden_states) + # if not output_all_encoded_layers: + # all_encoder_layers.append(hidden_states) + # return all_encoder_layers + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False): + all_encoder_layers = [] + def custom(start, end): + def custom_forward(*inputs): + layers = self.layer[start:end] + x_ = inputs[0] + for layer in layers: + x_ = layer(x_, inputs[1]) + return x_ + return custom_forward + + if checkpoint_activations: + l = 0 + num_layers = len(self.layer) + chunk_length = math.ceil(math.sqrt(num_layers)) + while l < num_layers: + hidden_states = checkpoint(custom(l, l+chunk_length), hidden_states, attention_mask*1) + l += chunk_length + # decoder layers + else: + for i,layer_module in enumerate(self.layer): + hidden_states = layer_module(hidden_states, attention_mask) + + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + + if not output_all_encoded_layers or checkpoint_activations: + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +class BertPooler(nn.Module): + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
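+        # (In the pretraining setup this first position is the [CLS] token;
+        # its hidden state goes through the tanh-activated dense layer below.)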
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.transform_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layernorm_epsilon) + self.fp32_layernorm = config.fp32_layernorm + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + previous_type = hidden_states.type() + if self.fp32_layernorm: + hidden_states = hidden_states.float() + hidden_states = self.LayerNorm(hidden_states) + if self.fp32_layernorm: + hidden_states = hidden_states.type(previous_type) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(bert_model_embedding_weights.size(1), + bert_model_embedding_weights.size(0), + bias=False) + self.decoder.weight = bert_model_embedding_weights + self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0))) + self.fp32_embedding = config.fp32_embedding + self.fp32_layernorm = config.fp32_layernorm + def convert_to_type(tensor): + if self.fp32_embedding: + return tensor.half() + else: + return tensor + self.type_converter = convert_to_type + self.converted = False + + def forward(self, hidden_states): + if not self.converted: + self.converted = True + if self.fp32_embedding: + self.transform.half() + if self.fp32_layernorm: + self.transform.LayerNorm.float() + hidden_states = self.transform(self.type_converter(hidden_states)) + # hidden_states = self.decoder(hidden_states) + self.bias + hidden_states = F.linear(self.type_converter(hidden_states), self.type_converter(self.decoder.weight), self.type_converter(self.bias)) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertOnlyMLMHead, self).__init__() + self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + def __init__(self, config): + super(BertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + for p in self.seq_relationship.parameters(): + if p is None: + continue + pooled_output = pooled_output.type_as(p) + seq_relationship_score = self.seq_relationship(pooled_output) + 
return prediction_scores, seq_relationship_score
+
+
+class PreTrainedBertModel(nn.Module):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(PreTrainedBertModel, self).__init__()
+        if not isinstance(config, BertConfig):
+            raise ValueError(
+                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
+                "To create a model from a Google pretrained model use "
+                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
+                    self.__class__.__name__, self.__class__.__name__
+                ))
+        self.config = config
+
+    def init_bert_weights(self, module):
+        """ Initialize the weights.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, BertLayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None,
+                        fp32_layernorm=False, fp32_embedding=False, layernorm_epsilon=1e-12,
+                        fp32_tokentypes=False, *inputs, **kwargs):
+        """
+        Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict.
+        Download and cache the pre-trained model file if needed.
+
+        Params:
+            pretrained_model_name: either:
+                - a str with the name of a pre-trained model to load selected in the list of:
+                    . `bert-base-uncased`
+                    . `bert-large-uncased`
+                    . `bert-base-cased`
+                    . `bert-large-cased`
+                    . `bert-base-multilingual-uncased`
+                    . `bert-base-multilingual-cased`
+                    . `bert-base-chinese`
+                - a path or url to a pretrained model archive containing:
+                    . `bert_config.json` a configuration file for the model
+                    . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
+            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+            state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
+            *inputs, **kwargs: additional input for the specific Bert class
+                (ex: num_labels for BertForSequenceClassification)
+        """
+        if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
+            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
+        else:
+            archive_file = pretrained_model_name
+        # redirect to the cache, if necessary
+        try:
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+        except FileNotFoundError:
+            logger.error(
+                "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name, + ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + archive_file)) + return None + if resolved_archive_file == archive_file: + logger.info("loading archive file {}".format(archive_file)) + else: + logger.info("loading archive file {} from cache at {}".format( + archive_file, resolved_archive_file)) + tempdir = None + if os.path.isdir(resolved_archive_file): + serialization_dir = resolved_archive_file + else: + # Extract archive to temp dir + tempdir = tempfile.mkdtemp() + logger.info("extracting archive file {} to temp dir {}".format( + resolved_archive_file, tempdir)) + with tarfile.open(resolved_archive_file, 'r:gz') as archive: + archive.extractall(tempdir) + serialization_dir = tempdir + # Load config + config_file = os.path.join(serialization_dir, CONFIG_NAME) + config = BertConfig.from_json_file(config_file) + config.fp32_layernorm = fp32_layernorm + config.fp32_embedding = fp32_embedding + config.layernorm_epsilon = layernorm_epsilon + config.fp32_tokentypes = fp32_tokentypes + logger.info("Model config {}".format(config)) + # Instantiate model. + model = cls(config, *inputs, **kwargs) + if state_dict is None: + weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) + state_dict = torch.load(weights_path) + + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + load(model, prefix='' if hasattr(model, 'bert') else 'bert.') + if len(missing_keys) > 0: + logger.info("Weights of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info("Weights from pretrained model not used in {}: {}".format( + model.__class__.__name__, unexpected_keys)) + if tempdir: + # Clean up temp dir + shutil.rmtree(tempdir) + return model + + +class BertModel(PreTrainedBertModel): + """BERT model ("Bidirectional Embedding Representations from a Transformer"). + + Params: + config: a BertConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). 
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
+
+    Outputs: Tuple of (encoded_layers, pooled_output)
+        `encoded_layers`: controlled by `output_all_encoded_layers` argument:
+            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
+                of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
+                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
+            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
+                to the last attention block of shape [batch_size, sequence_length, hidden_size],
+        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
+            classifier pretrained on top of the hidden state associated to the first token of the
+            input (`[CLS]`) to train on the Next-Sentence task (see BERT's paper).
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = modeling.BertModel(config=config)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config):
+        super(BertModel, self).__init__(config)
+        self.embeddings = BertEmbeddings(config)
+        self.encoder = BertEncoder(config)
+        self.pooler = BertPooler(config)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, checkpoint_activations=False):
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is simpler than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
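+        # For example, a padding mask of [1, 1, 0] becomes
+        # (1.0 - [1., 1., 0.]) * -10000.0 = [0., 0., -10000.], so the padded
+        # position receives a vanishing attention weight after the softmax.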
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.encoder.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers = self.encoder(embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + checkpoint_activations=checkpoint_activations) + sequence_output = encoded_layers[-1] + for p in self.pooler.parameters(): + if p is None: + continue + sequence_output = sequence_output.type_as(p) + break + pooled_output = self.pooler(sequence_output) + if not output_all_encoded_layers or checkpoint_activations: + encoded_layers = encoded_layers[-1] + return encoded_layers, pooled_output + + +class BertForPreTraining(PreTrainedBertModel): + """BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads: + - the masked language modeling head, and + - the next sentence classification head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + + Outputs: + if `masked_lm_labels` and `next_sentence_label` are not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `masked_lm_labels` or `next_sentence_label` is `None`: + Outputs a tuple comprising + - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and + - the next sentence classification logits of shape [batch_size, 2]. 
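+
+    Supplying both label tensors, e.g.
+    `loss = model(input_ids, token_type_ids, input_mask, masked_lm_labels=lm_labels, next_sentence_label=ns_labels)`,
+    returns the scalar total loss instead of the two logit tensors.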
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForPreTraining(config) + masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + def __init__(self, config): + super(BertForPreTraining, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None, checkpoint_activations=False): + sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations) + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + + if masked_lm_labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + return total_loss + else: + return prediction_scores, seq_relationship_score + + +class BertForMaskedLM(PreTrainedBertModel): + """BERT model with the masked language modeling head. + This module comprises the BERT model followed by the masked language modeling head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + + Outputs: + if `masked_lm_labels` is not `None`: + Outputs the masked language modeling loss. + if `masked_lm_labels` is `None`: + Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. 
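+
+    Passing `masked_lm_labels`, e.g.
+    `loss = model(input_ids, token_type_ids, input_mask, masked_lm_labels=lm_labels)`,
+    returns the scalar masked language modeling loss instead of the logits.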
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = BertForMaskedLM(config)
+    masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config):
+        super(BertForMaskedLM, self).__init__(config)
+        self.bert = BertModel(config)
+        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, checkpoint_activations=False):
+        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask,
+                                       output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations)
+        prediction_scores = self.cls(sequence_output)
+
+        if masked_lm_labels is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
+            return masked_lm_loss
+        else:
+            return prediction_scores
+
+
+class BertForNextSentencePrediction(PreTrainedBertModel):
+    """BERT model with next sentence prediction head.
+    This module comprises the BERT model followed by the next sentence classification head.
+
+    Params:
+        config: a BertConfig class instance with the configuration to build a new model.
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see BERT paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
+            with indices selected in [0, 1].
+            0 => next sentence is the continuation, 1 => next sentence is a random sentence.
+
+    Outputs:
+        if `next_sentence_label` is not `None`:
+            Outputs the next sentence classification loss.
+        if `next_sentence_label` is `None`:
+            Outputs the next sentence classification logits of shape [batch_size, 2].
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForNextSentencePrediction(config) + seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + def __init__(self, config): + super(BertForNextSentencePrediction, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, checkpoint_activations=False): + _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations) + seq_relationship_score = self.cls( pooled_output) + + if next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + return next_sentence_loss + else: + return seq_relationship_score + + +class BertForSequenceClassification(PreTrainedBertModel): + """BERT model for classification. + This module is composed of the BERT model with a linear layer on top of + the pooled output. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_labels]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. 
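+
+    Passing `labels` of shape [batch_size] returns the scalar cross-entropy
+    classification loss instead of the logits.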
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForSequenceClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + def __init__(self, config, num_labels=2): + super(BertForSequenceClassification, self).__init__(config) + self.num_labels = num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): + _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + else: + return logits + + +class BertForMultipleChoice(PreTrainedBertModel): + """BERT model for multiple choice tasks. + This module is composed of the BERT model with a linear layer on top of + the pooled output. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_choices`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` + and type 1 corresponds to a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_choices]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. 
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
+    input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]], [[1, 1, 0], [1, 0, 0]]])
+    token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]], [[0, 1, 1], [0, 0, 1]]])
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    num_choices = 2
+
+    model = BertForMultipleChoice(config, num_choices)
+    logits = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config, num_choices=2):
+        super(BertForMultipleChoice, self).__init__(config)
+        self.num_choices = num_choices
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False):
+        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
+        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
+        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))
+        _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations)
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, self.num_choices)
+
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+            return loss
+        else:
+            return reshaped_logits
+
+
+class BertForTokenClassification(PreTrainedBertModel):
+    """BERT model for token-level classification.
+    This module is composed of the BERT model with a linear layer on top of
+    the full hidden state of the last layer.
+
+    Params:
+        `config`: a BertConfig class instance with the configuration to build a new model.
+        `num_labels`: the number of classes for the classifier. Default = 2.
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see BERT paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
+            with indices selected in [0, ..., num_labels].
+
+    Outputs:
+        if `labels` is not `None`:
+            Outputs the CrossEntropy classification loss of the output with the labels.
+        if `labels` is `None`:
+            Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    num_labels = 2
+
+    model = BertForTokenClassification(config, num_labels)
+    logits = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config, num_labels=2):
+        super(BertForTokenClassification, self).__init__(config)
+        self.num_labels = num_labels
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, num_labels)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False):
+        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations)
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            return loss
+        else:
+            return logits
+
+
+class BertForQuestionAnswering(PreTrainedBertModel):
+    """BERT model for Question Answering (span extraction).
+    This module is composed of the BERT model with a linear layer on top of
+    the sequence output that computes start_logits and end_logits.
+
+    Params:
+        `config`: a BertConfig class instance with the configuration to build a new model.
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see BERT paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
+            Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
+            into account for computing the loss.
+        `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
+            Positions are clamped to the length of the sequence and positions outside of the sequence are not taken
+            into account for computing the loss.
+
+    Outputs:
+        if `start_positions` and `end_positions` are not `None`:
+            Outputs the total_loss which is the average of the CrossEntropy losses for the start and end token positions.
+        if `start_positions` or `end_positions` is `None`:
+            Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
+            position tokens of shape [batch_size, sequence_length].
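+
+    Passing both `start_positions` and `end_positions`, e.g.
+    `total_loss = model(input_ids, token_type_ids, input_mask, start_positions=starts, end_positions=ends)`,
+    returns the scalar loss instead of the logit pair.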
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = BertForQuestionAnswering(config)
+    start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config):
+        super(BertForQuestionAnswering, self).__init__(config)
+        self.bert = BertModel(config)
+        # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
+        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.qa_outputs = nn.Linear(config.hidden_size, 2)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None, checkpoint_activations=False):
+        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations)
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, the position tensors may carry an extra dimension; squeeze it.
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # Start/end positions outside our model inputs are clamped to ignored_index and excluded from the loss.
+            ignored_index = start_logits.size(1)
+            start_positions.clamp_(0, ignored_index)
+            end_positions.clamp_(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+            return total_loss
+        else:
+            return start_logits, end_logits
+
+
+
+
diff --git a/optim/__init__.py b/optim/__init__.py
new file mode 100755
index 0000000..d804972
--- /dev/null
+++ b/optim/__init__.py
@@ -0,0 +1,25 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from torch.optim import SGD
+from torch.optim import Adadelta
+from torch.optim import Adagrad
+from torch.optim import SparseAdam
+from torch.optim import Adamax
+from torch.optim import Rprop
+from torch.optim import RMSprop
+from torch.optim import Optimizer
+from torch.optim import LBFGS
+from .adam import Adam
diff --git a/optim/adam.py b/optim/adam.py
new file mode 100755
index 0000000..b96c288
--- /dev/null
+++ b/optim/adam.py
@@ -0,0 +1,124 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import torch +from torch.optim import Optimizer + + +class Adam(Optimizer): + r"""Implements Adam algorithm. + + It has been proposed in `Adam: A Method for Stochastic Optimization`_. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=0, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad) + super(Adam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Adam, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + amsgrad = group['amsgrad'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
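+                        # (AMSGrad keeps this running max so that the
+                        # effective step size never increases; see "On the
+                        # Convergence of Adam and Beyond".)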
+                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)
+
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                if amsgrad:
+                    max_exp_avg_sq = state['max_exp_avg_sq']
+                beta1, beta2 = group['betas']
+
+                state['step'] += 1
+
+                # Decay the first and second moment running average coefficient
+                exp_avg.mul_(beta1).add_(1 - beta1, grad)
+                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
+                if amsgrad:
+                    # Maintains the maximum of all 2nd moment running avg. till now
+                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
+                    # Use the max. for normalizing running avg. of gradient
+                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
+                else:
+                    denom = exp_avg_sq.sqrt().add_(group['eps'])
+
+                # Note: bias correction is disabled (commented out below), so these factors are currently unused.
+                bias_correction1 = 1 - beta1 ** state['step']
+                bias_correction2 = 1 - beta2 ** state['step']
+                step_size = group['lr']# * math.sqrt(bias_correction2) / bias_correction1
+                if group['weight_decay'] != 0:
+                    p.data.add_(-step_size * group['weight_decay'], p.data)
+
+                p.data.addcdiv_(-step_size, exp_avg, denom)
+
+        return loss
diff --git a/pretrain_bert.py b/pretrain_bert.py
new file mode 100755
index 0000000..3100e78
--- /dev/null
+++ b/pretrain_bert.py
@@ -0,0 +1,490 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Pretrain BERT"""
+
+import os
+import random
+import numpy as np
+import torch
+
+from arguments import get_args
+from configure_data import configure_data
+from fp16 import FP16_Module
+from fp16 import FP16_Optimizer
+from learning_rates import AnnealingLR
+from model import BertModel
+from model import get_params_for_weight_decay_optimization
+from model import DistributedDataParallel as DDP
+from optim import Adam
+from utils import Timers
+from utils import save_checkpoint
+from utils import load_checkpoint
+
+
+def get_model(tokenizer, args):
+    """Build the model."""
+
+    print('building BERT model ...')
+    model = BertModel(tokenizer, args)
+    print(' > number of parameters: {}'.format(
+        sum([p.nelement() for p in model.parameters()])), flush=True)
+
+    # GPU allocation.
+    model.cuda(torch.cuda.current_device())
+
+    # FP16 conversion.
+    if args.fp16:
+        model = FP16_Module(model)
+        if args.fp32_embedding:
+            model.module.model.bert.embeddings.word_embeddings.float()
+            model.module.model.bert.embeddings.position_embeddings.float()
+            model.module.model.bert.embeddings.token_type_embeddings.float()
+        if args.fp32_tokentypes:
+            model.module.model.bert.embeddings.token_type_embeddings.float()
+        if args.fp32_layernorm:
+            for name, _module in model.named_modules():
+                if 'LayerNorm' in name:
+                    _module.float()
+
+    # Wrap model for distributed training.
+    if args.world_size > 1:
+        model = DDP(model)
+
+    return model
+
+
+def get_optimizer(model, args):
+    """Set up the optimizer."""
+
+    # Build parameter groups (weight decay and non-decay).
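+    # (get_params_for_weight_decay_optimization is expected to split each
+    # module into a weight-decay group and a no-decay group holding biases
+    # and LayerNorm weights, per the usual BERT recipe; lmheads.bias is
+    # appended to a no-decay group below.)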
+
+    while isinstance(model, (DDP, FP16_Module)):
+        model = model.module
+    layers = model.model.bert.encoder.layer
+    pooler = model.model.bert.pooler
+    lmheads = model.model.cls.predictions
+    nspheads = model.model.cls.seq_relationship
+    embeddings = model.model.bert.embeddings
+    param_groups = []
+    param_groups += list(get_params_for_weight_decay_optimization(layers))
+    param_groups += list(get_params_for_weight_decay_optimization(pooler))
+    param_groups += list(get_params_for_weight_decay_optimization(nspheads))
+    param_groups += list(get_params_for_weight_decay_optimization(embeddings))
+    param_groups += list(get_params_for_weight_decay_optimization(
+        lmheads.transform))
+    param_groups[1]['params'].append(lmheads.bias)
+
+    # Use Adam.
+    optimizer = Adam(param_groups,
+                     lr=args.lr, weight_decay=args.weight_decay)
+
+    # Wrap into fp16 optimizer.
+    if args.fp16:
+        optimizer = FP16_Optimizer(optimizer,
+                                   static_loss_scale=args.loss_scale,
+                                   dynamic_loss_scale=args.dynamic_loss_scale,
+                                   dynamic_loss_args={
+                                       'scale_window': args.loss_scale_window,
+                                       'min_scale': args.min_scale,
+                                       'delayed_shift': args.hysteresis})
+
+    return optimizer
+
+
+def get_learning_rate_scheduler(optimizer, args):
+    """Build the learning rate scheduler."""
+
+    # Add linear learning rate scheduler.
+    if args.lr_decay_iters is not None:
+        num_iters = args.lr_decay_iters
+    else:
+        num_iters = args.train_iters * args.epochs
+    init_step = -1
+    warmup_iter = args.warmup * num_iters
+    lr_scheduler = AnnealingLR(optimizer,
+                               start_lr=args.lr,
+                               warmup_iter=warmup_iter,
+                               num_iters=num_iters,
+                               decay_style=args.lr_decay_style,
+                               last_iter=init_step)
+
+    return lr_scheduler
+
+
+def setup_model_and_optimizer(args, tokenizer):
+    """Setup model and optimizer."""
+
+    model = get_model(tokenizer, args)
+    optimizer = get_optimizer(model, args)
+    lr_scheduler = get_learning_rate_scheduler(optimizer, args)
+    criterion = torch.nn.CrossEntropyLoss(reduce=False, ignore_index=-1)
+
+    if args.load is not None:
+        epoch, i, total_iters = load_checkpoint(model, optimizer,
+                                                lr_scheduler, args)
+        if args.resume_dataloader:
+            args.epoch = epoch
+            args.mid_epoch_iters = i
+            args.total_iters = total_iters
+
+    return model, optimizer, lr_scheduler, criterion
+
+
+def get_batch(data):
+    ''' get_batch unpacks one batch dict produced by the BERT data loader and
+    moves every field onto the current CUDA device. The expected keys are:
+        'text':        token ids, [batch_size, seq_length]
+        'types':       token type (segment) ids
+        'is_random':   next-sentence-prediction labels
+        'mask':        float mask selecting the masked-LM loss positions
+        'mask_labels': target token ids at the masked positions
+        'pad_mask':    byte mask marking padding positions with 1
+    Despite the name, no sequence subdivision happens here; batching,
+    masking, and padding were already handled by the data loader.
+    '''
+    tokens = torch.autograd.Variable(data['text'].long())
+    types = torch.autograd.Variable(data['types'].long())
+    next_sentence = torch.autograd.Variable(data['is_random'].long())
+    loss_mask = torch.autograd.Variable(data['mask'].float())
+    lm_labels = torch.autograd.Variable(data['mask_labels'].long())
+    padding_mask = torch.autograd.Variable(data['pad_mask'].byte())
+    # Move to cuda
+    tokens = tokens.cuda()
+    types = types.cuda()
+    next_sentence = next_sentence.cuda()
+    loss_mask = loss_mask.cuda()
+    lm_labels = lm_labels.cuda()
+    padding_mask = padding_mask.cuda()
+
+    return tokens, types, next_sentence, loss_mask, lm_labels, padding_mask
+
+
+def forward_step(data, model, criterion, args):
+    """Forward step."""
+
+    # Get the batch.
+    tokens, types, next_sentence, loss_mask, lm_labels, \
+        padding_mask = get_batch(data)
+    # Forward model. pad_mask marks padding with 1, while the model expects
+    # 1 at positions to attend to, hence 1 - padding_mask.
+    output, nsp = model(tokens, types, 1 - padding_mask,
+                        checkpoint_activations=args.checkpoint_activations)
+    nsp_loss = criterion(nsp.view(-1, 2).contiguous().float(),
+                         next_sentence.view(-1).contiguous()).mean()
+    losses = criterion(output.view(-1, args.data_size).contiguous().float(),
+                       lm_labels.contiguous().view(-1).contiguous())
+    loss_mask = loss_mask.contiguous()
+    loss_mask = loss_mask.view(-1)
+    lm_loss = torch.sum(
+        losses * loss_mask.view(-1).float()) / loss_mask.sum()
+
+    return lm_loss, nsp_loss
+
+
+def backward_step(optimizer, model, lm_loss, nsp_loss, args):
+    """Backward step."""
+
+    # Total loss.
+    loss = lm_loss + nsp_loss
+
+    # Backward pass.
+    optimizer.zero_grad()
+    if args.fp16:
+        optimizer.backward(loss, update_master_grads=False)
+    else:
+        loss.backward()
+
+    # Reduce across processes.
+    lm_loss_reduced = lm_loss
+    nsp_loss_reduced = nsp_loss
+    if args.world_size > 1:
+        reduced_losses = torch.cat((lm_loss.view(1), nsp_loss.view(1)))
+        torch.distributed.all_reduce(reduced_losses.data)
+        reduced_losses.data = reduced_losses.data / args.world_size
+        model.allreduce_params(reduce_after=False,
+                               fp32_allreduce=args.fp32_allreduce)
+        lm_loss_reduced = reduced_losses[0]
+        nsp_loss_reduced = reduced_losses[1]
+
+    # Update master gradients.
+    if args.fp16:
+        optimizer.update_master_grads()
+
+    # Clipping gradients helps prevent exploding gradients.
+    if args.clip_grad > 0:
+        if not args.fp16:
+            torch.nn.utils.clip_grad_norm(model.parameters(), args.clip_grad)
+        else:
+            optimizer.clip_master_grads(args.clip_grad)
+
+    return lm_loss_reduced, nsp_loss_reduced
+
+
+def train_step(input_data, model, criterion, optimizer, lr_scheduler, args):
+    """Single training step."""
+
+    # Forward model for one step.
+    lm_loss, nsp_loss = forward_step(input_data, model, criterion, args)
+
+    # Calculate gradients, reduce across processes, and clip.
+    lm_loss_reduced, nsp_loss_reduced = backward_step(optimizer, model, lm_loss,
+                                                      nsp_loss, args)
+
+    # Update parameters.
+    optimizer.step()
+
+    # Update learning rate.
+    skipped_iter = 0
+    if not (args.fp16 and optimizer.overflow):
+        lr_scheduler.step()
+    else:
+        skipped_iter = 1
+
+    return lm_loss_reduced, nsp_loss_reduced, skipped_iter
+
+
+def train_epoch(epoch, model, optimizer, train_data,
+                lr_scheduler, criterion, timers, args):
+    """Train one full epoch."""
+
+    # Turn on training mode which enables dropout.
+    model.train()
+
+    # Tracking loss.
+    total_lm_loss = 0.0
+    total_nsp_loss = 0.0
+
+    # Iterations.
+    max_iters = args.train_iters
+    iteration = 0
+    skipped_iters = 0
+    if args.resume_dataloader:
+        iteration = args.mid_epoch_iters
+        args.resume_dataloader = False
+
+    # Data iterator.
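+    # (Only the iteration counter is restored when resuming mid-epoch; main()
+    # fast-forwards the sampler itself via train_data.batch_sampler.start_iter.)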
+
+    data_iterator = iter(train_data)
+
+    timers('interval time').start()
+    while iteration < max_iters:
+
+        lm_loss, nsp_loss, skipped_iter = train_step(next(data_iterator),
+                                                     model,
+                                                     criterion,
+                                                     optimizer,
+                                                     lr_scheduler,
+                                                     args)
+        skipped_iters += skipped_iter
+        iteration += 1
+
+        # Update losses.
+        total_lm_loss += lm_loss.data.detach().float()
+        total_nsp_loss += nsp_loss.data.detach().float()
+
+        # Logging.
+        if iteration % args.log_interval == 0:
+            learning_rate = optimizer.param_groups[0]['lr']
+            avg_nsp_loss = total_nsp_loss.item() / args.log_interval
+            avg_lm_loss = total_lm_loss.item() / args.log_interval
+            elapsed_time = timers('interval time').elapsed()
+            log_string = ' epoch{:2d} |'.format(epoch)
+            log_string += ' iteration {:8d}/{:8d} |'.format(iteration,
+                                                            max_iters)
+            log_string += ' elapsed time per iteration (ms): {:.1f} |'.format(
+                elapsed_time * 1000.0 / args.log_interval)
+            log_string += ' learning rate {:.3E} |'.format(learning_rate)
+            log_string += ' lm loss {:.3E} |'.format(avg_lm_loss)
+            log_string += ' nsp loss {:.3E} |'.format(avg_nsp_loss)
+            if args.fp16:
+                log_string += ' loss scale {:.1f} |'.format(
+                    optimizer.loss_scale)
+            print(log_string, flush=True)
+            total_nsp_loss = 0.0
+            total_lm_loss = 0.0
+
+        # Checkpointing.
+        if args.save and args.save_iters and iteration % args.save_iters == 0:
+            total_iters = args.train_iters * (epoch-1) + iteration
+            model_suffix = 'model/%d.pt' % (total_iters)
+            save_checkpoint(model_suffix, epoch, iteration, model, optimizer,
+                            lr_scheduler, args)
+
+    return iteration, skipped_iters
+
+
+def evaluate(data_source, model, criterion, args):
+    """Evaluation."""
+
+    # Turn on evaluation mode which disables dropout.
+    model.eval()
+
+    total_lm_loss = 0
+    total_nsp_loss = 0
+    max_iters = args.eval_iters
+
+    with torch.no_grad():
+        data_iterator = iter(data_source)
+        iteration = 0
+        while iteration < max_iters:
+            # Forward evaluation.
+            lm_loss, nsp_loss = forward_step(next(data_iterator), model,
+                                             criterion, args)
+            # Reduce across processes.
+            if isinstance(model, DDP):
+                reduced_losses = torch.cat((lm_loss.view(1), nsp_loss.view(1)))
+                torch.distributed.all_reduce(reduced_losses.data)
+                reduced_losses.data = reduced_losses.data / args.world_size
+                lm_loss = reduced_losses[0]
+                nsp_loss = reduced_losses[1]
+
+            total_lm_loss += lm_loss.data.detach().float().item()
+            total_nsp_loss += nsp_loss.data.detach().float().item()
+            iteration += 1
+
+    # Move the model back to train mode.
+    model.train()
+
+    total_lm_loss /= max_iters
+    total_nsp_loss /= max_iters
+    return total_lm_loss, total_nsp_loss
+
+
+def initialize_distributed(args):
+    """Initialize torch.distributed."""
+
+    # Manually set the device ids.
+    device = args.rank % torch.cuda.device_count()
+    if args.local_rank is not None:
+        device = args.local_rank
+    torch.cuda.set_device(device)
+    # Call the init process.
+    if args.world_size > 1:
+        init_method = 'tcp://'
+        master_ip = os.getenv('MASTER_ADDR', 'localhost')
+        master_port = os.getenv('MASTER_PORT', '6000')
+        init_method += master_ip + ':' + master_port
+        torch.distributed.init_process_group(
+            backend=args.distributed_backend,
+            world_size=args.world_size, rank=args.rank,
+            init_method=init_method)
+
+
+def set_random_seed(seed):
+    """Set random seed for reproducibility."""
+
+    if seed is not None and seed > 0:
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+
+
+def main():
+    """Main training program."""
+
+    print('Pretrain BERT model')
+
+    # Disable CuDNN.
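+    # (cuDNN kernel selection can be nondeterministic; keeping it disabled
+    # trades some speed for more reproducible runs.)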
+    torch.backends.cudnn.enabled = False
+
+    # Timer.
+    timers = Timers()
+
+    # Arguments.
+    args = get_args()
+
+    # PyTorch distributed.
+    initialize_distributed(args)
+
+    # Random seeds for reproducibility.
+    set_random_seed(args.seed)
+
+    # Data.
+    data_config = configure_data()
+    data_config.set_defaults(data_set_type='BERT', transpose=False)
+    (train_data, val_data, test_data), tokenizer = data_config.apply(args)
+    args.data_size = tokenizer.num_tokens
+
+    # Model, optimizer, and learning rate.
+    model, optimizer, lr_scheduler, criterion = setup_model_and_optimizer(
+        args, tokenizer)
+
+    # At any point you can hit Ctrl + C to break out of training early.
+    try:
+        total_iters = 0
+        skipped_iters = 0
+        start_epoch = 1
+        best_val_loss = float('inf')
+        # Resume data loader if necessary.
+        if args.resume_dataloader:
+            start_epoch = args.epoch
+            total_iters = args.total_iters
+            train_data.batch_sampler.start_iter = total_iters % len(train_data)
+        # For all epochs.
+        for epoch in range(start_epoch, args.epochs+1):
+            timers('epoch time').start()
+            iteration, skipped = train_epoch(epoch, model, optimizer,
+                                             train_data, lr_scheduler,
+                                             criterion, timers, args)
+            elapsed_time = timers('epoch time').elapsed()
+            total_iters += iteration
+            skipped_iters += skipped
+            lm_loss, nsp_loss = evaluate(val_data, model, criterion, args)
+            val_loss = lm_loss + nsp_loss
+            print('-' * 100)
+            print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:.4E} | '
+                  'valid LM Loss {:.4E} | valid NSP Loss {:.4E}'.format(
+                      epoch, elapsed_time, val_loss, lm_loss, nsp_loss))
+            print('-' * 100)
+            if val_loss < best_val_loss:
+                best_val_loss = val_loss
+                if args.save:
+                    best_path = 'best/model.pt'
+                    print('saving best model to:',
+                          os.path.join(args.save, best_path))
+                    save_checkpoint(best_path, epoch+1, total_iters, model,
+                                    optimizer, lr_scheduler, args)
+
+    except KeyboardInterrupt:
+        print('-' * 100)
+        print('Exiting from training early')
+        if args.save:
+            cur_path = 'current/model.pt'
+            print('saving current model to:',
+                  os.path.join(args.save, cur_path))
+            save_checkpoint(cur_path, epoch, total_iters, model, optimizer,
+                            lr_scheduler, args)
+        exit()
+
+    if args.save:
+        final_path = 'final/model.pt'
+        print('saving final model to:', os.path.join(args.save, final_path))
+        save_checkpoint(final_path, args.epochs, total_iters, model, optimizer,
+                        lr_scheduler, args)
+
+    if test_data is not None:
+        # Run on test data.
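+        # (Reuses evaluate(), so test losses are averaged over args.eval_iters
+        # batches, the same as validation.)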
+        print('entering test')
+        lm_loss, nsp_loss = evaluate(test_data, model, criterion, args)
+        test_loss = lm_loss + nsp_loss
+        print('=' * 100)
+        print('| End of training | test loss {:5.4f} | test LM Loss {:.4E} |'
+              ' test NSP Loss {:.4E}'.format(test_loss, lm_loss, nsp_loss))
+        print('=' * 100)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b4eb4b4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+nltk>=3.4
+numpy>=1.15.4
+pandas>=0.24.0
+sentencepiece>=0.1.8
+tensorflow>=1.12.0
diff --git a/scripts/pretrain_bert.sh b/scripts/pretrain_bert.sh
new file mode 100755
index 0000000..fec4d2e
--- /dev/null
+++ b/scripts/pretrain_bert.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+RANK=0
+WORLD_SIZE=1
+
+python pretrain_bert.py \
+    --batch-size 4 \
+    --tokenizer-type BertWordPieceTokenizer \
+    --cache-dir cache_dir \
+    --tokenizer-model-type bert-large-uncased \
+    --vocab-size 30522 \
+    --train-data wikipedia \
+    --loose-json \
+    --text-key text \
+    --split 1000,1,1 \
+    --lazy-loader \
+    --max-preds-per-seq 80 \
+    --seq-length 512 \
+    --max-position-embeddings 512 \
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --intermediate-size 4096 \
+    --num-attention-heads 16 \
+    --hidden-dropout 0.1 \
+    --attention-dropout 0.1 \
+    --train-iters 1000000 \
+    --lr 0.0001 \
+    --lr-decay-style linear \
+    --lr-decay-iters 990000 \
+    --warmup .01 \
+    --weight-decay 1e-2 \
+    --clip-grad 1.0 \
+    --fp16 \
+    --fp32-layernorm \
+    --fp32-embedding \
+    --hysteresis 2 \
+    --num-workers 2
diff --git a/scripts/pretrain_bert_distributed.sh b/scripts/pretrain_bert_distributed.sh
new file mode 100755
index 0000000..781c7d5
--- /dev/null
+++ b/scripts/pretrain_bert_distributed.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+WORLD_SIZE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+
+python -m torch.distributed.launch $DISTRIBUTED_ARGS \
+    pretrain_bert.py \
+    --batch-size 4 \
+    --tokenizer-type BertWordPieceTokenizer \
+    --cache-dir cache_dir \
+    --tokenizer-model-type bert-large-uncased \
+    --vocab-size 30522 \
+    --train-data wikipedia \
+    --loose-json \
+    --text-key text \
+    --split 1000,1,1 \
+    --lazy-loader \
+    --max-preds-per-seq 80 \
+    --seq-length 512 \
+    --max-position-embeddings 512 \
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --intermediate-size 4096 \
+    --num-attention-heads 16 \
+    --hidden-dropout 0.1 \
+    --attention-dropout 0.1 \
+    --train-iters 1000000 \
+    --lr 0.0001 \
+    --lr-decay-style linear \
+    --lr-decay-iters 990000 \
+    --warmup .01 \
+    --weight-decay 1e-2 \
+    --clip-grad 1.0 \
+    --fp16 \
+    --fp32-layernorm \
+    --fp32-embedding \
+    --hysteresis 2 \
+    --num-workers 2
diff --git a/scripts/pretrain_bert_sentencepiece.sh b/scripts/pretrain_bert_sentencepiece.sh
new file mode 100755
index 0000000..b659e38
--- /dev/null
+++ b/scripts/pretrain_bert_sentencepiece.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+RANK=0
+WORLD_SIZE=1
+
+python pretrain_bert.py \
+    --batch-size 4 \
+    --tokenizer-type SentencePieceTokenizer \
+    --tokenizer-model-type bpe \
+    --tokenizer-path tokenizer.model \
+    --vocab-size 30522 \
+    --train-data wikipedia \
+    --loose-json \
+    --text-key text \
+    --split 1000,1,1 \
+    --lazy-loader \
+    --max-preds-per-seq 80 \
+    --seq-length 512 \
+    --max-position-embeddings 512 \
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --intermediate-size 4096 \
+    --num-attention-heads 16 \
+    --hidden-dropout 0.1 \
+    --attention-dropout 0.1 \
+    --train-iters 1000000 \
+    --lr 0.0001 \
+    --lr-decay-style linear \
+    --lr-decay-iters 990000 \
+    --warmup .01 \
+    --weight-decay 1e-2 \
+    --clip-grad 1.0 \
+    --fp16 \
+    --fp32-layernorm \
+    --fp32-embedding \
+    --hysteresis 2 \
+    --num-workers 2
diff --git a/scripts/pretrain_bert_tfrecords_distributed.sh b/scripts/pretrain_bert_tfrecords_distributed.sh
new file mode 100755
index 0000000..cb52ba5
--- /dev/null
+++ b/scripts/pretrain_bert_tfrecords_distributed.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+WORLD_SIZE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+
+python -m torch.distributed.launch $DISTRIBUTED_ARGS \
+    pretrain_bert.py \
+    --batch-size 4 \
+    --tokenizer-type BertWordPieceTokenizer \
+    --cache-dir cache_dir \
+    --tokenizer-model-type bert-large-uncased \
+    --vocab-size 30522 \
+    --use-tfrecords \
+    --train-data \
+    --valid-data \
+    --test-data \
+    --max-preds-per-seq 80 \
+    --seq-length 512 \
+    --max-position-embeddings 512 \
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --intermediate-size 4096 \
+    --num-attention-heads 16 \
+    --hidden-dropout 0.1 \
+    --attention-dropout 0.1 \
+    --train-iters 1000000 \
+    --lr 0.0001 \
+    --lr-decay-style linear \
+    --lr-decay-iters 990000 \
+    --warmup .01 \
+    --weight-decay 1e-2 \
+    --clip-grad 1.0 \
+    --fp16 \
+    --fp32-layernorm \
+    --fp32-embedding \
+    --hysteresis 2 \
+    --num-workers 2
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..b9bd689
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,180 @@
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for logging and serialization"""
+
+import os
+import random
+import time
+import numpy as np
+import torch
+
+
+class Timers:
+    """Group of timers."""
+
+    class Timer:
+        """Timer."""
+
+        def __init__(self, name):
+            self.name_ = name
+            self.elapsed_ = 0.0
+            self.started_ = False
+            self.start_time = time.time()
+
+        def start(self):
+            """Start the timer."""
+            assert not self.started_, 'timer has already been started'
+            torch.cuda.synchronize()
+            self.start_time = time.time()
+            self.started_ = True
+
+        def stop(self):
+            """Stop the timer."""
+            assert self.started_, 'timer is not started'
+            torch.cuda.synchronize()
+            self.elapsed_ += (time.time() - self.start_time)
+            self.started_ = False
+
+        def reset(self):
+            """Reset timer."""
+            self.elapsed_ = 0.0
+            self.started_ = False
+
+        def elapsed(self, reset=True):
+            """Calculate the elapsed time."""
+            started_ = self.started_
+            # If timing is in progress, stop it first.
+            if self.started_:
+                self.stop()
+            # Get the elapsed time.
+            elapsed_ = self.elapsed_
+            # Reset the elapsed time.
+            if reset:
+                self.reset()
+            # If timing was in progress, set it back.
+            if started_:
+                self.start()
+            return elapsed_
+
+    def __init__(self):
+        self.timers = {}
+
+    def __call__(self, name):
+        if name not in self.timers:
+            self.timers[name] = self.Timer(name)
+        return self.timers[name]
+
+    def log(self, names, normalizer=1.0, reset=True):
+        """Log a group of timers."""
+        assert normalizer > 0.0
+        string = 'time (ms)'
+        for name in names:
+            elapsed_time = self.timers[name].elapsed(
+                reset=reset) * 1000.0 / normalizer
+            string += ' | {}: {:.2f}'.format(name, elapsed_time)
+        print(string, flush=True)
+
+
+def report_memory(name):
+    """Simple GPU memory report."""
+
+    mega_bytes = 1024.0 * 1024.0
+    string = name + ' memory (MB)'
+    string += ' | allocated: {}'.format(
+        torch.cuda.memory_allocated() / mega_bytes)
+    string += ' | max allocated: {}'.format(
+        torch.cuda.max_memory_allocated() / mega_bytes)
+    string += ' | cached: {}'.format(torch.cuda.memory_cached() / mega_bytes)
+    string += ' | max cached: {}'.format(
+        torch.cuda.max_memory_cached() / mega_bytes)
+    print(string, flush=True)
+
+
+def load_checkpoint(model, optimizer, lr_scheduler, args):
+    """Load a model checkpoint."""
+
+    checkpoint_path = args.load
+    model_path = checkpoint_path
+    model_sd = torch.load(model_path, map_location='cpu')
+    total_iters = model_sd['total_iters']
+    epoch = model_sd['epoch']
+    i = model_sd['mid_epoch_iters']
+    model.load_state_dict(model_sd['sd'])
+
+    checkpoint_path = os.path.dirname(checkpoint_path)
+    if args.load_optim:
+        optim_path = os.path.join(checkpoint_path, 'optim.pt')
+        optim_sd, lr_sd = torch.load(optim_path, map_location='cpu')
+        optimizer.load_state_dict(optim_sd)
+        lr_scheduler.load_state_dict(lr_sd)
+    elif args.fp16:
+        optimizer._model_params_to_master_params()
+
+    rng_path = None
+    if args.load_rng:
+        rng_path = os.path.join(checkpoint_path, 'rng.pt')
+    if args.load_all_rng:
+        rng_path = os.path.join(checkpoint_path,
+                                'rng.%d.pt' % (torch.distributed.get_rank()))
+    if rng_path is not None:
+        rng_state = torch.load(rng_path)
+        torch.cuda.set_rng_state(rng_state[0])
+        torch.set_rng_state(rng_state[1])
+        np.random.set_state(rng_state[2])
+        random.setstate(rng_state[3])
+
+    return epoch, i, total_iters
+
+
+def save_checkpoint(model_suffix, epoch, i, model, optimizer, lr_scheduler, args):
+    """Save a model checkpoint."""
+
+    model_path = os.path.join(args.save, model_suffix)
+    checkpoint_dir = os.path.dirname(model_path)
+    rng_state = (torch.cuda.get_rng_state(),
+                 torch.get_rng_state(),
+                 np.random.get_state(),
+                 random.getstate())
+    # Only rank 0 writes the shared checkpoint files; other ranks wait for
+    # the checkpoint directory to appear.
+    if not (torch.distributed.is_initialized() and \
+            torch.distributed.get_rank() > 0):
+        if not os.path.exists(checkpoint_dir):
+            os.makedirs(checkpoint_dir)
+        total_iters = args.train_iters * (epoch-1) + i
+        sd = {'sd': model.state_dict()}
+        sd['total_iters'] = total_iters
+        sd['epoch'] = epoch
+        sd['mid_epoch_iters'] = i
+        torch.save(sd, model_path)
+        print('saved', model_path)
+
+        if args.save_optim:
+            optim_path = os.path.join(checkpoint_dir, 'optim.pt')
+            torch.save((optimizer.state_dict(),
+                        lr_scheduler.state_dict()), optim_path)
+            print('saved', optim_path)
+
+        if args.save_rng:
+            rng_path = os.path.join(checkpoint_dir, 'rng.pt')
+            torch.save(rng_state, rng_path)
+            print('saved', rng_path)
+    else:
+        while not os.path.exists(checkpoint_dir):
+            time.sleep(1)
+    # Every rank dumps its own RNG state.
+    if args.save_all_rng:
+        rng_path = os.path.join(checkpoint_dir,
+                                'rng.%d.pt' % (torch.distributed.get_rank()))
+        torch.save(rng_state, rng_path)
+        print('saved', rng_path)