From 6cc385215b0c834dc2076f107293d958c0cdf405 Mon Sep 17 00:00:00 2001 From: Stephane Aroca-Ouellette Date: Sun, 20 Sep 2020 18:01:00 -0600 Subject: [PATCH] Finalize code for public release (with files included...) --- LICENSE | 2 +- README.md | 106 ++++++----------- arguments.py | 2 +- convert_state_dict.py | 5 + data_utils/corpora.py | 2 +- data_utils/datasets.py | 8 +- data_utils/make_dataset.py | 97 +++++++++++----- evaluate/config/defaults.conf | 1 - evaluate/config/test_bert.conf | 2 +- evaluate/evaluate.py | 108 +++++++++-------- idf.py | 6 +- model/new_models.py | 10 +- paths.py | 6 +- pretrain_bert.py | 4 +- requirements.txt | 205 ++++++++++++++++++++++++++++++++- scripts/pretrain_bert.sh | 28 ----- 16 files changed, 382 insertions(+), 210 deletions(-) diff --git a/LICENSE b/LICENSE index cb87378..010f781 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -------------- LICENSE FOR huggingface(transformer) repository -------------- +------------- LICENSE FOR StephAO(sentence_encoder) repository -------------- Apache License diff --git a/README.md b/README.md index 932950b..fd9c2ab 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,15 @@ -Combines +Code for "On Losses for Modern Language Models" (#TODO link paper) + +This repository is primarily for reproducibility and posterity. It is not maintained. + +Thank you to NVIDIA and NYU's jiant group for their code which helped create the base of this repo. Specifically https://github.com/NVIDIA/Megatron-LM/commits/master (commit 0399d32c75b4719c89b91c18a173d05936112036) and https://github.com/nyu-mll/jiant/commits/master (commit 14d9e3d294b6cb4a29b70325b2b993d5926fe668) -to get a BERT repo from pretraining to evaluation. - -To run pretraining on slurm (from outside this directory): -`srun --gres=gpu:1 -c 8 --mem=12G -p gpu bash Megatron-LM/scripts/pretrain_bert.sh > output.txt &` - -To run evaluation on slurm (from outside this directory): -`srun --gres=gpu:1 -c 8 --mem=12G -p gpu python3 -m Megatron-LM.evaluate.main --config_file test_bert.conf &` +were used. # Setup -We officially support only python3.6. - -To use this repo please install the latest supported versions of PyTorch with GPU support. - -Additionally, part of this codebase leverages tensorflow-cpu to perform dataloading of TFRecords. We recommend creating a virtual environment (to avoid breaking existing tf installations) and install our `reuirements.txt`. +Only tested on python3.6. ``` python -m pip install virtualenv @@ -26,72 +20,42 @@ pip install -r requirements.txt # Usage -We've provided 4 scripts that pretrain BERT. All saved checkpoints can be used for finetuning according to [existing implementations](https://github.com/huggingface). Save model checkpoints with `--save`. - -## BERT Pretraining -`bash scripts/pretrain_bert.sh` - -This script runs single gpu BERT pretraining and is mainly for debugging purposes. - -To use this script place your `--train-data` in loose json format with one json per line. The text field of your json dictionaries should correspond to `--text-key`. 
- -``` -python pretrain_bert.py \ - --batch-size 4 \ - --tokenizer-type BertWordPieceTokenizer \ - --cache-dir temp_cache_dir \ - --tokenizer-model-type bert-large-uncased \ - --vocab-size 30522 \ - --train-data wikipedia \ - --presplit-sentences \ - --loose-json \ - --text-key text \ - --split 1000,1,1 \ - --lazy-loader \ - --max-preds-per-seq 80 \ - --seq-length 512 \ - --max-position-embeddings 512 \ - --num-layers 24 \ - --hidden-size 1024 \ - --intermediate-size 4096 \ - --num-attention-heads 16 \ - --hidden-dropout 0.1 \ - --attention-dropout 0.1 \ - --train-iters 1000000 \ - --lr 0.0001 \ - --lr-decay-style linear \ - --lr-decay-iters 990000 \ - --warmup .01 \ - --weight-decay 1e-2 \ - --clip-grad 1.0 \ - --fp16 \ - --fp32-layernorm \ - --fp32-embedding \ - --hysteresis 2 \ - --num-workers 2 -``` +The code enables pre-training a transformer (size specified in bert_config.json) using any combination of the following tasks (aka modes/losses): +"mlm", "nsp", "psp", "sd", "so", "rg", "fs", "tc", "sc", "sbo", "wlen", "cap", "tf", "tf_idf", or "tgs". See paper for details regarding the modes. +NOTE: PSP (previous sentence prediction) is equivalent to ASP (adjacent sentence prediction) from the paper. RG (referential game) is equivalent to QT (quick thoughts variant) from the paper. -## Distributed BERT Pretraining -`bash scripts/pretrain_bert_distributed.sh` +They can be combined using any of the following methods: +- Summing all losses (default, incompatible between a small subset of tasks, see paper for more detail) +- Continuous Multi-Task Learning, based on ERNIE 2.0 (--continual-learning True) +- Alternating between losses (--alternating True) -To use this script, follow the same data preparation procedure as in [earlier sections](#bert-pretraining). This script uses the pytorch distributed launcher to launch distributed training. As such, multinode training can be achieved by properly setting environment variables for the `env://` init method. See the official pytorch [documentation](https://pytorch.org/docs/stable/distributed.html#launch-utility) for further description of these [environment variables](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization). By default multinode training uses the nccl distributed backend. +With the following modifiers: +- Always using MLM loss (--always-mlm True, which is the default and highly recommended, see paper for more details) +- Incrementally add tasks each epoch (--incremental) +- Use data formatting for tasks, but zero out losses from auxiliary tasks (--no-aux True, not recommended, used for testing) -## Distributed BERT Pretraining with TFRecords -`bash scripts/pretrain_bert_tfrecords_distributed.sh` +Set paths to read/save/load from in paths.py -This script takes advantage of TensorFlow BERT's [`create_pretraining.py`](https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/LanguageModeling/BERT/create_pretraining_data.py) script to pre-cache the dataset in the TFRecord format. To convert the data to pytorch tensors we use a `TFRecordDataset` and tensorflow eager mode to turn the TFRecords into numpy matrices before loading them into pytorch gpu tensors. This greatly reduces the overhead of dataprocessing and speeds up training. Pass a whitespace-separated list of TFRecord paths to `--train-data` and enable the `--use-tfrecords` flag. Multinode training can be achieved as described in the [previous section](#distributed-bert-pretraining). 
+To create datasets, see data_utils/make_dataset.py

-## Train Custom Sentence Piece Tokenizer and Pretrain BERT
-`bash scripts/pretrain_bert_sentencepiece.sh`
+For tf_idf prediction, you first need to calculate the idf scores for your dataset. See idf.py for a script to do this.

-This script runs BERT pretraining with a `sentencepiece` tokenizer. If no sentencepiece tokenizer exists at `--tokenizer-path` one will be trained automatically. The sentencepiece tokenizer can be used with the previous scripts (NOTE: sentencepiece training can only happen during single gpu pretraining). `<--tokenizer-path>.vocab` can be used with [`create_pretraining_data.py`](https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/LanguageModeling/BERT/create_pretraining_data.py) to make a TFRecord dataset with the given tokenization.
+## Pre-training
+To run pretraining:
+`bash sentence_encoders/scripts/pretrain_bert.sh --model-type [model type]`
+Where model type is the name of the model you want to train. If model type is one of the modes, it will train using mlm and that mode (if model type is mlm, it will train using just mlm).
+The --modes argument will override this default behaviour. If model type is not a specified mode, the --modes argument is required. See the example at the end of this README for a fuller invocation.
+## Distributed Pretraining
+Use `pretrain_bert_distributed.sh` instead:
+`bash sentence_encoders/scripts/pretrain_bert_distributed.sh --model-type [model type]`

-# Collecting Wikipedia Training Data
-We recommend following the wikipedia data extraction process specified by google research: "the recommended pre-processing is to download [the latest dump](https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2), extract the text with [WikiExtractor.py](https://github.com/attardi/wikiextractor), and then apply any necessary cleanup to convert it into plain text."
+## Evaluation
+To run evaluation:
+First convert the saved state dict of the desired model using convert_state_dict.py.
+Then run:
+`python3 -m sentence_encoders.evaluate.main --exp_name [experiment name]`
+Where experiment name is the same as the model type above. If using a saved checkpoint instead of the best model, use the --checkpoint argument.

-We recommend using the `--json` argument when using WikiExtractor, which will dump the wikipedia data into loose json format (one json per line), making it more manageable and readily consumable by our codebase. We recommend further preprocessing this json dataset by preprocessing the dataset with nltk punctuation standardization, and presplitting each document into newline separated sentences. This can be done with the provided script `./scripts/presplit_sentences_json.py` and will allow for faster data processing during training time. Pretraining with presplit data should be run with the `--presplit-sentences` flag as shown above.
-Once the json dataset is ready make sure to set the path in line 27 of `data_utils/corpora.py`.
-If your system is memory limited we also recommend running pretraining with the `--lazy-loader` argument as we've done. After preprocessing the dataset once, this will allow the dataset to be lazily loaded from disk, as opposed to storing it in memory.
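+
+## Example
+The following is an illustrative single-GPU sketch rather than the configuration from the paper: the model type, dataset, batch size, and save path are placeholders, and arguments.py lists all available flags.
+```
+bash sentence_encoders/scripts/pretrain_bert.sh \
+       --model-type so \
+       --batch-size 32 \
+       --train-data wikipedia \
+       --epochs 2 \
+       --save /path/to/checkpoints \
+       --track-results True
+```
+Since so is one of the modes, this trains mlm together with so by default; pass --modes explicitly to choose a different combination.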
diff --git a/arguments.py b/arguments.py index 2532427..9ab8e47 100644 --- a/arguments.py +++ b/arguments.py @@ -155,7 +155,7 @@ def add_training_args(parser): const=True, default=False, help='If true, train new and old losses separately.') group.add_argument('--always-mlm', type=str2bool, nargs='?', - const=True, default=False, + const=True, default=True, help='If true, train new and old losses separately.') group.add_argument('--no-aux', action='store_true', help='If true, zero out all aux loss.') diff --git a/convert_state_dict.py b/convert_state_dict.py index 8fe6f5a..f1f9ab8 100644 --- a/convert_state_dict.py +++ b/convert_state_dict.py @@ -1,3 +1,8 @@ +""" +Script use to change the key names of state dicts so that it can be properly loaded in the evaluation code. +Kind of a hack, but it works... +""" + import sys import torch diff --git a/data_utils/corpora.py b/data_utils/corpora.py index f7a9bdb..4bc5981 100755 --- a/data_utils/corpora.py +++ b/data_utils/corpora.py @@ -42,7 +42,7 @@ class bookcorpus(json_dataset): command line usage: `--train-data wikipedia` """ - PATH = os.path.join("/h/stephaneao/bookcorpus.lazy") + PATH = os.path.join(train_data_path, "bookcorpus.lazy") assert_str = "make sure to set PATH at line 27 of data_utils/corpora.py" def __init__(self, **kwargs): assert bookcorpus.PATH != '', \ diff --git a/data_utils/datasets.py b/data_utils/datasets.py index 7b02db5..2b1873f 100644 --- a/data_utils/datasets.py +++ b/data_utils/datasets.py @@ -910,11 +910,11 @@ def shuffle_trigrams(self, tokens, token_types, token_labels, i, rng): if self.trigram_shuffle_rate == 0: return [] - ngram = 2 + ngram = 3 # 6 permutations (ngram = 3) - #classes = {0: [2, 1, 0], 1: [0, 2, 1], 2: [1, 0, 2], 3: [1, 2, 0], 4: [2, 0, 1], 5: [0, 1, 2]} + classes = {0: [2, 1, 0], 1: [0, 2, 1], 2: [1, 0, 2], 3: [1, 2, 0], 4: [2, 0, 1], 5: [0, 1, 2]} # 2 permutations (ngram = 2) - classes = {0: [1, 0], 1: [0, 1]} + #classes = {0: [1, 0], 1: [0, 1]} labels = [] mask = [] idx = 0 @@ -929,7 +929,7 @@ def shuffle_trigrams(self, tokens, token_types, token_labels, i, rng): if valid_seq_len >= ngram and rng.random() < self.trigram_shuffle_rate: valid_seq_len = 0 # Shuffle - label = rng.randint(0,1) + label = rng.randint(0,5) perm = classes[label] tokens[i][idx - (ngram - 1) : idx + 1] = [tokens[i][idx - p] for p in perm] token_types[i][idx - (ngram - 1) : idx + 1] = [token_types[i][idx - p] for p in perm] diff --git a/data_utils/make_dataset.py b/data_utils/make_dataset.py index a69a595..0894804 100644 --- a/data_utils/make_dataset.py +++ b/data_utils/make_dataset.py @@ -1,3 +1,17 @@ +""" +Used to create dataset to train on. Processes data into the following format: +-Large byte file containing cleaned string data where each sentence is on a newline and each document is split by an +additional newline character. +-Pickle of list that contains the boundaries of each document to enable lazy and fast read of a random access document. + +This script will create the above dataset from existing datasets (useful if existing dataset is not cleaned/filtered or +if you want to split documents into smaller, more consistently sized chunks) or from text files (either one large +text file, or a directory containing many text files). + +Once created, you need to add the dataset to data_utils/corpora.py. You can then use it through the train-data argument. 
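+
+A minimal usage sketch (the dataset name and source list are placeholders; this mirrors the
+__main__ block at the bottom of this file):
+
+    with DatasetWriter("my_corpus", ['wikipedia', 'bookcorpus'],
+                       from_text_files=False, split_on_newlines=True) as dw:
+        dw.create()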
+""" + + from sentence_encoders import data_utils from multiprocessing import Pool from blingfire import text_to_sentences @@ -14,11 +28,13 @@ doc_separator = "\n".encode('utf-8') + def sentence_tokenize(tokenizer, sent): """tokenize sentence and get token types if tokens=True""" tokens = tokenizer.EncodeAsIds(sent).tokenization return tokens + def get_doc_len(s, tokenizer): toks = 0 words = len(s.split(' ')) @@ -31,12 +47,20 @@ def get_doc_len(s, tokenizer): return doc_len, toks -def process_document(document, max_doc_length, tokenizer=None): + +def process_document(document, doc_length, tokenizer=None): + """ + Filter, clean, and split document into doc_length sizes + :param document: Document to process + :param doc_length: Desired size of document chunks + :param tokenizer: Tokenizer to use if available (Bases doc length on token size instead of word size) + :return: Bytes chunks to write to file, respective lengths of those chunks, and document statistics. + """ str_lens = [] writes = [] required_sents = 4 - + if type(document) == str: document = document.split("\n") @@ -50,7 +74,7 @@ def process_document(document, max_doc_length, tokenizer=None): if float(len(string_document)) < 0.9 * len(' '.join(document)): return [], [], 0, 0, 0, 0 # Filter documents containing less than 10 words - if len(string_document.split(' ')) < 10: + if len(string_document.split(' ')) < 10: return [], [], 0, 0, 0, 0 # Filter documents containing less than 100 characters if len(string_document) < 100: @@ -81,8 +105,8 @@ def process_document(document, max_doc_length, tokenizer=None): # Translate some weird utf-8 characters to their more regular counterparts s = s.translate(DatasetWriter.transl_table) # Remove the rest of the weird utf-8 characters - #s = ''.join([chr(c) for c in s.encode('utf-8') if c < 128]) # [9,10,13] + list(range(32,127))]) - #s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', s) + # s = ''.join([chr(c) for c in s.encode('utf-8') if c < 128]) # [9,10,13] + list(range(32,127))]) + # s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', s) s = unidecode.unidecode(s) encoded = unicodedata.normalize('NFKD', s).encode('utf-8') # clean(s) doc_bytes += encoded @@ -94,7 +118,7 @@ def process_document(document, max_doc_length, tokenizer=None): num_toks += nt # Split if we've reached the max doc length and enough sentences left - if doc_len >= max_doc_length and num_sents >= required_sents and len(document) - i >= required_sents: + if doc_len >= doc_length and num_sents >= required_sents and len(document) - i >= required_sents: # Update stats doc_str = doc_bytes.decode('utf-8') num_words = len(doc_str.split(' ')) @@ -124,24 +148,27 @@ def process_document(document, max_doc_length, tokenizer=None): # Append write data writes += [doc_bytes + doc_separator] str_lens.append(str_cnt + 1) # + 1 for doc separator - - print("-"*100) + + print("-" * 100) for w in writes: print(w.decode('utf-8')) return writes, str_lens, tok_total, word_total, sentence_total, document_total + class DatasetWriter: + """ Class that reads in text file or existing datasets and processes them into desired format""" transl_table = dict([(ord(x), ord(y)) for x, y in zip(u"‘’´“”––-æ", u"'''\"\"---e")]) - def __init__(self, name, read_path, path_ext=None, max_doc_length=1024, preamble_len=100, from_text_files=False, split_on_newlines=False): + def __init__(self, name, read_path, path_ext=None, max_doc_length=1024, preamble_len=100, from_text_files=False, + split_on_newlines=False): """ :param name [string]: Name of the 
dataset :param read_path Union[string, List[string]]: If using text files, the base read path to the files, else a list of datasets :param path_ext List[string]: List of extensions to the read path if there are multiple subdirectories to get files from :param from_text_files [Bool]: Whether to use text files to read data or existing datasets - :param split_on_newlines [Bool]: If True, paragraphs are independent documents, if False, tiles are independent documents + :param split_on_newlines [Bool]: If True, paragraphs are independent documents, if False, files are independent documents """ self.read_path = read_path self.path_ext = path_ext @@ -151,7 +178,7 @@ def __init__(self, name, read_path, path_ext=None, max_doc_length=1024, preamble self.from_text_files = from_text_files self.doc_separator = "\n".encode('utf-8') self.name = name - self.base_path = "/h/stephaneao/" + self.base_path = ""#TOFILL self.lazy_path = os.path.join(self.base_path, self.name + ".lazy/") if not os.path.exists(self.lazy_path): os.makedirs(self.lazy_path) @@ -173,21 +200,21 @@ def create(self): self.str_lens = [] self.init_dataset_stats() doc_iter = self.dataset_iterator(self.read_path) if not self.from_text_files else \ - self.text_file_iterator(self.read_path, self.path_ext) + self.text_file_iterator(self.read_path, self.path_ext) for doc_info in doc_iter: if len(doc_info) == 1: - doc_info = doc_info[0] - writes, str_lens, toks, words, sents, documents = doc_info + doc_info = doc_info[0] + writes, str_lens, toks, words, sents, documents = doc_info if documents != len(str_lens): print("WTF?") print(writes) print(str_lens) print(documents) - exit(0) + exit(0) self.write_document(writes, str_lens) self.update_stats(toks, words, sents, documents) - - print("Number of lens:", len(self.str_lens)) + + print("Number of lens:", len(self.str_lens)) pkl.dump(self.str_lens, open(self.len_path, 'wb')) self.print_stats() @@ -198,24 +225,27 @@ def init_dataset_stats(self): self.document_total = 0 # self.short_q = queue.PriorityQueue(maxsize=5) # self.shortest_len = self.max_doc_length - + def print_stats(self): stat_str = "" stat_str += "Total number of tokens: {}\n".format(self.tok_total) stat_str += "Total number of words: {}\n".format(self.word_total) stat_str += "Total number of sentences: {}\n".format(self.sentence_total) stat_str += "Total number of documents: {}\n".format(self.document_total) - stat_str += "Average number of tokens per document: {:.2f}\n".format(float(self.tok_total) / self.document_total) - stat_str += "Average number of words per document: {:.2f}\n".format(float(self.word_total) / self.document_total) - stat_str += "Average number of sentences per document: {:.2f}\n".format(float(self.sentence_total) / self.document_total) + stat_str += "Average number of tokens per document: {:.2f}\n".format( + float(self.tok_total) / self.document_total) + stat_str += "Average number of words per document: {:.2f}\n".format( + float(self.word_total) / self.document_total) + stat_str += "Average number of sentences per document: {:.2f}\n".format( + float(self.sentence_total) / self.document_total) print(stat_str) def write_document(self, writes, str_lens): assert len(writes) == len(str_lens) - #if type(writes[0]) == tuple: + # if type(writes[0]) == tuple: # writes = [w[0] for w in writes] - #if type(str_lens[0]) == torch.Tensor: + # if type(str_lens[0]) == torch.Tensor: # str_lens = [s.item() for s in str_lens] for i in range(len(writes)): self.write_file.write(writes[i]) @@ -232,7 +262,7 @@ def update_stats(self, 
toks, words, sents, documents): def dataset_iterator(self, paths): data_set_args = { - 'path': paths, # ['wikipedia', 'cnn_dailymail', 'gutenberg'], + 'path': paths, # ['wikipedia', 'cnn_dailymail', 'gutenberg'], 'seq_length': 512, 'lazy': True, 'delim': ',', @@ -255,8 +285,8 @@ def dataset_iterator(self, paths): print("Starting length:", len(ds)) fd = FilterDataset(ds, tokenizer, self.max_doc_length) - #sampler = torch.utils.data.SequentialSampler(fd) - #batch_sampler = torch.utils.data.BatchSampler(sampler, 1, False) + # sampler = torch.utils.data.SequentialSampler(fd) + # batch_sampler = torch.utils.data.BatchSampler(sampler, 1, False) data_loader = torch.utils.data.dataloader.DataLoader(fd, collate_fn=lambda x: x, @@ -304,7 +334,7 @@ def text_file_iterator(self, base_read_path, read_paths_exts=None): yield process_document(doc, self.max_doc_length) def convert_into_sentences(self, text_file): - paragraphs= [] + paragraphs = [] stack = [] for chunk in text_file: if not chunk.strip(): @@ -320,6 +350,7 @@ def convert_into_sentences(self, text_file): sents = text_to_sentences( " ".join(stack).strip().replace('\n', ' ')).split('\n') paragraphs.append(sents) + return paragraphs @@ -334,6 +365,7 @@ class FilterDataset(data.Dataset): dataset_size (int): number of random sentencepairs in the dataset. Default: len(ds)*(len(ds)-1) """ + def __init__(self, ds, tokenizer, max_len): self.ds = ds self.tokenizer = tokenizer @@ -356,9 +388,10 @@ def __getitem__(self, idx): if __name__ == "__main__": - #base_read_path = "/scratch/gobi1/datasets/NLP-Corpus/CNN_dailymail/" - #read_path_extension = ["cnn/stories/", "dailymail/stories/"] - base_read_path = ['wikipedia', 'bookcorpus'] #"/h/stephaneao/bookcorpus_clean" - read_path_extension = None #["books_large_p1_clean.txt", "books_large_p2_clean.txt"] - with DatasetWriter("bert_corpus_2", base_read_path, read_path_extension, from_text_files=False, split_on_newlines=True) as dw: + # base_read_path = "/scratch/gobi1/datasets/NLP-Corpus/CNN_dailymail/" + # read_path_extension = ["cnn/stories/", "dailymail/stories/"] + base_read_path = ['wikipedia', 'bookcorpus'] # "/h//bookcorpus_clean" + read_path_extension = None # ["books_large_p1_clean.txt", "books_large_p2_clean.txt"] + with DatasetWriter("bert_corpus_2", base_read_path, read_path_extension, from_text_files=False, + split_on_newlines=True) as dw: dw.create() diff --git a/evaluate/config/defaults.conf b/evaluate/config/defaults.conf index 2b51ba0..ba3652d 100644 --- a/evaluate/config/defaults.conf +++ b/evaluate/config/defaults.conf @@ -19,7 +19,6 @@ // // https://github.com/lightbend/config/blob/master/HOCON.md -// ADDED BY STEPHANE bert_use_pretrain = 0 // If true, use pretrained model defined by 'bert_model_name' // else create bert model using "bert_config_file" diff --git a/evaluate/config/test_bert.conf b/evaluate/config/test_bert.conf index 3a102ba..dd5f7b4 100644 --- a/evaluate/config/test_bert.conf +++ b/evaluate/config/test_bert.conf @@ -47,5 +47,5 @@ classifier_loss_fn = "softmax" classifier_span_pooling = "attn" // Added args -bert_use_pretrain = 0 // If true, use pretrained model defined by 'bert_model_name' -- ADDED BY STEPHANE +bert_use_pretrain = 0 // If true, use pretrained model defined by 'bert_model_name' -- ADDED BY // else create bert model using "bert_config_file" diff --git a/evaluate/evaluate.py b/evaluate/evaluate.py index 1bffabc..51f2f61 100644 --- a/evaluate/evaluate.py +++ b/evaluate/evaluate.py @@ -24,7 +24,6 @@ from .tasks.qa import MultiRCTask, ReCoRDTask from 
.tasks.edge_probing import EdgeProbingTask - LOG_INTERVAL = 30 @@ -45,7 +44,7 @@ def parse_write_preds_arg(write_preds_arg: str) -> List[str]: def evaluate( - model, tasks: Sequence[tasks_module.Task], batch_size: int, cuda_device: int, split="val" + model, tasks: Sequence[tasks_module.Task], batch_size: int, cuda_device: int, split="val" ) -> Tuple[Dict, pd.DataFrame]: """Evaluate on a dataset {par,qst,ans}_idx are used for MultiRC and other question answering dataset""" @@ -145,7 +144,7 @@ def evaluate( def write_preds( - tasks: Iterable[tasks_module.Task], all_preds, pred_dir, split_name, strict_glue_format=False + tasks: Iterable[tasks_module.Task], all_preds, pred_dir, split_name, strict_glue_format=False ) -> None: for task in tasks: if task.name not in all_preds: @@ -255,11 +254,11 @@ def _get_pred_filename(task_name, pred_dir, split_name, strict_glue_format): def _write_edge_preds( - task: EdgeProbingTask, - preds_df: pd.DataFrame, - pred_dir: str, - split_name: str, - join_with_input: bool = True, + task: EdgeProbingTask, + preds_df: pd.DataFrame, + pred_dir: str, + split_name: str, + join_with_input: bool = True, ): """ Write predictions for edge probing task. @@ -293,11 +292,11 @@ def _write_edge_preds( def _write_wic_preds( - task: str, - preds_df: pd.DataFrame, - pred_dir: str, - split_name: str, - strict_glue_format: bool = False, + task: str, + preds_df: pd.DataFrame, + pred_dir: str, + split_name: str, + strict_glue_format: bool = False, ): """ Write predictions for WiC task. """ pred_map = {0: "false", 1: "true"} @@ -312,11 +311,11 @@ def _write_wic_preds( def _write_winograd_preds( - task: str, - preds_df: pd.DataFrame, - pred_dir: str, - split_name: str, - strict_glue_format: bool = False, + task: str, + preds_df: pd.DataFrame, + pred_dir: str, + split_name: str, + strict_glue_format: bool = False, ): """ Write predictions for Winograd Coreference task. """ pred_map = {0: "False", 1: "True"} @@ -331,11 +330,11 @@ def _write_winograd_preds( def _write_boolq_preds( - task: str, - preds_df: pd.DataFrame, - pred_dir: str, - split_name: str, - strict_glue_format: bool = False, + task: str, + preds_df: pd.DataFrame, + pred_dir: str, + split_name: str, + strict_glue_format: bool = False, ): """ Write predictions for Boolean Questions task. """ pred_map = {0: "false", 1: "true"} @@ -350,11 +349,11 @@ def _write_boolq_preds( def _write_commitment_preds( - task: str, - preds_df: pd.DataFrame, - pred_dir: str, - split_name: str, - strict_glue_format: bool = False, + task: str, + preds_df: pd.DataFrame, + pred_dir: str, + split_name: str, + strict_glue_format: bool = False, ): """ Write predictions for CommitmentBank task. """ pred_map = {0: "neutral", 1: "entailment", 2: "contradiction"} @@ -369,7 +368,7 @@ def _write_commitment_preds( def _write_copa_preds( - task, preds_df: pd.DataFrame, pred_dir: str, split_name: str, strict_glue_format: bool = False + task, preds_df: pd.DataFrame, pred_dir: str, split_name: str, strict_glue_format: bool = False ): """ Write COPA predictions to JSONL """ preds_file = _get_pred_filename(task.name, pred_dir, split_name, strict_glue_format) @@ -383,11 +382,11 @@ def _write_copa_preds( def _write_multirc_preds( - task: str, - preds_df: pd.DataFrame, - pred_dir: str, - split_name: str, - strict_glue_format: bool = False, + task: str, + preds_df: pd.DataFrame, + pred_dir: str, + split_name: str, + strict_glue_format: bool = False, ): """ Write predictions for MultiRC task. 
""" preds_file = _get_pred_filename(task.name, pred_dir, split_name, strict_glue_format) @@ -411,11 +410,11 @@ def _write_multirc_preds( def _write_record_preds( - task: str, - preds_df: pd.DataFrame, - pred_dir: str, - split_name: str, - strict_glue_format: bool = False, + task: str, + preds_df: pd.DataFrame, + pred_dir: str, + split_name: str, + strict_glue_format: bool = False, ): """ Write predictions for ReCoRD task. """ preds_file = _get_pred_filename(task.name, pred_dir, split_name, strict_glue_format) @@ -431,7 +430,6 @@ def _write_record_preds( par_qst_ans_d[row["psg_idx"]][row["qst_idx"]].append(ans_d) for par_idx, qst_ans_d in par_qst_ans_d.items(): for qst_idx, ans_ds in qst_ans_d.items(): - # get prediction logits_and_anss = [(d["logit"], d["str"]) for d in ans_ds] logits_and_anss.sort(key=lambda x: x[1]) @@ -449,11 +447,11 @@ def _write_record_preds( def _write_rte_preds( - task: str, - preds_df: pd.DataFrame, - pred_dir: str, - split_name: str, - strict_glue_format: bool = False, + task: str, + preds_df: pd.DataFrame, + pred_dir: str, + split_name: str, + strict_glue_format: bool = False, ): """ Write predictions for RTE task in SuperGLUE prediction format. """ trg_map = {0: "not_entailment", 1: "entailment"} @@ -468,11 +466,11 @@ def _write_rte_preds( def _write_diagnostics_preds( - task: str, - preds_df: pd.DataFrame, - pred_dir: str, - split_name: str, - strict_glue_format: bool = False, + task: str, + preds_df: pd.DataFrame, + pred_dir: str, + split_name: str, + strict_glue_format: bool = False, ): """ Write predictions for GLUE/SuperGLUE diagnostics task. """ @@ -494,11 +492,11 @@ def _write_diagnostics_preds( def _write_glue_preds( - task_name: str, - preds_df: pd.DataFrame, - pred_dir: str, - split_name: str, - strict_glue_format: bool = False, + task_name: str, + preds_df: pd.DataFrame, + pred_dir: str, + split_name: str, + strict_glue_format: bool = False, ): """ Write predictions to separate files located in pred_dir. We write special code to handle various GLUE tasks. diff --git a/idf.py b/idf.py index d5e1729..deb86b7 100644 --- a/idf.py +++ b/idf.py @@ -1,3 +1,7 @@ +""" +Script to calculate the inverse document frequency (idf) used in tf-idf labels of a dataset. 
+""" + from sentence_encoders import data_utils import numpy as np from math import ceil, log @@ -27,7 +31,7 @@ } -def sentence_tokenize(tokenizer, sent, sentence_num=0, beginning=False, ending=False): +def sentence_tokenize(tokenizer, sent): """tokenize sentence and get token types if tokens=True""" tokens = tokenizer.EncodeAsIds(sent).tokenization return tokens diff --git a/model/new_models.py b/model/new_models.py index d502e42..72f8e7e 100644 --- a/model/new_models.py +++ b/model/new_models.py @@ -147,7 +147,7 @@ def __init__(self, config, modes=["mlm"]): self.sent["fs"] = BertHeadTransform(config) self.tok["fs"] = BertHeadTransform(config) if "tgs" in modes: - self.tok["tgs"] = BertTokenHead(config, num_classes=2, input_size=config.hidden_size * 2) + self.tok["tgs"] = BertTokenHead(config, num_classes=6, input_size=config.hidden_size * 3) self.apply(self.init_bert_weights) def forward(self, modes, input_ids, token_type_ids=None, task_ids=None, attention_mask=None, masked_lm_labels=None, @@ -188,9 +188,11 @@ def forward(self, modes, input_ids, token_type_ids=None, task_ids=None, attentio #ref = torch.zeros_like(sim) scores["fs"] = sim #torch.stack((ref, sim), dim=1) if "sbo" in modes: - output_concats = [torch.cat((sequence_output[:, 0], sequence_output[:, 0]), dim=-1)] - for i in range(sequence_output.shape[1] - 2): - output_concats += [torch.cat((sequence_output[:, i], sequence_output[:, i + 2]), dim=-1)] + output_concats = [torch.cat((sequence_output[:, 0], sequence_output[:, 0], sequence_output[:, 0]), dim=-1)] + output_concats += [torch.cat((sequence_output[:, 0], sequence_output[:, 0], sequence_output[:, 1]), dim=-1)] + for i in range(2, sequence_output.shape[1]): + output_concats += [torch.cat((sequence_output[:, i - 2], sequence_output[:, i - 1], + sequence_output[:, i]), dim=-1)] output_concats += [torch.cat((sequence_output[:, i + 2], sequence_output[:, i + 2]), dim=-1)] output_concats = torch.stack(output_concats, dim=1) scores["sbo"] = self.tok["sbo"](output_concats) diff --git a/paths.py b/paths.py index 20824d6..d0ea973 100644 --- a/paths.py +++ b/paths.py @@ -1,15 +1,15 @@ import os # Where the repository exists -base_path = "/h/stephaneao/sentence_encoders/" +base_path = "" bert_config_file = os.path.join(base_path, "bert_config.json") # Where you want to save models (this requires lots of space - better on hhds) -save_path = "/scratch/hdd001/home/stephaneao/" +save_path = "" pretrained_path = os.path.join(save_path, "pretrained_berts") finetuned_path = os.path.join(save_path, "finetuned_berts") # Where you are loading the data from (better on ssd if possible for faster reads) -data_path = "/scratch/ssd001/home/stephaneao" +data_path = "" glue_data_path = os.path.join(data_path, "glue_data") train_data_path = os.path.join(data_path, "train_data") diff --git a/pretrain_bert.py b/pretrain_bert.py index 24a3db1..5068ee9 100755 --- a/pretrain_bert.py +++ b/pretrain_bert.py @@ -171,7 +171,7 @@ def forward_step(data, model, criterion, modes, args): loss_mask = loss_mask.view(-1).contiguous() losses[mode] = torch.sum(mlm_loss * loss_mask.view(-1).float()) / loss_mask.sum() elif mode == "tgs": - tgs_loss = criterion_cls(score.view(-1, 2).contiguous().float(), + tgs_loss = criterion_cls(score.view(-1, 6).contiguous().float(), aux_labels[mode].view(-1).contiguous()) tgs_loss = tgs_loss.view(-1).contiguous() losses[mode] = torch.sum(tgs_loss * tgs_mask.view(-1).float() / tgs_mask.sum()) @@ -268,7 +268,7 @@ def next_stage(): nonlocal stage_idx if stage_idx >= len(stage_splits): 
print("Finished all training, shouldn't reach this unless it's the very final iteration") - return {k: total_tokens for k in modes} + return {k: float(total_tokens) for k in modes} assert len(modes) == len(stage_splits[stage_idx]) current_stage = {k: v for k, v in zip(modes, stage_splits[stage_idx])} print("Starting stage {} of {}, with task distribution: ".format(stage_idx, len(stage_splits))) diff --git a/requirements.txt b/requirements.txt index b4eb4b4..d79da22 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,200 @@ -nltk>=3.4 -numpy>=1.15.4 -pandas>=0.24.0 -sentencepiece>=0.1.8 -tensorflow>=1.12.0 +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: linux-64 +_libgcc_mutex=0.1=main +absl-py=0.7.1=pypi_0 +alabaster=0.7.12=pypi_0 +allennlp=0.8.4=pypi_0 +apex=0.1=pypi_0 +aspy-yaml=1.3.0=pypi_0 +astor=0.8.0=pypi_0 +atomicwrites=1.3.0=pypi_0 +attrs=19.1.0=pypi_0 +awscli=1.16.207=pypi_0 +babel=2.7.0=pypi_0 +backcall=0.1.0=py36_0 +blas=1.0=mkl +bokeh=1.2.0=py36_0 +boto3=1.9.197=pypi_0 +botocore=1.12.197=pypi_0 +ca-certificates=2019.10.16=0 +cachetools=3.1.1=pypi_0 +certifi=2019.9.11=py36_0 +cffi=1.12.3=py36h2e261b9_0 +cfgv=2.0.1=pypi_0 +chardet=3.0.4=pypi_0 +click=7.0=pypi_0 +colorama=0.3.9=pypi_0 +comet-git-pure=0.19.11=pypi_0 +comet-ml=2.0.16=pypi_0 +configobj=5.0.6=pypi_0 +conllu=0.11=pypi_0 +coverage=4.5.3=pypi_0 +cudatoolkit=10.1.243=h6bb024c_0 +cycler=0.10.0=pypi_0 +cymem=1.31.2=pypi_0 +cytoolz=0.9.0.1=pypi_0 +decorator=4.4.0=py36_1 +dill=0.2.9=pypi_0 +docutils=0.14=pypi_0 +editdistance=0.5.3=pypi_0 +everett=1.0.2=pypi_0 +flaky=3.6.0=pypi_0 +flask=1.1.1=pypi_0 +flask-cors=3.0.8=pypi_0 +freetype=2.9.1=h8a8886c_1 +ftfy=5.4.1=pypi_0 +gast=0.2.2=pypi_0 +gevent=1.4.0=pypi_0 +google-api-core=1.14.0=pypi_0 +google-auth=1.6.3=pypi_0 +google-cloud-core=1.0.2=pypi_0 +google-cloud-logging=1.11.0=pypi_0 +google-pasta=0.1.7=pypi_0 +googleapis-common-protos=1.6.0=pypi_0 +greenlet=0.4.15=pypi_0 +grpcio=1.22.0=pypi_0 +h5py=2.9.0=pypi_0 +identify=1.4.5=pypi_0 +idna=2.8=pypi_0 +imagesize=1.1.0=pypi_0 +importlib-metadata=0.18=pypi_0 +importlib-resources=1.0.2=pypi_0 +intel-openmp=2019.4=243 +ipdb=0.12.1=pypi_0 +ipykernel=5.1.1=py36h39e3cac_0 +ipython=7.6.1=py36h39e3cac_0 +ipython_genutils=0.2.0=py36_0 +itsdangerous=1.1.0=pypi_0 +jedi=0.13.3=py36_0 +jinja2=2.10.1=py36_0 +jmespath=0.9.4=pypi_0 +jpeg=9b=h024ee3a_2 +jsondiff=1.1.2=py36_0 +jsonnet=0.13.0=pypi_0 +jsonpickle=1.2=pypi_0 +jsonschema=3.0.1=pypi_0 +jupyter_client=5.3.1=py_0 +jupyter_core=4.5.0=py_0 +keras-applications=1.0.8=pypi_0 +keras-preprocessing=1.1.0=pypi_0 +kiwisolver=1.1.0=pypi_0 +libedit=3.1.20181209=hc058e9b_0 +libffi=3.2.1=hd88cf55_4 +libgcc-ng=9.1.0=hdf63c60_0 +libgfortran-ng=7.3.0=hdf63c60_0 +libpng=1.6.37=hbc83047_0 +libsodium=1.0.16=h1bed415_0 +libstdcxx-ng=9.1.0=hdf63c60_0 +libtiff=4.0.10=h2733197_2 +markdown=3.1.1=pypi_0 +markupsafe=1.1.1=py36h7b6447c_0 +matplotlib=3.1.1=pypi_0 +mkl=2018.0.3=1 +mkl_fft=1.0.6=py36h7dd41cf_0 +mkl_random=1.0.1=py36h4414c95_1 +more-itertools=7.2.0=pypi_0 +msgpack=0.6.1=pypi_0 +msgpack-numpy=0.4.4.3=pypi_0 +murmurhash=0.28.0=pypi_0 +ncurses=6.1=he6710b0_1 +netifaces=0.10.9=pypi_0 +ninja=1.9.0=py36hfd86e86_0 +nltk=3.4.4=pypi_0 +nodeenv=1.3.3=pypi_0 +nose2=0.9.1=pypi_0 +numpy=1.17.0=pypi_0 +numpydoc=0.9.1=pypi_0 +nvidia-ml-py3=7.352.0=pypi_0 +olefile=0.46=py36_0 +openssl=1.1.1d=h7b6447c_3 +overrides=1.9=pypi_0 +packaging=19.0=py36_0 +pandas=0.25.0=pypi_0 +parsimonious=0.8.1=pypi_0 +parso=0.5.0=py_0 +pathlib=1.0.1=pypi_0 +pexpect=4.7.0=py36_0 
+pickleshare=0.7.5=py36_0 +pillow=6.1.0=py36h34e0f95_0 +pip=19.1.1=py36_0 +plac=0.9.6=pypi_0 +pluggy=0.12.0=pypi_0 +pre-commit=1.15.2=pypi_0 +preshed=1.0.1=pypi_0 +prompt_toolkit=2.0.9=py36_0 +protobuf=3.9.0=pypi_0 +psutil=5.6.4=pypi_0 +ptyprocess=0.6.0=py36_0 +py=1.8.0=pypi_0 +pyasn1=0.4.5=pypi_0 +pyasn1-modules=0.2.5=pypi_0 +pycparser=2.19=py36_0 +pygments=2.4.2=py_0 +pyhocon=0.3.35=pypi_0 +pyparsing=2.4.0=py_0 +pyrsistent=0.15.4=pypi_0 +pytest=5.0.1=pypi_0 +python=3.6.8=h0371630_0 +python-dateutil=2.8.0=py36_0 +python-http-client=3.1.0=pypi_0 +python-levenshtein=0.12.0=pypi_0 +pytorch=1.3.0=py3.6_cuda10.1.243_cudnn7.6.3_0 +pytorch-pretrained-bert=0.6.2=pypi_0 +pytz=2017.3=py36h63b9c63_0 +pyyaml=5.1.1=py36h7b6447c_0 +pyzmq=18.0.0=py36he6710b0_0 +readline=7.0=h7b6447c_5 +regex=2017.4.5=pypi_0 +requests=2.22.0=pypi_0 +responses=0.10.6=pypi_0 +rsa=3.4.2=pypi_0 +s3transfer=0.2.1=pypi_0 +scikit-learn=0.19.1=py36hedc7406_0 +scipy=1.1.0=py36hd20e5f9_0 +sendgrid=5.4.1=pypi_0 +sentencepiece=0.1.82=pypi_0 +setuptools=41.0.1=py36_0 +six=1.12.0=py36_0 +snowballstemmer=1.9.0=pypi_0 +spacy=2.0.11=pypi_0 +sphinx=2.1.2=pypi_0 +sphinxcontrib-applehelp=1.0.1=pypi_0 +sphinxcontrib-devhelp=1.0.1=pypi_0 +sphinxcontrib-htmlhelp=1.0.2=pypi_0 +sphinxcontrib-jsmath=1.0.1=pypi_0 +sphinxcontrib-qthelp=1.0.2=pypi_0 +sphinxcontrib-serializinghtml=1.1.3=pypi_0 +sqlite=3.29.0=h7b6447c_0 +sqlparse=0.3.0=pypi_0 +tensorboard=1.14.0=pypi_0 +tensorboardx=1.2=pypi_0 +tensorflow=1.14.0=pypi_0 +tensorflow-estimator=1.14.0=pypi_0 +termcolor=1.1.0=pypi_0 +thinc=6.10.3=pypi_0 +tk=8.6.8=hbc83047_0 +toml=0.10.0=pypi_0 +toolz=0.10.0=pypi_0 +torch=1.1.0=pypi_0 +torchvision=0.4.1=py36_cu101 +tornado=6.0.3=py36h7b6447c_0 +tqdm=4.32.2=pypi_0 +traitlets=4.3.2=py36_0 +ujson=1.35=pypi_0 +unidecode=1.1.1=pypi_0 +urllib3=1.25.3=pypi_0 +virtualenv=16.7.2=pypi_0 +wcwidth=0.1.7=py36_0 +websocket-client=0.56.0=pypi_0 +werkzeug=0.15.5=pypi_0 +wheel=0.33.4=py36_0 +word2number=1.1=pypi_0 +wrapt=1.11.2=pypi_0 +wurlitzer=1.0.3=pypi_0 +xz=5.2.4=h14c3975_4 +yaml=0.1.7=had09818_2 +zeromq=4.3.1=he6710b0_3 +zipp=0.5.2=pypi_0 +zlib=1.2.11=h7b6447c_3 +zstd=1.3.7=h0b5b093_0 diff --git a/scripts/pretrain_bert.sh b/scripts/pretrain_bert.sh index 5428268..9d92f9d 100755 --- a/scripts/pretrain_bert.sh +++ b/scripts/pretrain_bert.sh @@ -4,31 +4,3 @@ RANK=0 WORLD_SIZE=1 python3 -m sentence_encoders.pretrain_bert "$@" - - -# --batch-size 32 \ -# --tokenizer-type BertWordPieceTokenizer \ -# --cache-dir cache_dir \ -# --tokenizer-model-type bert-base-uncased \ -# --vocab-size 30522 \ -# --train-data 'wikipedia' \ -# --presplit-sentences \ -# --text-key text \ -# --split 1000,1,1 \ -# --lazy-loader \ -# --max-preds-per-seq 80 \ -# --seq-length 128 \ -# --train-tokens 500000000 \ -# --lr 0.0001 \ -# --lr-decay-style linear \ -# --warmup .01 \ -# --weight-decay 1e-2 \ -# --clip-grad 1.0 \ -# --num-workers 2 \ -# --epochs 2 \ -# --bert-config-file /h/stephaneao/sentence_encoders/bert_config.json \ -# --save /scratch/gobi2/stephaneao/trained_berts/bert/ \ -# --model-type 'bert' \ -# --modes 'mlm,nsp' \ -# --incremental False \ -# --track-results True