asafamr · dayyass · Jun 26, 2021 · Jun 26, 2021 · Jun 27, 2021 · Jun 28, 2021
diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
-tqdm==4.26.0
-pytorch_pretrained_bert==0.6.1
+tqdm>=4.27
+transformers==4.8.1
 torch==1.0.0
-numpy==1.15.1
+numpy>=1.17
 spacy==2.1.3
 scipy==1.1.0
 scikit_learn==0.21.1

diff --git a/wsi/WSISettings.py b/wsi/WSISettings.py
@@ -2,7 +2,7 @@
 
 WSISettings = namedtuple('WSISettings', ['n_represents', 'n_samples_per_rep', 'cuda_device', 'debug_dir',
  'disable_tfidf', 'disable_lemmatization', 'run_name', 'patterns',
- 'min_sense_instances', 'bert_model',
+ 'min_sense_instances', 'bert_model', 'spacy_lang',
  'max_batch_size', 'prediction_cutoff', 'max_number_senses',
  ])
 
@@ -26,6 +26,7 @@
  # sense clusters that dominate less than this number of samples
  # would be remapped to their closest big sense
 
+ spacy_lang="en",
  max_batch_size=10,
  prediction_cutoff=200,
  bert_model='bert-large-uncased'

diff --git a/wsi/lm_bert.py b/wsi/lm_bert.py
@@ -1,6 +1,6 @@
 from .slm_interface import SLM
 import multiprocessing
-from pytorch_pretrained_bert import BertForMaskedLM, tokenization
+from transformers import BertForMaskedLM, BertTokenizer
 import torch
 import numpy as np
 from tqdm import tqdm
@@ -26,7 +26,7 @@ def get_batches(from_iter, group_size):
 
 class LMBert(SLM):
 
- def __init__(self, cuda_device, bert_model, max_batch_size=20):
+ def __init__(self, cuda_device, bert_model, spacy_lang="en", max_batch_size=20):
  super().__init__()
  logging.info(
  'creating bert in device %d. bert ath %s'
@@ -43,7 +43,7 @@ def __init__(self, cuda_device, bert_model, max_batch_size=20):
  model.eval()
  self.bert = model
 
- self.tokenizer = tokenization.BertTokenizer.from_pretrained(bert_model)
+ self.tokenizer = BertTokenizer.from_pretrained(bert_model)
 
  self.max_sent_len = model.config.max_position_embeddings
  # self.max_sent_len = config.max_position_embeddings
@@ -54,7 +54,7 @@ def __init__(self, cuda_device, bert_model, max_batch_size=20):
  self.original_vocab = []
 
  import spacy
- nlp = spacy.load("en", disable=['ner', 'parser'])
+ nlp = spacy.load(spacy_lang, disable=['ner', 'parser'])
  self._lemmas_cache = {}
  self._spacy = nlp
  for spacyed in tqdm(
@@ -141,7 +141,7 @@ def predict_sent_substitute_representatives(self, inst_id_to_sentence: Dict[str,
 
  torch_mask = torch_input_ids != 0
 
- logits_all_tokens = self.bert(torch_input_ids, attention_mask=torch_mask)
+ logits_all_tokens = self.bert(torch_input_ids, attention_mask=torch_mask).logits
 
  logits_target_tokens = torch.zeros((len(batch_sents), logits_all_tokens.shape[2])).to(self.device)
  for i in range(0, len(batch_sents)):

diff --git a/wsi_bert.py b/wsi_bert.py
@@ -34,6 +34,7 @@
  startmsg = startmsg.strip()
 
  lm = LMBert(settings.cuda_device, settings.bert_model,
+ spacy_lang=settings.spacy_lang,
  max_batch_size=settings.max_batch_size)
 
  if settings.debug_dir: