From fcd12f3a5c281f29ce50a95785b3e187fb738d5a Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Fri, 29 Apr 2022 10:48:21 -0700 Subject: [PATCH 01/20] Integrated working PyTorch-CRF in MM --- mindmeld/models/tagger_models.py | 14 +- mindmeld/models/taggers/crf.py | 145 ++++++++++ mindmeld/models/taggers/pytorch_crf.py | 381 +++++++++++++++++++++++++ 3 files changed, 534 insertions(+), 6 deletions(-) create mode 100644 mindmeld/models/taggers/pytorch_crf.py diff --git a/mindmeld/models/tagger_models.py b/mindmeld/models/tagger_models.py index 73232c2ae..7f6d29044 100644 --- a/mindmeld/models/tagger_models.py +++ b/mindmeld/models/tagger_models.py @@ -27,7 +27,7 @@ ) from .model import ModelConfig, Model, PytorchModel, AbstractModelFactory from .nn_utils import get_token_classifier_cls, TokenClassificationType -from .taggers.crf import ConditionalRandomFields +from .taggers.crf import ConditionalRandomFields, PyTorchCRF from .taggers.memm import MemmModel from ..exceptions import MindMeldError @@ -73,12 +73,13 @@ class TaggerModel(Model): CRF_TYPE = "crf" MEMM_TYPE = "memm" LSTM_TYPE = "lstm" - ALLOWED_CLASSIFIER_TYPES = [CRF_TYPE, MEMM_TYPE, LSTM_TYPE] + TORCH_CRF_TYPE = "torch-crf" + ALLOWED_CLASSIFIER_TYPES = [CRF_TYPE, MEMM_TYPE, LSTM_TYPE, TORCH_CRF_TYPE] # for default model scoring types ACCURACY_SCORING = "accuracy" SEQ_ACCURACY_SCORING = "seq_accuracy" - SEQUENCE_MODELS = ["crf"] + SEQUENCE_MODELS = ["crf", "torch-crf"] DEFAULT_FEATURES = { "bag-of-words-seq": { @@ -131,6 +132,7 @@ def _get_model_constructor(self): return { TaggerModel.MEMM_TYPE: MemmModel, TaggerModel.CRF_TYPE: ConditionalRandomFields, + TaggerModel.TORCH_CRF_TYPE: PyTorchCRF, TaggerModel.LSTM_TYPE: LstmModel, }[classifier_type] except KeyError as e: @@ -231,7 +233,7 @@ def fit(self, examples, labels, params=None): "There are no labels in this label set, so we don't fit the model." 
) return self - # Extract labels - label encoders are the same accross all entity recognition models + # Extract labels - label encoders are the same across all entity recognition models self._label_encoder = get_label_encoder(self.config) y = self._label_encoder.encode(labels, examples=examples) @@ -246,8 +248,8 @@ def fit(self, examples, labels, params=None): self._current_params = params else: # run cross validation to select params - if self._clf.__class__ == LstmModel: - raise MindMeldError("The LSTM model does not support cross-validation") + if self._clf.__class__ in (LstmModel, PyTorchCRF): + raise MindMeldError(f"The {self._clf.__class__.__name__} model does not support cross-validation") _, best_params = self._fit_cv(X, y, groups) self._clf = self._fit(X, y, best_params) diff --git a/mindmeld/models/taggers/crf.py b/mindmeld/models/taggers/crf.py index b373b868d..bc5b24dd0 100644 --- a/mindmeld/models/taggers/crf.py +++ b/mindmeld/models/taggers/crf.py @@ -23,6 +23,7 @@ from .taggers import Tagger, extract_sequence_features from ..helpers import FileBackedList +from .pytorch_crf import TorchCRF logger = logging.getLogger(__name__) @@ -182,6 +183,150 @@ def setup_model(self, config): self._feat_binner = FeatureBinner() +class PyTorchCRF(Tagger): + """A Conditional Random Fields model.""" + + @staticmethod + def _predict_proba(X): + del X + pass + + @staticmethod + def load(model_path): + del model_path + pass + + def fit(self, X, y): + self._clf.fit(X, y) + return self + + def set_params(self, **parameters): + self._clf = TorchCRF() + self._clf.set_params(**parameters) + return self + + def get_params(self, deep=True): + return self._clf.get_params() + + def predict(self, X, dynamic_resource=None): + return self._clf.predict(X) + + def predict_proba(self, examples, config, resources): + """ + Args: + examples (list of mindmeld.core.Query): a list of queries to predict on + config (ModelConfig): The ModelConfig which may contain information used for feature + extraction + resources (dict): Resources which may be used for this model's feature extraction + + Returns: + list of tuples of (mindmeld.core.QueryEntity): a list of predicted labels \ + with confidence scores + """ + X, _, _ = self.extract_features(examples, config, resources, in_memory=True) + seq = self._clf.predict(X) + marginals_dict = self._clf.predict_marginals(X) + marginal_tuples = [] + for query_index, query_seq in enumerate(seq): + query_marginal_tuples = [] + for i, tag in enumerate(query_seq): + query_marginal_tuples.append([tag, marginals_dict[query_index][i][tag]]) + marginal_tuples.append(query_marginal_tuples) + return marginal_tuples + + def predict_proba_distribution(self, examples, config, resources): + """ + Args: + examples (list of mindmeld.core.Query): a list of queries to predict on + config (ModelConfig): The ModelConfig which may contain information used for feature + extraction + resources (dict): Resources which may be used for this model's feature extraction + + Returns: + list of tuples of (mindmeld.core.QueryEntity): a list of predicted labels \ + with confidence scores + """ + X, _, _ = self.extract_features(examples, config, resources, in_memory=True) + seq = self._clf.predict(X) + marginals_dict = self._clf.predict_marginals(X) + predictions = [] + tag_maps = [] + for query_index, query_seq in enumerate(seq): + tags = [] + preds = [] + for i in range(len(query_seq)): + tags.append(list(marginals_dict[query_index][i].keys())) + preds.append(list(marginals_dict[query_index][i].values())) + 
tag_maps.extend(tags) + predictions.extend(preds) + return [[tag_maps, predictions]] + + def extract_features(self, + examples, + config, + resources, + y=None, + fit=False, + in_memory=STORE_CRF_FEATURES_IN_MEMORY): + """Transforms a list of examples into a feature matrix. + + Args: + examples (list of mindmeld.core.Query): a list of queries + config (ModelConfig): The ModelConfig which may contain information used for feature + extraction + resources (dict): Resources which may be used for this model's feature extraction + + Returns: + (list of list of str): features in CRF suite format + """ + # Extract features and classes + feats = [] if in_memory else FileBackedList() + for _, example in enumerate(examples): + feats.append(self.extract_example_features(example, config, resources)) + X = self._preprocess_data(feats, fit) + return X, y, None + + @staticmethod + def extract_example_features(example, config, resources): + """Extracts feature dicts for each token in an example. + + Args: + example (mindmeld.core.Query): A query. + config (ModelConfig): The ModelConfig which may contain information used for feature \ + extraction. + resources (dict): Resources which may be used for this model's feature extraction. + + Returns: + list[dict]: Features. + """ + return extract_sequence_features( + example, config.example_type, config.features, resources + ) + + def _preprocess_data(self, X, fit=False): + """Converts data into formats of CRF suite. + + Args: + X (list of dict): features of an example + fit (bool, optional): True if processing data at fit time, false for predict time. + + Returns: + (list of list of str): features in CRF suite format + """ + if fit: + self._feat_binner.fit(X) + + # We want to use a list for in-memory and a LineGenerator for disk based + new_X = X.__class__() + # Maintain append code structure to make sure it supports in-memory and FileBackedList() + for feat_seq in self._feat_binner.transform(X): + new_X.append(feat_seq) + return new_X + + def setup_model(self, config): + self._feat_binner = FeatureBinner() + + # Feature extraction for CRF diff --git a/mindmeld/models/taggers/pytorch_crf.py b/mindmeld/models/taggers/pytorch_crf.py new file mode 100644 index 000000000..9ba351212 --- /dev/null +++ b/mindmeld/models/taggers/pytorch_crf.py @@ -0,0 +1,381 @@ +import logging +import os +import random +import uuid +from collections import Counter +from copy import copy +from itertools import zip_longest +from random import randint + +import numpy as np +import torch +import torch.nn as nn +from sklearn.feature_extraction import DictVectorizer, FeatureHasher +from sklearn.metrics import f1_score +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder +from torch import optim +from torch.utils.data import Dataset, DataLoader +from torch_sparse import cat, tensor +from torchcrf import CRF +from tqdm import tqdm + +from ...exceptions import MindMeldError +from ...path import USER_CONFIG_DIR + +logger = logging.getLogger(__name__) + +DEFAULT_PYTORCH_CRF_ER_CONFIG = { + "feat_type": "hash", # ["hash", "dict"] + "feat_num": 50000, + "stratify": True, + "drop_input": 0.2, + "train_batch_size": 8, + "patience": 3, + "epochs": 100, + "train_dev_split": 0.15, + "optimizer_type": "sgd", # ["sgd", "adam"] +} + + +class TaggerDataset(Dataset): + def __init__(self, input, seq_lens, labels=None): + self.input = input + self.labels = labels + self.seq_lens = seq_lens + self.max_seq_length = max(seq_lens) + + def __len__(self): + return 
len(self.seq_lens) + + def __getitem__(self, index): + mask_list = [1] * self.seq_lens[index] + [0] * (self.max_seq_length - self.seq_lens[index]) + + mask = torch.as_tensor(mask_list, dtype=torch.bool) + if self.labels: + return self.input[index], mask, self.labels[index] + else: + return self.input[index], mask + + +def init_weights(m): + if type(m) == nn.Linear: + torch.nn.init.xavier_normal_(m.weight) + m.bias.data.fill_(0.01) + + +def custom_collate(sequence): + if len(sequence[0]) == 3: + sparse_mats, masks, labels = zip(*sequence) + return cat(sparse_mats, dim=(0, 1)).to_torch_sparse_coo_tensor().coalesce(), torch.stack(masks), torch.stack( + labels) + elif len(sequence[0]) == 2: + sparse_mats, masks = zip(*sequence) + return cat(sparse_mats, dim=(0, 1)).to_torch_sparse_coo_tensor().coalesce(), torch.stack(masks) + + +class Encoder: + def __init__(self, feature_extractor="dict", num_feats=None): + + self.feat_extractor = DictVectorizer(dtype=np.float32) if feature_extractor == "dict" else FeatureHasher( + n_features=num_feats, dtype=np.float32) + self.label_encoder = LabelEncoder() + self.feat_extract_type = feature_extractor + self.num_classes = None + self.classes = None + self.num_feats = num_feats + self.fit_done = False + + def get_tensor_data(self, feat_dicts, labels=None, fit=False): + if labels is None: + labels = [] + if fit: + if self.feat_extract_type == "dict": + comb_dict_list = [x for seq in feat_dicts for x in seq] + self.feat_extractor.fit(comb_dict_list) + self.num_feats = len(self.feat_extractor.get_feature_names()) + if labels: + self.label_encoder.fit([x for l in labels for x in l]) + self.pad_index = len(self.label_encoder.classes_) - 1 + self.classes = self.label_encoder.classes_ + self.num_classes = len(self.label_encoder.classes_) + + self.fit_done = True + feats = [] + encoded_labels = [] + seq_lens = [len(x) for x in feat_dicts] + max_seq_len = max(seq_lens) + + for i, (x, y) in enumerate(zip_longest(feat_dicts, labels)): + + padded_x = x + [{}] * (max_seq_len - seq_lens[i]) + sparse_feat = self.feat_extractor.transform(padded_x) + sparse_feat_tensor = tensor.from_scipy(sparse_feat) + feats.append(sparse_feat_tensor) + + if y: + transformed_label = self.label_encoder.transform(y) + transformed_label = np.pad(transformed_label, pad_width=(0, max_seq_len - seq_lens[i]), + constant_values=self.pad_index) + label_tensor = torch.as_tensor(transformed_label, dtype=torch.long) + encoded_labels.append(label_tensor) + return (feats, encoded_labels, seq_lens) if encoded_labels else (feats, seq_lens) + + +class TorchCRF(nn.Module): + def __init__(self): + super(TorchCRF, self).__init__() + self.optimizer = None + self.encoder = None + self.best_model_save_path = os.path.join(USER_CONFIG_DIR, "tmp", str(uuid.uuid4()), "best_crf_model.pt") + os.makedirs(os.path.dirname(self.best_model_save_path), exist_ok=True) + + def set_random_states(self): + torch.manual_seed(self.random_state) + random.seed(self.random_state + 1) + np.random.seed(self.random_state + 2) + + def validate_params(self): + if self.optimizer_type not in ["sgd", "adam"]: + raise MindMeldError( + f"Optimizer type {self.optimizer_type} not supported. Supported options are ['sgd', 'adam']") + elif self.feat_type not in ["hash", "dict"]: + raise MindMeldError(f"Feature type {self.feat_type} not supported. 
Supported options are ['hash', 'dict']") + elif not 0 < self.train_dev_split < 1: + raise MindMeldError(f"Train-dev split should be a value between 0 and 1.") + elif not 0 <= self.drop_input < 1: + raise MindMeldError(f"Drop Input should be a value between 0 and 1. (inclusive)") + + for x, y in zip([self.feat_num, self.train_batch_size, self.patience, self.epochs], + ["Number of features", "Train Batch size", "Patience", "Number of epochs"]): + if not isinstance(x, int): + raise MindMeldError(f"{y} should be am integer value.") + + def build_params(self, num_features, num_classes): + self.W = nn.Parameter(torch.nn.init.xavier_normal_(torch.empty(size=(num_features, num_classes))), + requires_grad=True) + self.b = nn.Parameter(torch.nn.init.constant_(torch.empty(size=(num_classes,)), val=0.01), + requires_grad=True) + self.crf_layer = CRF(num_classes, batch_first=True) + self.crf_layer.apply(init_weights) + self.num_classes = num_classes + + def forward(self, inputs, targets, mask, drop_input=0.0): + if drop_input: + dp_mask = (torch.FloatTensor(inputs.values().size()).uniform_() > drop_input) + inputs.values()[:] = inputs.values() * dp_mask + dense_W = torch.tile(self.W, dims=(mask.shape[0], 1)) + out_1 = torch.addmm(self.b, inputs, dense_W) + crf_input = out_1.reshape((mask.shape[0], -1, self.num_classes)) + if targets is None: + return self.crf_layer.decode(crf_input, mask=mask) + loss = - self.crf_layer(crf_input, targets, mask=mask) + return loss + + # The below implementation is borrowed from https://github.com/kmkurn/pytorch-crf/pull/37 + + def _compute_log_alpha(self, emissions, mask, run_backwards): + # emissions: (seq_length, batch_size, num_tags) + # mask: (seq_length, batch_size) + + assert emissions.dim() == 3 and mask.dim() == 2 + assert emissions.size()[:2] == mask.size() + assert emissions.size(2) == self.crf_layer.num_tags + assert all(mask[0].data) + + seq_length = emissions.size(0) + mask = mask.float() + broadcast_transitions = self.crf_layer.transitions.unsqueeze(0) # (1, num_tags, num_tags) + emissions_broadcast = emissions.unsqueeze(2) + seq_iterator = range(1, seq_length) + + if run_backwards: + # running backwards, so transpose + broadcast_transitions = broadcast_transitions.transpose(1, 2) # (1, num_tags, num_tags) + emissions_broadcast = emissions_broadcast.transpose(2, 3) + + # the starting probability is end_transitions if running backwards + log_prob = [self.crf_layer.end_transitions.expand(emissions.size(1), -1)] + + # iterate over the sequence backwards + seq_iterator = reversed(seq_iterator) + else: + # Start transition score and first emission + log_prob = [emissions[0] + self.crf_layer.start_transitions.view(1, -1)] + + for i in seq_iterator: + # Broadcast log_prob over all possible next tags + broadcast_log_prob = log_prob[-1].unsqueeze(2) # (batch_size, num_tags, 1) + # Sum current log probability, transition, and emission scores + score = broadcast_log_prob + broadcast_transitions + emissions_broadcast[ + i] # (batch_size, num_tags, num_tags) + # Sum over all possible current tags, but we're in log prob space, so a sum + # becomes a log-sum-exp + score = self._log_sum_exp(score, dim=1) + # Set log_prob to the score if this timestep is valid (mask == 1), otherwise + # copy the prior value + log_prob.append(score * mask[i].unsqueeze(1) + + log_prob[-1] * (1. 
- mask[i]).unsqueeze(1)) + + if run_backwards: + log_prob.reverse() + + return torch.stack(log_prob) + + def compute_marginal_probabilities(self, inputs, mask): + # SWITCHING FOR BATCH FIRST DEFAULT + dense_W = torch.tile(self.W, dims=(mask.shape[0], 1)) + out_1 = torch.addmm(self.b, inputs, dense_W) + emissions = out_1.reshape((mask.shape[0], -1, self.num_classes)) + emissions = emissions.transpose(0, 1) + mask = mask.transpose(0, 1) + alpha = self._compute_log_alpha(emissions, mask, run_backwards=False) + beta = self._compute_log_alpha(emissions, mask, run_backwards=True) + z = torch.logsumexp(alpha[alpha.size(0) - 1] + self.crf_layer.end_transitions, dim=1) + prob = alpha + beta - z.view(1, -1, 1) + return torch.exp(prob).transpose(0, 1) + + @staticmethod + def _log_sum_exp(tensor_input, dim): + # Find the max value along `dim` + offset, _ = tensor_input.max(dim) + # Make offset broadcastable + broadcast_offset = offset.unsqueeze(dim) + # Perform log-sum-exp safely + safe_log_sum_exp = torch.log(torch.sum(torch.exp(tensor_input - broadcast_offset), dim)) + # Add offset back + return offset + safe_log_sum_exp + + def set_params(self, **params): + self.feat_type = params.get('feat_type', DEFAULT_PYTORCH_CRF_ER_CONFIG['feat_type']).lower() + self.feat_num = params.get('feat_num', DEFAULT_PYTORCH_CRF_ER_CONFIG['feat_num']) + self.stratify = params.get('stratify', DEFAULT_PYTORCH_CRF_ER_CONFIG['stratify']) + self.drop_input = params.get('drop_input', DEFAULT_PYTORCH_CRF_ER_CONFIG['drop_input']) + self.train_batch_size = params.get('train_batch_size', DEFAULT_PYTORCH_CRF_ER_CONFIG['train_batch_size']) + self.patience = params.get('patience', DEFAULT_PYTORCH_CRF_ER_CONFIG['patience']) + self.epochs = params.get('epochs', DEFAULT_PYTORCH_CRF_ER_CONFIG['epochs']) + self.train_dev_split = params.get('train_dev_split', DEFAULT_PYTORCH_CRF_ER_CONFIG['train_dev_split']) + self.optimizer_type = params.get('optimizer_type', DEFAULT_PYTORCH_CRF_ER_CONFIG['optimizer_type']).lower() + self.random_state = params.get('random_state', randint(1, 10000001)) + + self.validate_params() + + logger.debug(f"Random state is {self.random_state}") + if self.feat_type == "dict" and "feat_num" in params: + logger.warning( + "WARNING: Number of features is compatible with only `hash` feature type. This value is ignored with `dict` setting", ) + + def fit(self, X, y): + self.set_random_states() + self.encoder = Encoder(feature_extractor=self.feat_type, num_feats=self.feat_num) + stratify_tuples = None + if self.stratify: + stratify_tuples = [tuple(sorted(list(set(label)))) for label in y] + # If we have a label class that is only 1 in number, duplicate it, otherwise train_test_split throws error when using stratify! 
+ cnt = Counter(stratify_tuples) + last_one = -1 + while cnt.most_common()[last_one][-1] < 2: + lone_idx = stratify_tuples.index(cnt.most_common()[last_one][0]) + stratify_tuples.append(cnt.most_common()[last_one][0]) + y.append(copy(y[lone_idx])) + X.append(copy(X[lone_idx])) + last_one -= 1 + train_X, dev_X, train_y, dev_y = train_test_split(X, y, test_size=self.train_dev_split, + stratify=stratify_tuples, random_state=self.random_state) + + train_inputs, encoded_train_labels, train_seq_lens = self.encoder.get_tensor_data(train_X, train_y, fit=True) + train_dataset = TaggerDataset(train_inputs, train_seq_lens, encoded_train_labels) + + dev_inputs, encoded_dev_labels, dev_seq_lens = self.encoder.get_tensor_data(dev_X, dev_y, fit=False) + dev_dataset = TaggerDataset(dev_inputs, dev_seq_lens, encoded_dev_labels) + + train_dataloader = DataLoader(train_dataset, batch_size=self.train_batch_size, shuffle=True, + collate_fn=custom_collate) + + dev_dataloader = DataLoader(dev_dataset, batch_size=512, shuffle=True, collate_fn=custom_collate) + + best_dev_score, best_dev_epoch = -np.inf, -1 + _patience_counter = 0 + + self.build_params(self.encoder.num_feats, self.encoder.num_classes) + if self.optimizer_type == "sgd": + self.optimizer = optim.SGD(self.parameters(), lr=0.01, momentum=0.9, nesterov=True, weight_decay=1e-5) + elif self.optimizer_type == "adam": + self.optimizer = optim.Adam(self.parameters(), weight_decay=1e-5) + + for epoch in range(self.epochs): + if _patience_counter >= self.patience: + break + self.train_one_epoch(train_dataloader, epoch) + dev_f1_score = self.run_predictions(dev_dataloader, calc_f1=True) + dev_f1_score = np.round(dev_f1_score, decimals=3) + + if dev_f1_score <= best_dev_score: + _patience_counter += 1 + else: + _patience_counter = 0 + best_dev_score, best_dev_epoch = dev_f1_score, epoch + torch.save(self.state_dict(), self.best_model_save_path) + + def train_one_epoch(self, train_dataloader, epoch, verbose=False): + # TODO: Remove verbose option + pbar = tqdm(enumerate(train_dataloader), total=len(train_dataloader)) if verbose else enumerate( + train_dataloader) + self.train() + train_loss = 0 + for batch_idx, (inputs, mask, labels) in pbar: + self.optimizer.zero_grad() + loss = self.forward(inputs, labels, mask, drop_input=self.drop_input) + train_loss += loss.item() + loss.backward() + self.optimizer.step() + if verbose: + pbar.set_description(f"Epoch:{epoch}, Batch: {batch_idx} Mean Loss:{train_loss / (batch_idx + 1)}") + + def run_predictions(self, dataloader, calc_f1=False): + self.eval() + predictions = [] + targets = [] + with torch.no_grad(): + for inputs, *mask_and_labels in dataloader: + if calc_f1: + mask, labels = mask_and_labels + targets.extend(torch.masked_select(labels, mask).tolist()) + else: + mask = mask_and_labels.pop() + preds = self.forward(inputs, None, mask) + predictions.extend([x for lst in preds for x in lst] if calc_f1 else preds) + if calc_f1: + dev_score = f1_score(targets, predictions, average='weighted') + logger.debug(f"Weighted F-1: {dev_score}") + return dev_score + else: + return predictions + + def predict_marginals(self, X): + self.load_state_dict(torch.load(self.best_model_save_path)) + inputs, seq_lens = self.encoder.get_tensor_data(X) + torch_dataset = TaggerDataset(inputs, seq_lens) + + dataloader = DataLoader(torch_dataset, batch_size=512, shuffle=False, collate_fn=custom_collate) + marginals_dict = [] + self.eval() + with torch.no_grad(): + for inputs, mask in dataloader: + probs = 
self.compute_marginal_probabilities(inputs, mask).tolist() + mask = mask.tolist() + # If anyone has any suggestions on a cleaner way to do this, I am all ears! + marginals_dict.extend([[dict(zip(self.encoder.classes, token_probs)) \ + for (token_probs, valid_token) in zip(seq, mask_seq) if valid_token] \ + for seq, mask_seq in zip(probs, mask)]) + + return marginals_dict + + def predict(self, X): + self.load_state_dict(torch.load(self.best_model_save_path)) + inputs, seq_lens = self.encoder.get_tensor_data(X) + torch_dataset = TaggerDataset(inputs, seq_lens) + + dataloader = DataLoader(torch_dataset, batch_size=512, shuffle=False, collate_fn=custom_collate) + preds = self.run_predictions(dataloader, calc_f1=False) + return [self.encoder.label_encoder.inverse_transform(x).tolist() for x in preds] From 52754b538a40404849c2b330b506a03e0903ac20 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Fri, 29 Apr 2022 12:15:49 -0700 Subject: [PATCH 02/20] Fixed requirements --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 1921196a1..30ad385d9 100644 --- a/setup.py +++ b/setup.py @@ -44,6 +44,9 @@ # As a stop gap, we'll pin our version of markupsafe to that last 2.0 version. Longer term we need to update # our flask dependency, and likely move it to an extra "markupsafe==2.0.1", + 'torch~=1.7.0; python_version>="3.6"', + 'pytorch-crf~=0.7.2' + ] setup_requirements = ["pytest-runner~=2.11", "setuptools>=36"] @@ -124,10 +127,7 @@ 'pygit2>=1.5.0,<1.7; python_version < "3.7"', "dvc>=1.8.1" ], - "torch": [ - 'torch~=1.7.0; python_version>="3.6"', - 'pytorch-crf~=0.7.2' - ], + "transformers": [ # huggingface-transformers 'transformers~=4.15.0; python_version>="3.6"', ], From 2d9aa7d85b1af07a909d39e8ced532209c31a599 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Fri, 29 Apr 2022 12:24:38 -0700 Subject: [PATCH 03/20] Fixed requirements again --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 30ad385d9..fe7abfe80 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,8 @@ # our flask dependency, and likely move it to an extra "markupsafe==2.0.1", 'torch~=1.7.0; python_version>="3.6"', - 'pytorch-crf~=0.7.2' + 'pytorch-crf~=0.7.2', + 'torch-sparse' ] @@ -127,7 +128,6 @@ 'pygit2>=1.5.0,<1.7; python_version < "3.7"', "dvc>=1.8.1" ], - "transformers": [ # huggingface-transformers 'transformers~=4.15.0; python_version>="3.6"', ], From d818046f9c5a55d0b87dbc7625c66e59f181648f Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Fri, 29 Apr 2022 16:50:55 -0700 Subject: [PATCH 04/20] Fixed requirement by removing requirement causing problem --- mindmeld/models/taggers/pytorch_crf.py | 50 ++++++++++++++++++++++---- setup.py | 2 -- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/mindmeld/models/taggers/pytorch_crf.py b/mindmeld/models/taggers/pytorch_crf.py index 9ba351212..8386f3277 100644 --- a/mindmeld/models/taggers/pytorch_crf.py +++ b/mindmeld/models/taggers/pytorch_crf.py @@ -16,7 +16,6 @@ from sklearn.preprocessing import LabelEncoder from torch import optim from torch.utils.data import Dataset, DataLoader -from torch_sparse import cat, tensor from torchcrf import CRF from tqdm import tqdm @@ -58,6 +57,44 @@ def __getitem__(self, index): return self.input[index], mask +def custom_coo_cat(tensors): + assert len(tensors) > 0 + + rows = [] + cols = [] + values = [] + sparse_sizes = [0, 0] + + nnz = 0 + for tensor in tensors: + tensor = tensor.coalesce() + row, col = 
tensor.indices()[0], tensor.indices()[1] + if row is not None: + rows.append(row + sparse_sizes[0]) + + cols.append(col + sparse_sizes[1]) + + value = tensor.values() + if value is not None: + values.append(value) + + sparse_sizes[0] += tensor.shape[0] + sparse_sizes[1] += tensor.shape[1] + nnz += tensor._nnz() + + row = None + if len(rows) == len(tensors): + row = torch.cat(rows, dim=0) + + col = torch.cat(cols, dim=0) + + value = None + if len(values) == len(tensors): + value = torch.cat(values, dim=0) + + return torch.sparse_coo_tensor(indices=torch.stack([row, col]), values=value, size=sparse_sizes).coalesce() + + def init_weights(m): if type(m) == nn.Linear: torch.nn.init.xavier_normal_(m.weight) @@ -67,11 +104,10 @@ def init_weights(m): def custom_collate(sequence): if len(sequence[0]) == 3: sparse_mats, masks, labels = zip(*sequence) - return cat(sparse_mats, dim=(0, 1)).to_torch_sparse_coo_tensor().coalesce(), torch.stack(masks), torch.stack( - labels) + return custom_coo_cat(sparse_mats), torch.stack(masks), torch.stack(labels) elif len(sequence[0]) == 2: sparse_mats, masks = zip(*sequence) - return cat(sparse_mats, dim=(0, 1)).to_torch_sparse_coo_tensor().coalesce(), torch.stack(masks) + return custom_coo_cat(sparse_mats), torch.stack(masks) class Encoder: @@ -109,8 +145,10 @@ def get_tensor_data(self, feat_dicts, labels=None, fit=False): for i, (x, y) in enumerate(zip_longest(feat_dicts, labels)): padded_x = x + [{}] * (max_seq_len - seq_lens[i]) - sparse_feat = self.feat_extractor.transform(padded_x) - sparse_feat_tensor = tensor.from_scipy(sparse_feat) + sparse_feat = self.feat_extractor.transform(padded_x).tocoo() + sparse_feat_tensor = torch.sparse_coo_tensor( + indices=torch.as_tensor(np.stack([sparse_feat.row, sparse_feat.col])), + values=torch.as_tensor(sparse_feat.data), size=sparse_feat.shape) feats.append(sparse_feat_tensor) if y: diff --git a/setup.py b/setup.py index fe7abfe80..1fe8bdcbf 100644 --- a/setup.py +++ b/setup.py @@ -46,8 +46,6 @@ "markupsafe==2.0.1", 'torch~=1.7.0; python_version>="3.6"', 'pytorch-crf~=0.7.2', - 'torch-sparse' - ] setup_requirements = ["pytest-runner~=2.11", "setuptools>=36"] From dda255adf87c5cd8b2c7292194ccd5534860c6fc Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Fri, 29 Apr 2022 17:29:41 -0700 Subject: [PATCH 05/20] Trying to make the lint happy --- mindmeld/models/taggers/pytorch_crf.py | 42 +++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/mindmeld/models/taggers/pytorch_crf.py b/mindmeld/models/taggers/pytorch_crf.py index 8386f3277..de048c152 100644 --- a/mindmeld/models/taggers/pytorch_crf.py +++ b/mindmeld/models/taggers/pytorch_crf.py @@ -17,7 +17,6 @@ from torch import optim from torch.utils.data import Dataset, DataLoader from torchcrf import CRF -from tqdm import tqdm from ...exceptions import MindMeldError from ...path import USER_CONFIG_DIR @@ -38,8 +37,8 @@ class TaggerDataset(Dataset): - def __init__(self, input, seq_lens, labels=None): - self.input = input + def __init__(self, inputs, seq_lens, labels=None): + self.inputs = inputs self.labels = labels self.seq_lens = seq_lens self.max_seq_length = max(seq_lens) @@ -52,9 +51,9 @@ def __getitem__(self, index): mask = torch.as_tensor(mask_list, dtype=torch.bool) if self.labels: - return self.input[index], mask, self.labels[index] + return self.inputs[index], mask, self.labels[index] else: - return self.input[index], mask + return self.inputs[index], mask def custom_coo_cat(tensors): @@ -96,7 +95,7 @@ def 
custom_coo_cat(tensors): def init_weights(m): - if type(m) == nn.Linear: + if isinstance(m, nn.Linear): torch.nn.init.xavier_normal_(m.weight) m.bias.data.fill_(0.01) @@ -160,9 +159,10 @@ def get_tensor_data(self, feat_dicts, labels=None, fit=False): return (feats, encoded_labels, seq_lens) if encoded_labels else (feats, seq_lens) +# pylint: disable=too-many-instance-attributes class TorchCRF(nn.Module): def __init__(self): - super(TorchCRF, self).__init__() + super().__init__() self.optimizer = None self.encoder = None self.best_model_save_path = os.path.join(USER_CONFIG_DIR, "tmp", str(uuid.uuid4()), "best_crf_model.pt") @@ -180,9 +180,9 @@ def validate_params(self): elif self.feat_type not in ["hash", "dict"]: raise MindMeldError(f"Feature type {self.feat_type} not supported. Supported options are ['hash', 'dict']") elif not 0 < self.train_dev_split < 1: - raise MindMeldError(f"Train-dev split should be a value between 0 and 1.") + raise MindMeldError("Train-dev split should be a value between 0 and 1.") elif not 0 <= self.drop_input < 1: - raise MindMeldError(f"Drop Input should be a value between 0 and 1. (inclusive)") + raise MindMeldError("Drop Input should be a value between 0 and 1. (inclusive)") for x, y in zip([self.feat_num, self.train_batch_size, self.patience, self.epochs], ["Number of features", "Train Batch size", "Patience", "Number of epochs"]): @@ -298,11 +298,12 @@ def set_params(self, **params): self.validate_params() - logger.debug(f"Random state is {self.random_state}") + logger.debug("Random state is %s", self.random_state) if self.feat_type == "dict" and "feat_num" in params: logger.warning( "WARNING: Number of features is compatible with only `hash` feature type. This value is ignored with `dict` setting", ) + # pylint: disable=too-many-locals def fit(self, X, y): self.set_random_states() self.encoder = Encoder(feature_extractor=self.feat_type, num_feats=self.feat_num) @@ -320,10 +321,10 @@ def fit(self, X, y): last_one -= 1 train_X, dev_X, train_y, dev_y = train_test_split(X, y, test_size=self.train_dev_split, stratify=stratify_tuples, random_state=self.random_state) - + # pylint: disable=unbalanced-tuple-unpacking train_inputs, encoded_train_labels, train_seq_lens = self.encoder.get_tensor_data(train_X, train_y, fit=True) train_dataset = TaggerDataset(train_inputs, train_seq_lens, encoded_train_labels) - + # pylint: disable=unbalanced-tuple-unpacking dev_inputs, encoded_dev_labels, dev_seq_lens = self.encoder.get_tensor_data(dev_X, dev_y, fit=False) dev_dataset = TaggerDataset(dev_inputs, dev_seq_lens, encoded_dev_labels) @@ -344,8 +345,9 @@ def fit(self, X, y): for epoch in range(self.epochs): if _patience_counter >= self.patience: break - self.train_one_epoch(train_dataloader, epoch) + self.train_one_epoch(train_dataloader) dev_f1_score = self.run_predictions(dev_dataloader, calc_f1=True) + logger.debug("Epoch %s finished. 
Dev F1: %s", epoch, dev_f1_score) dev_f1_score = np.round(dev_f1_score, decimals=3) if dev_f1_score <= best_dev_score: @@ -354,21 +356,20 @@ def fit(self, X, y): _patience_counter = 0 best_dev_score, best_dev_epoch = dev_f1_score, epoch torch.save(self.state_dict(), self.best_model_save_path) + logger.debug("Model weights saved for best dev epoch %s.", best_dev_epoch) - def train_one_epoch(self, train_dataloader, epoch, verbose=False): - # TODO: Remove verbose option - pbar = tqdm(enumerate(train_dataloader), total=len(train_dataloader)) if verbose else enumerate( - train_dataloader) + def train_one_epoch(self, train_dataloader): self.train() train_loss = 0 - for batch_idx, (inputs, mask, labels) in pbar: + for batch_idx, (inputs, mask, labels) in enumerate(train_dataloader): self.optimizer.zero_grad() loss = self.forward(inputs, labels, mask, drop_input=self.drop_input) train_loss += loss.item() loss.backward() self.optimizer.step() - if verbose: - pbar.set_description(f"Epoch:{epoch}, Batch: {batch_idx} Mean Loss:{train_loss / (batch_idx + 1)}") + if batch_idx % 20 == 0: + logger.debug("Batch: %s Mean Loss: %s", batch_idx, + (train_loss / (batch_idx + 1))) def run_predictions(self, dataloader, calc_f1=False): self.eval() @@ -385,7 +386,6 @@ def run_predictions(self, dataloader, calc_f1=False): predictions.extend([x for lst in preds for x in lst] if calc_f1 else preds) if calc_f1: dev_score = f1_score(targets, predictions, average='weighted') - logger.debug(f"Weighted F-1: {dev_score}") return dev_score else: return predictions From 4d6a70cdbb6ed09b1893c835dad6d59b6133175a Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Tue, 3 May 2022 12:38:44 -0700 Subject: [PATCH 06/20] Updated torch version and included torch-crf in existing test --- setup.py | 2 +- tests/models/test_tagging.py | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/setup.py b/setup.py index 1fe8bdcbf..6b72055fd 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ # As a stop gap, we'll pin our version of markupsafe to that last 2.0 version. 
Longer term we need to update # our flask dependency, and likely move it to an extra "markupsafe==2.0.1", - 'torch~=1.7.0; python_version>="3.6"', + 'torch~=1.11.0; python_version>="3.6"', 'pytorch-crf~=0.7.2', ] diff --git a/tests/models/test_tagging.py b/tests/models/test_tagging.py index 085927817..aa7c68080 100644 --- a/tests/models/test_tagging.py +++ b/tests/models/test_tagging.py @@ -271,7 +271,7 @@ def test_get_boundary_counts_sequential( @pytest.mark.parametrize( "model_type,params", - [("memm", {"penalty": "l2", "C": 10000}), ("crf", {"c1": 0.01, "c2": 0.01})], + [("memm", {"penalty": "l2", "C": 10000}), ("crf", {"c1": 0.01, "c2": 0.01}), ("torch-crf", {"feat_type": "dict"})], ) def test_view_extracted_features(kwik_e_mart_nlp, model_type, params): config = { @@ -292,8 +292,8 @@ def test_view_extracted_features(kwik_e_mart_nlp, model_type, params): } er = ( kwik_e_mart_nlp.domains["store_info"] - .intents["get_store_hours"] - .entity_recognizer + .intents["get_store_hours"] + .entity_recognizer ) er.fit(**config) extracted_features = er.view_extracted_features("Main st store hours") @@ -311,6 +311,7 @@ def test_view_extracted_features(kwik_e_mart_nlp, model_type, params): [ ("Main st store hours", "memm", {"penalty": "l2", "C": 10000}), ("Main st store hours", "crf", {"c1": 0.01, "c2": 0.01}), + ("Main st store hours", "torch-crf", {"feat_type": "dict"}) ], ) def test_fetch_distribution(kwik_e_mart_nlp, query, model_type, params): @@ -332,8 +333,8 @@ def test_fetch_distribution(kwik_e_mart_nlp, query, model_type, params): } er = ( kwik_e_mart_nlp.domains["store_info"] - .intents["get_store_hours"] - .entity_recognizer + .intents["get_store_hours"] + .entity_recognizer ) er.fit(**config) processed_query = kwik_e_mart_nlp.create_query(query) @@ -372,8 +373,8 @@ def test_lstm_er_model_no_tf(kwik_e_mart_nlp): } er = ( kwik_e_mart_nlp.domains["store_info"] - .intents["get_store_hours"] - .entity_recognizer + .intents["get_store_hours"] + .entity_recognizer ) with pytest.raises(ValueError) as exc_info: er.fit(**config) @@ -411,8 +412,8 @@ def test_lstm_er_model(kwik_e_mart_nlp): } er = ( kwik_e_mart_nlp.domains["store_info"] - .intents["get_store_hours"] - .entity_recognizer + .intents["get_store_hours"] + .entity_recognizer ) er.fit(**config) response = kwik_e_mart_nlp.process("Does the 156th location open on Saturday?") From 3cd8fed7b413d995bc9e9a7de21db22ff7a09382 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Tue, 3 May 2022 12:41:18 -0700 Subject: [PATCH 07/20] Updated torch version so its compatible with 3.6 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6b72055fd..3a04d201f 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ # As a stop gap, we'll pin our version of markupsafe to that last 2.0 version. 
Longer term we need to update # our flask dependency, and likely move it to an extra "markupsafe==2.0.1", - 'torch~=1.11.0; python_version>="3.6"', + 'torch~=1.10.0; python_version>="3.6"', 'pytorch-crf~=0.7.2', ] From 978f33fb5e403791e041c6b9b1b0997b4e32c137 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Tue, 3 May 2022 13:30:08 -0700 Subject: [PATCH 08/20] Updated torch version in extras requirements --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 3a04d201f..a701b4041 100644 --- a/setup.py +++ b/setup.py @@ -98,7 +98,7 @@ 'tensorflow>=1.13.1,<2.0; python_version >= "3.7"', ], "bert": [ # sentence-transformers - 'torch~=1.7.0; python_version>="3.6"', + 'torch~=1.10.0; python_version>="3.6"', 'transformers~=4.15.0; python_version>="3.6"', 'sentence-transformers~=0.3; python_version>="3.6"', # elasticsearch-py 7.14 breaks backwards compatibility with servers prior to 7.11 @@ -108,7 +108,7 @@ 'connexion>=2.7.0; python_version>="3.6"', ], "augment": [ - 'torch~=1.7.0; python_version>="3.6"', + 'torch~=1.10.0; python_version>="3.6"', 'transformers~=4.15.0; python_version>="3.6"', 'sentencepiece==0.1.91' ], From 6cb9785b8c9c6f0ad932ffec9401c37f5d7d8ba7 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Thu, 19 May 2022 15:30:54 -0700 Subject: [PATCH 09/20] Fix review comments --- mindmeld/models/taggers/pytorch_crf.py | 29 ++++++++++---------------- 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/mindmeld/models/taggers/pytorch_crf.py b/mindmeld/models/taggers/pytorch_crf.py index de048c152..d80c6eb7d 100644 --- a/mindmeld/models/taggers/pytorch_crf.py +++ b/mindmeld/models/taggers/pytorch_crf.py @@ -110,7 +110,7 @@ def custom_collate(sequence): class Encoder: - def __init__(self, feature_extractor="dict", num_feats=None): + def __init__(self, feature_extractor="hash", num_feats=50000): self.feat_extractor = DictVectorizer(dtype=np.float32) if feature_extractor == "dict" else FeatureHasher( n_features=num_feats, dtype=np.float32) @@ -249,7 +249,7 @@ def _compute_log_alpha(self, emissions, mask, run_backwards): i] # (batch_size, num_tags, num_tags) # Sum over all possible current tags, but we're in log prob space, so a sum # becomes a log-sum-exp - score = self._log_sum_exp(score, dim=1) + score = torch.logsumexp(score, dim=1) # Set log_prob to the score if this timestep is valid (mask == 1), otherwise # copy the prior value log_prob.append(score * mask[i].unsqueeze(1) + @@ -273,17 +273,6 @@ def compute_marginal_probabilities(self, inputs, mask): prob = alpha + beta - z.view(1, -1, 1) return torch.exp(prob).transpose(0, 1) - @staticmethod - def _log_sum_exp(tensor_input, dim): - # Find the max value along `dim` - offset, _ = tensor_input.max(dim) - # Make offset broadcastable - broadcast_offset = offset.unsqueeze(dim) - # Perform log-sum-exp safely - safe_log_sum_exp = torch.log(torch.sum(torch.exp(tensor_input - broadcast_offset), dim)) - # Add offset back - return offset + safe_log_sum_exp - def set_params(self, **params): self.feat_type = params.get('feat_type', DEFAULT_PYTORCH_CRF_ER_CONFIG['feat_type']).lower() self.feat_num = params.get('feat_num', DEFAULT_PYTORCH_CRF_ER_CONFIG['feat_num']) @@ -294,11 +283,13 @@ def set_params(self, **params): self.epochs = params.get('epochs', DEFAULT_PYTORCH_CRF_ER_CONFIG['epochs']) self.train_dev_split = params.get('train_dev_split', DEFAULT_PYTORCH_CRF_ER_CONFIG['train_dev_split']) self.optimizer_type = params.get('optimizer_type', 
DEFAULT_PYTORCH_CRF_ER_CONFIG['optimizer_type']).lower() + self.verbose = (logger.getEffectiveLevel() == logging.DEBUG) self.random_state = params.get('random_state', randint(1, 10000001)) self.validate_params() - logger.debug("Random state is %s", self.random_state) + if self.verbose: + logger.debug("Random state for torch-crf is %s", self.random_state) if self.feat_type == "dict" and "feat_num" in params: logger.warning( "WARNING: Number of features is compatible with only `hash` feature type. This value is ignored with `dict` setting", ) @@ -347,8 +338,9 @@ def fit(self, X, y): break self.train_one_epoch(train_dataloader) dev_f1_score = self.run_predictions(dev_dataloader, calc_f1=True) - logger.debug("Epoch %s finished. Dev F1: %s", epoch, dev_f1_score) - dev_f1_score = np.round(dev_f1_score, decimals=3) + if self.verbose: + logger.debug("Epoch %s finished. Dev F1: %s", epoch, dev_f1_score) + # dev_f1_score = np.round(dev_f1_score, decimals=3) if dev_f1_score <= best_dev_score: _patience_counter += 1 @@ -356,7 +348,8 @@ def fit(self, X, y): _patience_counter = 0 best_dev_score, best_dev_epoch = dev_f1_score, epoch torch.save(self.state_dict(), self.best_model_save_path) - logger.debug("Model weights saved for best dev epoch %s.", best_dev_epoch) + if self.verbose: + logger.debug("Model weights saved for best dev epoch %s.", best_dev_epoch) def train_one_epoch(self, train_dataloader): self.train() @@ -367,7 +360,7 @@ def train_one_epoch(self, train_dataloader): train_loss += loss.item() loss.backward() self.optimizer.step() - if batch_idx % 20 == 0: + if batch_idx % 20 == 0 and self.verbose: logger.debug("Batch: %s Mean Loss: %s", batch_idx, (train_loss / (batch_idx + 1))) From ec112dff6ae456e228ef0872071ab823c7fbaffe Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Fri, 20 May 2022 15:37:23 -0700 Subject: [PATCH 10/20] More review comments --- mindmeld/models/taggers/crf.py | 10 ---------- mindmeld/models/taggers/pytorch_crf.py | 17 ++++++----------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/mindmeld/models/taggers/crf.py b/mindmeld/models/taggers/crf.py index bc5b24dd0..58966db9b 100644 --- a/mindmeld/models/taggers/crf.py +++ b/mindmeld/models/taggers/crf.py @@ -186,16 +186,6 @@ def setup_model(self, config): class PyTorchCRF(Tagger): """A Conditional Random Fields model.""" - @staticmethod - def _predict_proba(X): - del X - pass - - @staticmethod - def load(model_path): - del model_path - pass - def fit(self, X, y): self._clf.fit(X, y) return self diff --git a/mindmeld/models/taggers/pytorch_crf.py b/mindmeld/models/taggers/pytorch_crf.py index d80c6eb7d..32490cc00 100644 --- a/mindmeld/models/taggers/pytorch_crf.py +++ b/mindmeld/models/taggers/pytorch_crf.py @@ -52,8 +52,8 @@ def __getitem__(self, index): mask = torch.as_tensor(mask_list, dtype=torch.bool) if self.labels: return self.inputs[index], mask, self.labels[index] - else: - return self.inputs[index], mask + + return self.inputs[index], mask def custom_coo_cat(tensors): @@ -283,13 +283,11 @@ def set_params(self, **params): self.epochs = params.get('epochs', DEFAULT_PYTORCH_CRF_ER_CONFIG['epochs']) self.train_dev_split = params.get('train_dev_split', DEFAULT_PYTORCH_CRF_ER_CONFIG['train_dev_split']) self.optimizer_type = params.get('optimizer_type', DEFAULT_PYTORCH_CRF_ER_CONFIG['optimizer_type']).lower() - self.verbose = (logger.getEffectiveLevel() == logging.DEBUG) self.random_state = params.get('random_state', randint(1, 10000001)) self.validate_params() - if self.verbose: - logger.debug("Random 
state for torch-crf is %s", self.random_state) + logger.debug("Random state for torch-crf is %s", self.random_state) if self.feat_type == "dict" and "feat_num" in params: logger.warning( "WARNING: Number of features is compatible with only `hash` feature type. This value is ignored with `dict` setting", ) @@ -338,9 +336,7 @@ def fit(self, X, y): break self.train_one_epoch(train_dataloader) dev_f1_score = self.run_predictions(dev_dataloader, calc_f1=True) - if self.verbose: - logger.debug("Epoch %s finished. Dev F1: %s", epoch, dev_f1_score) - # dev_f1_score = np.round(dev_f1_score, decimals=3) + logger.debug("Epoch %s finished. Dev F1: %s", epoch, dev_f1_score) if dev_f1_score <= best_dev_score: _patience_counter += 1 @@ -348,8 +344,7 @@ def fit(self, X, y): _patience_counter = 0 best_dev_score, best_dev_epoch = dev_f1_score, epoch torch.save(self.state_dict(), self.best_model_save_path) - if self.verbose: - logger.debug("Model weights saved for best dev epoch %s.", best_dev_epoch) + logger.debug("Model weights saved for best dev epoch %s.", best_dev_epoch) def train_one_epoch(self, train_dataloader): self.train() @@ -360,7 +355,7 @@ def train_one_epoch(self, train_dataloader): train_loss += loss.item() loss.backward() self.optimizer.step() - if batch_idx % 20 == 0 and self.verbose: + if batch_idx % 20 == 0: logger.debug("Batch: %s Mean Loss: %s", batch_idx, (train_loss / (batch_idx + 1))) From 94cdb688267239a1e78e31ae8f9e88198a8d9df1 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Tue, 24 May 2022 16:50:57 -0700 Subject: [PATCH 11/20] Refactored variables and renamed models --- mindmeld/models/tagger_models.py | 6 ++--- mindmeld/models/taggers/crf.py | 6 ++--- mindmeld/models/taggers/pytorch_crf.py | 35 +++++++++++++------------- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/mindmeld/models/tagger_models.py b/mindmeld/models/tagger_models.py index 7f6d29044..529638905 100644 --- a/mindmeld/models/tagger_models.py +++ b/mindmeld/models/tagger_models.py @@ -27,7 +27,7 @@ ) from .model import ModelConfig, Model, PytorchModel, AbstractModelFactory from .nn_utils import get_token_classifier_cls, TokenClassificationType -from .taggers.crf import ConditionalRandomFields, PyTorchCRF +from .taggers.crf import ConditionalRandomFields, TorchCrfTagger from .taggers.memm import MemmModel from ..exceptions import MindMeldError @@ -132,7 +132,7 @@ def _get_model_constructor(self): return { TaggerModel.MEMM_TYPE: MemmModel, TaggerModel.CRF_TYPE: ConditionalRandomFields, - TaggerModel.TORCH_CRF_TYPE: PyTorchCRF, + TaggerModel.TORCH_CRF_TYPE: TorchCrfTagger, TaggerModel.LSTM_TYPE: LstmModel, }[classifier_type] except KeyError as e: @@ -248,7 +248,7 @@ def fit(self, examples, labels, params=None): self._current_params = params else: # run cross validation to select params - if self._clf.__class__ in (LstmModel, PyTorchCRF): + if self._clf.__class__ in (LstmModel, TorchCrfTagger): raise MindMeldError(f"The {self._clf.__class__.__name__} model does not support cross-validation") _, best_params = self._fit_cv(X, y, groups) diff --git a/mindmeld/models/taggers/crf.py b/mindmeld/models/taggers/crf.py index 58966db9b..02f2fad3c 100644 --- a/mindmeld/models/taggers/crf.py +++ b/mindmeld/models/taggers/crf.py @@ -23,7 +23,7 @@ from .taggers import Tagger, extract_sequence_features from ..helpers import FileBackedList -from .pytorch_crf import TorchCRF +from .pytorch_crf import TorchCrfModel logger = logging.getLogger(__name__) @@ -183,7 +183,7 @@ def setup_model(self, config): 
self._feat_binner = FeatureBinner() -class PyTorchCRF(Tagger): +class TorchCrfTagger(Tagger): """A Conditional Random Fields model.""" def fit(self, X, y): @@ -191,7 +191,7 @@ def fit(self, X, y): return self def set_params(self, **parameters): - self._clf = TorchCRF() + self._clf = TorchCrfModel() self._clf.set_params(**parameters) return self diff --git a/mindmeld/models/taggers/pytorch_crf.py b/mindmeld/models/taggers/pytorch_crf.py index 32490cc00..f53e0fdcb 100644 --- a/mindmeld/models/taggers/pytorch_crf.py +++ b/mindmeld/models/taggers/pytorch_crf.py @@ -26,12 +26,12 @@ DEFAULT_PYTORCH_CRF_ER_CONFIG = { "feat_type": "hash", # ["hash", "dict"] "feat_num": 50000, - "stratify": True, + "stratify_train_val_split": True, "drop_input": 0.2, "train_batch_size": 8, "patience": 3, "epochs": 100, - "train_dev_split": 0.15, + "train_val_split": 0.15, "optimizer_type": "sgd", # ["sgd", "adam"] } @@ -56,7 +56,7 @@ def __getitem__(self, index): return self.inputs[index], mask -def custom_coo_cat(tensors): +def diag_concat_coo_tensors(tensors): assert len(tensors) > 0 rows = [] @@ -100,13 +100,13 @@ def init_weights(m): m.bias.data.fill_(0.01) -def custom_collate(sequence): +def collate_tensors_and_masks(sequence): if len(sequence[0]) == 3: sparse_mats, masks, labels = zip(*sequence) - return custom_coo_cat(sparse_mats), torch.stack(masks), torch.stack(labels) + return diag_concat_coo_tensors(sparse_mats), torch.stack(masks), torch.stack(labels) elif len(sequence[0]) == 2: sparse_mats, masks = zip(*sequence) - return custom_coo_cat(sparse_mats), torch.stack(masks) + return diag_concat_coo_tensors(sparse_mats), torch.stack(masks) class Encoder: @@ -119,7 +119,7 @@ def __init__(self, feature_extractor="hash", num_feats=50000): self.num_classes = None self.classes = None self.num_feats = num_feats - self.fit_done = False + self.ready = False def get_tensor_data(self, feat_dicts, labels=None, fit=False): if labels is None: @@ -135,7 +135,7 @@ def get_tensor_data(self, feat_dicts, labels=None, fit=False): self.classes = self.label_encoder.classes_ self.num_classes = len(self.label_encoder.classes_) - self.fit_done = True + self.ready = True feats = [] encoded_labels = [] seq_lens = [len(x) for x in feat_dicts] @@ -160,7 +160,7 @@ def get_tensor_data(self, feat_dicts, labels=None, fit=False): # pylint: disable=too-many-instance-attributes -class TorchCRF(nn.Module): +class TorchCrfModel(nn.Module): def __init__(self): super().__init__() self.optimizer = None @@ -179,7 +179,7 @@ def validate_params(self): f"Optimizer type {self.optimizer_type} not supported. Supported options are ['sgd', 'adam']") elif self.feat_type not in ["hash", "dict"]: raise MindMeldError(f"Feature type {self.feat_type} not supported. Supported options are ['hash', 'dict']") - elif not 0 < self.train_dev_split < 1: + elif not 0 < self.train_val_split < 1: raise MindMeldError("Train-dev split should be a value between 0 and 1.") elif not 0 <= self.drop_input < 1: raise MindMeldError("Drop Input should be a value between 0 and 1. 
(inclusive)") @@ -276,12 +276,13 @@ def compute_marginal_probabilities(self, inputs, mask): def set_params(self, **params): self.feat_type = params.get('feat_type', DEFAULT_PYTORCH_CRF_ER_CONFIG['feat_type']).lower() self.feat_num = params.get('feat_num', DEFAULT_PYTORCH_CRF_ER_CONFIG['feat_num']) - self.stratify = params.get('stratify', DEFAULT_PYTORCH_CRF_ER_CONFIG['stratify']) + self.stratify = params.get('stratify_train_val_split', + DEFAULT_PYTORCH_CRF_ER_CONFIG['stratify_train_val_split']) self.drop_input = params.get('drop_input', DEFAULT_PYTORCH_CRF_ER_CONFIG['drop_input']) self.train_batch_size = params.get('train_batch_size', DEFAULT_PYTORCH_CRF_ER_CONFIG['train_batch_size']) self.patience = params.get('patience', DEFAULT_PYTORCH_CRF_ER_CONFIG['patience']) self.epochs = params.get('epochs', DEFAULT_PYTORCH_CRF_ER_CONFIG['epochs']) - self.train_dev_split = params.get('train_dev_split', DEFAULT_PYTORCH_CRF_ER_CONFIG['train_dev_split']) + self.train_val_split = params.get('train_val_split', DEFAULT_PYTORCH_CRF_ER_CONFIG['train_val_split']) self.optimizer_type = params.get('optimizer_type', DEFAULT_PYTORCH_CRF_ER_CONFIG['optimizer_type']).lower() self.random_state = params.get('random_state', randint(1, 10000001)) @@ -308,7 +309,7 @@ def fit(self, X, y): y.append(copy(y[lone_idx])) X.append(copy(X[lone_idx])) last_one -= 1 - train_X, dev_X, train_y, dev_y = train_test_split(X, y, test_size=self.train_dev_split, + train_X, dev_X, train_y, dev_y = train_test_split(X, y, test_size=self.train_val_split, stratify=stratify_tuples, random_state=self.random_state) # pylint: disable=unbalanced-tuple-unpacking train_inputs, encoded_train_labels, train_seq_lens = self.encoder.get_tensor_data(train_X, train_y, fit=True) @@ -318,9 +319,9 @@ def fit(self, X, y): dev_dataset = TaggerDataset(dev_inputs, dev_seq_lens, encoded_dev_labels) train_dataloader = DataLoader(train_dataset, batch_size=self.train_batch_size, shuffle=True, - collate_fn=custom_collate) + collate_fn=collate_tensors_and_masks) - dev_dataloader = DataLoader(dev_dataset, batch_size=512, shuffle=True, collate_fn=custom_collate) + dev_dataloader = DataLoader(dev_dataset, batch_size=512, shuffle=True, collate_fn=collate_tensors_and_masks) best_dev_score, best_dev_epoch = -np.inf, -1 _patience_counter = 0 @@ -383,7 +384,7 @@ def predict_marginals(self, X): inputs, seq_lens = self.encoder.get_tensor_data(X) torch_dataset = TaggerDataset(inputs, seq_lens) - dataloader = DataLoader(torch_dataset, batch_size=512, shuffle=False, collate_fn=custom_collate) + dataloader = DataLoader(torch_dataset, batch_size=512, shuffle=False, collate_fn=collate_tensors_and_masks) marginals_dict = [] self.eval() with torch.no_grad(): @@ -402,6 +403,6 @@ def predict(self, X): inputs, seq_lens = self.encoder.get_tensor_data(X) torch_dataset = TaggerDataset(inputs, seq_lens) - dataloader = DataLoader(torch_dataset, batch_size=512, shuffle=False, collate_fn=custom_collate) + dataloader = DataLoader(torch_dataset, batch_size=512, shuffle=False, collate_fn=collate_tensors_and_masks) preds = self.run_predictions(dataloader, calc_f1=False) return [self.encoder.label_encoder.inverse_transform(x).tolist() for x in preds] From a90545170efd7015fea5c94d974a2831a9a406d9 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Tue, 24 May 2022 16:54:47 -0700 Subject: [PATCH 12/20] Added test for hash config --- tests/models/test_tagging.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/models/test_tagging.py b/tests/models/test_tagging.py index 
aa7c68080..497e7f863 100644 --- a/tests/models/test_tagging.py +++ b/tests/models/test_tagging.py @@ -271,7 +271,8 @@ def test_get_boundary_counts_sequential( @pytest.mark.parametrize( "model_type,params", - [("memm", {"penalty": "l2", "C": 10000}), ("crf", {"c1": 0.01, "c2": 0.01}), ("torch-crf", {"feat_type": "dict"})], + [("memm", {"penalty": "l2", "C": 10000}), ("crf", {"c1": 0.01, "c2": 0.01}), ("torch-crf", {"feat_type": "dict"}), + ("torch-crf", {"feat_type": "hash"})], ) def test_view_extracted_features(kwik_e_mart_nlp, model_type, params): config = { @@ -311,7 +312,8 @@ def test_view_extracted_features(kwik_e_mart_nlp, model_type, params): [ ("Main st store hours", "memm", {"penalty": "l2", "C": 10000}), ("Main st store hours", "crf", {"c1": 0.01, "c2": 0.01}), - ("Main st store hours", "torch-crf", {"feat_type": "dict"}) + ("Main st store hours", "torch-crf", {"feat_type": "dict"}), + ("Main st store hours", "torch-crf", {"feat_type": "hash"}) ], ) def test_fetch_distribution(kwik_e_mart_nlp, query, model_type, params): From 2ea742e8b6bce4a5822aa60df04fac8318e83e96 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Wed, 8 Jun 2022 01:08:35 -0700 Subject: [PATCH 13/20] Major refactoring and review comments --- mindmeld/models/helpers.py | 8 +- mindmeld/models/tagger_models.py | 5 +- mindmeld/models/taggers/crf.py | 13 +- mindmeld/models/taggers/pytorch_crf.py | 270 ++++++++++++++----------- mindmeld/models/taggers/taggers.py | 2 +- 5 files changed, 179 insertions(+), 119 deletions(-) diff --git a/mindmeld/models/helpers.py b/mindmeld/models/helpers.py index c57c80f48..be8b97c3f 100644 --- a/mindmeld/models/helpers.py +++ b/mindmeld/models/helpers.py @@ -18,6 +18,7 @@ import os import re from tempfile import mkstemp +import numpy as np import nltk from sklearn.metrics import make_scorer @@ -534,6 +535,11 @@ def add_resource(func): return add_resource +def np_encoder(object): + if isinstance(object, np.generic): + return object.item() + + class FileBackedList: """ FileBackedList implements an interface for simple list use cases @@ -553,7 +559,7 @@ def __len__(self): def append(self, line): if self.file_handle is None: self.file_handle = open(self.filename, "w") - self.file_handle.write(json.dumps(line)) + self.file_handle.write(json.dumps(line, default=np_encoder)) self.file_handle.write("\n") self.num_lines += 1 diff --git a/mindmeld/models/tagger_models.py b/mindmeld/models/tagger_models.py index 529638905..2b1e24479 100644 --- a/mindmeld/models/tagger_models.py +++ b/mindmeld/models/tagger_models.py @@ -79,6 +79,7 @@ class TaggerModel(Model): # for default model scoring types ACCURACY_SCORING = "accuracy" SEQ_ACCURACY_SCORING = "seq_accuracy" + # TODO: Rename torch-crf to crf implementation SEQUENCE_MODELS = ["crf", "torch-crf"] DEFAULT_FEATURES = { @@ -248,8 +249,8 @@ def fit(self, examples, labels, params=None): self._current_params = params else: # run cross validation to select params - if self._clf.__class__ in (LstmModel, TorchCrfTagger): - raise MindMeldError(f"The {self._clf.__class__.__name__} model does not support cross-validation") + if isinstance(self._clf, (LstmModel, TorchCrfTagger)): + raise MindMeldError(f"The {type(self._clf).__name__} model does not support cross-validation") _, best_params = self._fit_cv(X, y, groups) self._clf = self._fit(X, y, best_params) diff --git a/mindmeld/models/taggers/crf.py b/mindmeld/models/taggers/crf.py index 02f2fad3c..beafa02f0 100644 --- a/mindmeld/models/taggers/crf.py +++ b/mindmeld/models/taggers/crf.py @@ -270,7 +270,12 
@@ def extract_features(self, (list of list of str): features in CRF suite format """ # Extract features and classes - feats = [] if in_memory else FileBackedList() + feats = [] + # The FileBackedList now has support for indexing but it still loads the list + # eventually into memory cause of the scikit-learn train_test_split function. + if not in_memory: + logger.warning("PyTorch CRF does not currently support STORE_CRF_FEATURES_IN_MEMORY. This may be fixed in " + "a future release.") for _, example in enumerate(examples): feats.append(self.extract_example_features(example, config, resources)) X = self._preprocess_data(feats, fit) @@ -297,7 +302,7 @@ def _preprocess_data(self, X, fit=False): """Converts data into formats of CRF suite. Args: - X (list of dict): features of an example + X (list of list of dict): features of an example fit (bool, optional): True if processing data at fit time, false for predict time. Returns: @@ -316,6 +321,10 @@ def _preprocess_data(self, X, fit=False): def setup_model(self, config): self._feat_binner = FeatureBinner() + def dump(self, path): + print() + pass + # Feature extraction for CRF diff --git a/mindmeld/models/taggers/pytorch_crf.py b/mindmeld/models/taggers/pytorch_crf.py index f53e0fdcb..1f90af394 100644 --- a/mindmeld/models/taggers/pytorch_crf.py +++ b/mindmeld/models/taggers/pytorch_crf.py @@ -4,8 +4,9 @@ import uuid from collections import Counter from copy import copy -from itertools import zip_longest +from itertools import chain from random import randint +import gc import numpy as np import torch @@ -23,18 +24,6 @@ logger = logging.getLogger(__name__) -DEFAULT_PYTORCH_CRF_ER_CONFIG = { - "feat_type": "hash", # ["hash", "dict"] - "feat_num": 50000, - "stratify_train_val_split": True, - "drop_input": 0.2, - "train_batch_size": 8, - "patience": 3, - "epochs": 100, - "train_val_split": 0.15, - "optimizer_type": "sgd", # ["sgd", "adam"] -} - class TaggerDataset(Dataset): def __init__(self, inputs, seq_lens, labels=None): @@ -104,7 +93,7 @@ def collate_tensors_and_masks(sequence): if len(sequence[0]) == 3: sparse_mats, masks, labels = zip(*sequence) return diag_concat_coo_tensors(sparse_mats), torch.stack(masks), torch.stack(labels) - elif len(sequence[0]) == 2: + if len(sequence[0]) == 2: sparse_mats, masks = zip(*sequence) return diag_concat_coo_tensors(sparse_mats), torch.stack(masks) @@ -112,59 +101,95 @@ def collate_tensors_and_masks(sequence): class Encoder: def __init__(self, feature_extractor="hash", num_feats=50000): - self.feat_extractor = DictVectorizer(dtype=np.float32) if feature_extractor == "dict" else FeatureHasher( - n_features=num_feats, dtype=np.float32) + if feature_extractor == "dict": + self.feat_extractor = DictVectorizer(dtype=np.float32) + else: + self.feat_extractor = FeatureHasher(n_features=num_feats, dtype=np.float32) + self.label_encoder = LabelEncoder() - self.feat_extract_type = feature_extractor self.num_classes = None self.classes = None self.num_feats = num_feats - self.ready = False - def get_tensor_data(self, feat_dicts, labels=None, fit=False): - if labels is None: - labels = [] + def get_input_tensors(self, feat_dicts, fit=False): if fit: - if self.feat_extract_type == "dict": + if isinstance(self.feat_extractor, DictVectorizer): comb_dict_list = [x for seq in feat_dicts for x in seq] self.feat_extractor.fit(comb_dict_list) self.num_feats = len(self.feat_extractor.get_feature_names()) - if labels: - self.label_encoder.fit([x for l in labels for x in l]) - self.pad_index = len(self.label_encoder.classes_) - 
1 - self.classes = self.label_encoder.classes_ - self.num_classes = len(self.label_encoder.classes_) - - self.ready = True - feats = [] - encoded_labels = [] - seq_lens = [len(x) for x in feat_dicts] + + pass + + def get_padded_transformed_tensors(self, inputs_or_labels, seq_lens, is_label): + if inputs_or_labels is None: + return None + encoded_tensors = [] max_seq_len = max(seq_lens) - for i, (x, y) in enumerate(zip_longest(feat_dicts, labels)): + for i, x in enumerate(inputs_or_labels): + if not is_label: + padded_encoded_tensor = self.encode_padded_input(seq_lens[i], max_seq_len, x) + else: + padded_encoded_tensor = self.encode_padded_label(seq_lens[i], max_seq_len, x) + encoded_tensors.append(padded_encoded_tensor) + return encoded_tensors + + def get_tensor_data(self, feat_dicts, labels=None, fit=False): + if fit: + if isinstance(self.feat_extractor, DictVectorizer): + flattened_feat_dicts = list(chain.from_iterable(feat_dicts)) + self.feat_extractor.fit(flattened_feat_dicts) + self.num_feats = len(self.feat_extractor.get_feature_names()) + if labels is not None: + flattened_labels = list(chain.from_iterable(labels)) + self.label_encoder.fit(flattened_labels) + self.classes, self.num_classes = self.label_encoder.classes_, len(self.label_encoder.classes_) + + seq_lens = [len(x) for x in feat_dicts] + + encoded_tensor_inputs = self.get_padded_transformed_tensors(feat_dicts, seq_lens, is_label=False) + encoded_tensor_labels = self.get_padded_transformed_tensors(labels, seq_lens, is_label=True) - padded_x = x + [{}] * (max_seq_len - seq_lens[i]) - sparse_feat = self.feat_extractor.transform(padded_x).tocoo() - sparse_feat_tensor = torch.sparse_coo_tensor( - indices=torch.as_tensor(np.stack([sparse_feat.row, sparse_feat.col])), - values=torch.as_tensor(sparse_feat.data), size=sparse_feat.shape) - feats.append(sparse_feat_tensor) + return encoded_tensor_inputs, seq_lens, encoded_tensor_labels - if y: - transformed_label = self.label_encoder.transform(y) - transformed_label = np.pad(transformed_label, pad_width=(0, max_seq_len - seq_lens[i]), - constant_values=self.pad_index) - label_tensor = torch.as_tensor(transformed_label, dtype=torch.long) - encoded_labels.append(label_tensor) - return (feats, encoded_labels, seq_lens) if encoded_labels else (feats, seq_lens) + def encode_padded_input(self, current_seq_len, max_seq_len, x): + padded_x = x + [{}] * (max_seq_len - current_seq_len) + sparse_feat = self.feat_extractor.transform(padded_x).tocoo() + sparse_feat_tensor = torch.sparse_coo_tensor( + indices=torch.as_tensor(np.stack([sparse_feat.row, sparse_feat.col])), + values=torch.as_tensor(sparse_feat.data), size=sparse_feat.shape) + return sparse_feat_tensor + + def encode_padded_label(self, current_seq_len, max_seq_len, y): + transformed_label = self.label_encoder.transform(y) + transformed_label = np.pad(transformed_label, pad_width=(0, max_seq_len - current_seq_len), + constant_values=(self.num_classes - 1)) + label_tensor = torch.as_tensor(transformed_label, dtype=torch.long) + return label_tensor # pylint: disable=too-many-instance-attributes class TorchCrfModel(nn.Module): def __init__(self): super().__init__() - self.optimizer = None + self.optim = None self.encoder = None + self.W = None + self.b = None + self.crf_layer = None + self.num_classes = None + + self.feat_type = None + self.feat_num = None + self.stratify_train_val_split = None + self.drop_input = None + self.batch_size = None + self.patience = None + self.number_of_epochs = None + self.dev_split_ratio = None + 
self.optimizer = None + self.random_state = None + self.best_model_save_path = os.path.join(USER_CONFIG_DIR, "tmp", str(uuid.uuid4()), "best_crf_model.pt") os.makedirs(os.path.dirname(self.best_model_save_path), exist_ok=True) @@ -174,20 +199,19 @@ def set_random_states(self): np.random.seed(self.random_state + 2) def validate_params(self): - if self.optimizer_type not in ["sgd", "adam"]: + if self.optimizer not in ["sgd", "adam"]: raise MindMeldError( f"Optimizer type {self.optimizer_type} not supported. Supported options are ['sgd', 'adam']") - elif self.feat_type not in ["hash", "dict"]: + if self.feat_type not in ["hash", "dict"]: raise MindMeldError(f"Feature type {self.feat_type} not supported. Supported options are ['hash', 'dict']") - elif not 0 < self.train_val_split < 1: + if not 0 < self.dev_split_ratio < 1: raise MindMeldError("Train-dev split should be a value between 0 and 1.") - elif not 0 <= self.drop_input < 1: - raise MindMeldError("Drop Input should be a value between 0 and 1. (inclusive)") - - for x, y in zip([self.feat_num, self.train_batch_size, self.patience, self.epochs], - ["Number of features", "Train Batch size", "Patience", "Number of epochs"]): - if not isinstance(x, int): - raise MindMeldError(f"{y} should be am integer value.") + if not 0 <= self.drop_input < 1: + raise MindMeldError("Drop Input should be a value between 0 (inclusive) and 1.") + if not isinstance(self.patience, int): + raise MindMeldError("Patience should be an integer value.") + if not isinstance(self.number_of_epochs, int): + raise MindMeldError("Number of epochs should be am integer value.") def build_params(self, num_features, num_classes): self.W = nn.Parameter(torch.nn.init.xavier_normal_(torch.empty(size=(num_features, num_classes))), @@ -273,66 +297,68 @@ def compute_marginal_probabilities(self, inputs, mask): prob = alpha + beta - z.view(1, -1, 1) return torch.exp(prob).transpose(0, 1) - def set_params(self, **params): - self.feat_type = params.get('feat_type', DEFAULT_PYTORCH_CRF_ER_CONFIG['feat_type']).lower() - self.feat_num = params.get('feat_num', DEFAULT_PYTORCH_CRF_ER_CONFIG['feat_num']) - self.stratify = params.get('stratify_train_val_split', - DEFAULT_PYTORCH_CRF_ER_CONFIG['stratify_train_val_split']) - self.drop_input = params.get('drop_input', DEFAULT_PYTORCH_CRF_ER_CONFIG['drop_input']) - self.train_batch_size = params.get('train_batch_size', DEFAULT_PYTORCH_CRF_ER_CONFIG['train_batch_size']) - self.patience = params.get('patience', DEFAULT_PYTORCH_CRF_ER_CONFIG['patience']) - self.epochs = params.get('epochs', DEFAULT_PYTORCH_CRF_ER_CONFIG['epochs']) - self.train_val_split = params.get('train_val_split', DEFAULT_PYTORCH_CRF_ER_CONFIG['train_val_split']) - self.optimizer_type = params.get('optimizer_type', DEFAULT_PYTORCH_CRF_ER_CONFIG['optimizer_type']).lower() - self.random_state = params.get('random_state', randint(1, 10000001)) + def set_params(self, feat_type="hash", feat_num=50000, stratify_train_val_split=True, drop_input=0.2, batch_size=8, + patience=3, number_of_epochs=100, dev_split_ratio=0.2, optimizer="sgd", + random_state=randint(1, 10000001)): + + self.feat_type = feat_type # ["hash", "dict"] + self.feat_num = feat_num + self.stratify_train_val_split = stratify_train_val_split + self.drop_input = drop_input + self.batch_size = batch_size + self.patience = patience + self.number_of_epochs = number_of_epochs + self.dev_split_ratio = dev_split_ratio + self.optimizer = optimizer # ["sgd", "adam"] + self.random_state = random_state self.validate_params() 
logger.debug("Random state for torch-crf is %s", self.random_state) - if self.feat_type == "dict" and "feat_num" in params: + if self.feat_type == "dict": logger.warning( - "WARNING: Number of features is compatible with only `hash` feature type. This value is ignored with `dict` setting", ) + "WARNING: Number of features is compatible with only `hash` feature type. This value is ignored with `dict` setting") + + def get_dataloader(self, X, y, is_train): + tensor_inputs, input_seq_lens, tensor_labels = self.encoder.get_tensor_data(X, y, fit=is_train) + tensor_dataset = TaggerDataset(tensor_inputs, input_seq_lens, tensor_labels) + torch_dataloader = DataLoader(tensor_dataset, batch_size=self.batch_size if is_train else 512, shuffle=is_train, + collate_fn=collate_tensors_and_masks) + return torch_dataloader - # pylint: disable=too-many-locals def fit(self, X, y): self.set_random_states() self.encoder = Encoder(feature_extractor=self.feat_type, num_feats=self.feat_num) stratify_tuples = None - if self.stratify: - stratify_tuples = [tuple(sorted(list(set(label)))) for label in y] - # If we have a label class that is only 1 in number, duplicate it, otherwise train_test_split throws error when using stratify! - cnt = Counter(stratify_tuples) - last_one = -1 - while cnt.most_common()[last_one][-1] < 2: - lone_idx = stratify_tuples.index(cnt.most_common()[last_one][0]) - stratify_tuples.append(cnt.most_common()[last_one][0]) - y.append(copy(y[lone_idx])) - X.append(copy(X[lone_idx])) - last_one -= 1 - train_X, dev_X, train_y, dev_y = train_test_split(X, y, test_size=self.train_val_split, + if self.stratify_train_val_split: + stratify_tuples = self.stratify_input(X, y) + + # TODO: Rewrite our own train_test_split function to handle FileBackedList and avoid duplicating unique labels + train_X, dev_X, train_y, dev_y = train_test_split(X, y, test_size=self.dev_split_ratio, stratify=stratify_tuples, random_state=self.random_state) - # pylint: disable=unbalanced-tuple-unpacking - train_inputs, encoded_train_labels, train_seq_lens = self.encoder.get_tensor_data(train_X, train_y, fit=True) - train_dataset = TaggerDataset(train_inputs, train_seq_lens, encoded_train_labels) - # pylint: disable=unbalanced-tuple-unpacking - dev_inputs, encoded_dev_labels, dev_seq_lens = self.encoder.get_tensor_data(dev_X, dev_y, fit=False) - dev_dataset = TaggerDataset(dev_inputs, dev_seq_lens, encoded_dev_labels) - - train_dataloader = DataLoader(train_dataset, batch_size=self.train_batch_size, shuffle=True, - collate_fn=collate_tensors_and_masks) - dev_dataloader = DataLoader(dev_dataset, batch_size=512, shuffle=True, collate_fn=collate_tensors_and_masks) + train_dataloader = self.get_dataloader(train_X, train_y, is_train=True) + dev_dataloader = self.get_dataloader(dev_X, dev_y, is_train=False) - best_dev_score, best_dev_epoch = -np.inf, -1 - _patience_counter = 0 + # desperate attempt to save some memory + del X, y, train_X, train_y, dev_X, dev_y, stratify_tuples + gc.collect() self.build_params(self.encoder.num_feats, self.encoder.num_classes) - if self.optimizer_type == "sgd": - self.optimizer = optim.SGD(self.parameters(), lr=0.01, momentum=0.9, nesterov=True, weight_decay=1e-5) - elif self.optimizer_type == "adam": - self.optimizer = optim.Adam(self.parameters(), weight_decay=1e-5) - for epoch in range(self.epochs): + if self.optimizer == "sgd": + self.optim = optim.SGD(self.parameters(), lr=0.01, momentum=0.9, nesterov=True, weight_decay=1e-5) + if self.optimizer == "adam": + self.optim = 
optim.Adam(self.parameters(), weight_decay=1e-5) + + self.training_loop(train_dataloader, dev_dataloader) + + def training_loop(self, train_dataloader, dev_dataloader): + + best_dev_score, best_dev_epoch = -np.inf, -1 + _patience_counter = 0 + + for epoch in range(self.number_of_epochs): if _patience_counter >= self.patience: break self.train_one_epoch(train_dataloader) @@ -347,15 +373,32 @@ def fit(self, X, y): torch.save(self.state_dict(), self.best_model_save_path) logger.debug("Model weights saved for best dev epoch %s.", best_dev_epoch) + def stratify_input(self, X, y): + def get_unique_tuple(label): + return tuple(sorted(list(set(label)))) + + stratify_tuples = [get_unique_tuple(label) for label in y] + # If we have a label class that is only 1 in number, duplicate it, otherwise train_test_split throws error when using stratify! + cnt = Counter(stratify_tuples) + + for label, count in cnt.most_common()[::-1]: + if count > 1: + break + idx = stratify_tuples.index(label) + X.append(copy(X[idx])) + y.append(copy(y[idx])) + stratify_tuples.append(label) + return stratify_tuples + def train_one_epoch(self, train_dataloader): self.train() train_loss = 0 for batch_idx, (inputs, mask, labels) in enumerate(train_dataloader): - self.optimizer.zero_grad() + self.optim.zero_grad() loss = self.forward(inputs, labels, mask, drop_input=self.drop_input) train_loss += loss.item() loss.backward() - self.optimizer.step() + self.optim.step() if batch_idx % 20 == 0: logger.debug("Batch: %s Mean Loss: %s", batch_idx, (train_loss / (batch_idx + 1))) @@ -381,28 +424,29 @@ def run_predictions(self, dataloader, calc_f1=False): def predict_marginals(self, X): self.load_state_dict(torch.load(self.best_model_save_path)) - inputs, seq_lens = self.encoder.get_tensor_data(X) - torch_dataset = TaggerDataset(inputs, seq_lens) - dataloader = DataLoader(torch_dataset, batch_size=512, shuffle=False, collate_fn=collate_tensors_and_masks) + dataloader = self.get_dataloader(X, None, is_train=False) marginals_dict = [] self.eval() with torch.no_grad(): for inputs, mask in dataloader: probs = self.compute_marginal_probabilities(inputs, mask).tolist() mask = mask.tolist() - # If anyone has any suggestions on a cleaner way to do this, I am all ears! - marginals_dict.extend([[dict(zip(self.encoder.classes, token_probs)) \ - for (token_probs, valid_token) in zip(seq, mask_seq) if valid_token] \ - for seq, mask_seq in zip(probs, mask)]) + + # This is basically to create a nested list-dict structure in which we have the probability values + # for each token for each sequence. 
+ for seq, mask_seq in zip(probs, mask): + one_seq_list = [] + for (token_probs, valid_token) in zip(seq, mask_seq): + if valid_token: + one_seq_list.append(dict(zip(self.encoder.classes, token_probs))) + marginals_dict.append(one_seq_list) return marginals_dict def predict(self, X): self.load_state_dict(torch.load(self.best_model_save_path)) - inputs, seq_lens = self.encoder.get_tensor_data(X) - torch_dataset = TaggerDataset(inputs, seq_lens) + dataloader = self.get_dataloader(X, None, is_train=False) - dataloader = DataLoader(torch_dataset, batch_size=512, shuffle=False, collate_fn=collate_tensors_and_masks) preds = self.run_predictions(dataloader, calc_f1=False) return [self.encoder.label_encoder.inverse_transform(x).tolist() for x in preds] diff --git a/mindmeld/models/taggers/taggers.py b/mindmeld/models/taggers/taggers.py index 66c8821b5..4f58eb725 100644 --- a/mindmeld/models/taggers/taggers.py +++ b/mindmeld/models/taggers/taggers.py @@ -43,7 +43,7 @@ class Tagger: """A class for all sequence tagger models implemented in house. - It is importent to follow this interface exactly when implementing a new model so that your + It is important to follow this interface exactly when implementing a new model so that your model is configured and trained as expected in the MindMeld pipeline. Note that this follows the sklearn estimator interface so that GridSearchCV can be used on our sequence models. """ From aee34ae708432c4f2d5da81d1eb61c0aaf691f4f Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Thu, 9 Jun 2022 12:14:26 -0700 Subject: [PATCH 14/20] I think I fixed the generated model issue and some other review comments and lint looks good --- mindmeld/models/helpers.py | 6 +-- mindmeld/models/tagger_models.py | 28 ++++++---- mindmeld/models/taggers/crf.py | 9 +++- mindmeld/models/taggers/pytorch_crf.py | 71 ++++++++++++++++---------- 4 files changed, 73 insertions(+), 41 deletions(-) diff --git a/mindmeld/models/helpers.py b/mindmeld/models/helpers.py index be8b97c3f..cff744c3d 100644 --- a/mindmeld/models/helpers.py +++ b/mindmeld/models/helpers.py @@ -535,9 +535,9 @@ def add_resource(func): return add_resource -def np_encoder(object): - if isinstance(object, np.generic): - return object.item() +def np_encoder(val): + if isinstance(val, np.generic): + return val.item() class FileBackedList: diff --git a/mindmeld/models/tagger_models.py b/mindmeld/models/tagger_models.py index 2b1e24479..2253c811a 100644 --- a/mindmeld/models/tagger_models.py +++ b/mindmeld/models/tagger_models.py @@ -79,7 +79,7 @@ class TaggerModel(Model): # for default model scoring types ACCURACY_SCORING = "accuracy" SEQ_ACCURACY_SCORING = "seq_accuracy" - # TODO: Rename torch-crf to crf implementation + # TODO: Rename torch-crf to crf implementation. Created https://github.com/cisco/mindmeld/issues/416 for this. 
SEQUENCE_MODELS = ["crf", "torch-crf"] DEFAULT_FEATURES = { @@ -249,7 +249,7 @@ def fit(self, examples, labels, params=None): self._current_params = params else: # run cross validation to select params - if isinstance(self._clf, (LstmModel, TorchCrfTagger)): + if isinstance(self._clf, (TorchCrfTagger, LstmModel)): raise MindMeldError(f"The {type(self._clf).__name__} model does not support cross-validation") _, best_params = self._fit_cv(X, y, groups) @@ -395,13 +395,21 @@ def _dump(self, path): }) else: # underneath tagger dump for LSTM model, returned `model_dir` is None for MEMM & CRF - self._clf.dump(path) - metadata.update({ - "current_params": self._current_params, - "label_encoder": self._label_encoder, - "no_entities": self._no_entities, - "model_config": self.config - }) + if isinstance(self._clf, TorchCrfTagger): + self._clf.dump(path) + metadata.update({ + "model": self, + "model_type": "torch-crf" + }) + elif isinstance(self._clf, LstmModel): + self._clf.dump(path) + metadata.update({ + "current_params": self._current_params, + "label_encoder": self._label_encoder, + "no_entities": self._no_entities, + "model_config": self.config, + "model_type": "lstm" + }) # dump model metadata os.makedirs(os.path.dirname(path), exist_ok=True) @@ -424,7 +432,7 @@ def load(cls, path): # If model is serializable, it can be loaded and used as-is. But if not serializable, # it means we need to create an instance and load necessary details for it to be used. - if not is_serializable: + if not is_serializable and metadata.get('model_type') == 'lstm': model = cls(metadata["model_config"]) # misc resources load diff --git a/mindmeld/models/taggers/crf.py b/mindmeld/models/taggers/crf.py index beafa02f0..9b7aa7883 100644 --- a/mindmeld/models/taggers/crf.py +++ b/mindmeld/models/taggers/crf.py @@ -273,6 +273,7 @@ def extract_features(self, feats = [] # The FileBackedList now has support for indexing but it still loads the list # eventually into memory cause of the scikit-learn train_test_split function. + # Created https://github.com/cisco/mindmeld/issues/417 for this. if not in_memory: logger.warning("PyTorch CRF does not currently support STORE_CRF_FEATURES_IN_MEMORY. This may be fixed in " "a future release.") @@ -321,9 +322,13 @@ def _preprocess_data(self, X, fit=False): def setup_model(self, config): self._feat_binner = FeatureBinner() + @property + def is_serializable(self): + return False + def dump(self, path): - print() - pass + best_model_save_path = os.path.join(os.path.split(path)[0], "best_crf_wts.pt") + self._clf.save_best_weights_path(best_model_save_path) # Feature extraction for CRF diff --git a/mindmeld/models/taggers/pytorch_crf.py b/mindmeld/models/taggers/pytorch_crf.py index 1f90af394..b75df4367 100644 --- a/mindmeld/models/taggers/pytorch_crf.py +++ b/mindmeld/models/taggers/pytorch_crf.py @@ -7,6 +7,7 @@ from itertools import chain from random import randint import gc +import shutil import numpy as np import torch @@ -89,6 +90,24 @@ def init_weights(m): m.bias.data.fill_(0.01) +def stratify_input(X, y): + def get_unique_tuple(label): + return tuple(sorted(list(set(label)))) + + stratify_tuples = [get_unique_tuple(label) for label in y] + # If we have a label class that is only 1 in number, duplicate it, otherwise train_test_split throws error when using stratify! 
+ cnt = Counter(stratify_tuples) + + for label, count in cnt.most_common()[::-1]: + if count > 1: + break + idx = stratify_tuples.index(label) + X.append(copy(X[idx])) + y.append(copy(y[idx])) + stratify_tuples.append(label) + return X, y, stratify_tuples + + def collate_tensors_and_masks(sequence): if len(sequence[0]) == 3: sparse_mats, masks, labels = zip(*sequence) @@ -145,6 +164,7 @@ def get_tensor_data(self, feat_dicts, labels=None, fit=False): self.label_encoder.fit(flattened_labels) self.classes, self.num_classes = self.label_encoder.classes_, len(self.label_encoder.classes_) + # number of tokens in each example seq_lens = [len(x) for x in feat_dicts] encoded_tensor_inputs = self.get_padded_transformed_tensors(feat_dicts, seq_lens, is_label=False) @@ -190,14 +210,24 @@ def __init__(self): self.optimizer = None self.random_state = None - self.best_model_save_path = os.path.join(USER_CONFIG_DIR, "tmp", str(uuid.uuid4()), "best_crf_model.pt") - os.makedirs(os.path.dirname(self.best_model_save_path), exist_ok=True) + self.best_model_save_path = None + self.tmp_save_path = os.path.join(USER_CONFIG_DIR, "tmp", str(uuid.uuid4()), "best_crf_model.pt") + os.makedirs(os.path.dirname(self.tmp_save_path), exist_ok=True) def set_random_states(self): torch.manual_seed(self.random_state) random.seed(self.random_state + 1) np.random.seed(self.random_state + 2) + def save_best_weights_path(self, path): + self.best_model_save_path = path + if not os.path.exists(self.best_model_save_path): + best_weights = torch.load(self.tmp_save_path) + torch.save(best_weights, self.best_model_save_path) + shutil.rmtree(os.path.dirname(self.tmp_save_path)) + # else: + # raise MindMeldError("CRF weights not saved. Please re-train model from scratch.") + def validate_params(self): if self.optimizer not in ["sgd", "adam"]: raise MindMeldError( @@ -297,9 +327,10 @@ def compute_marginal_probabilities(self, inputs, mask): prob = alpha + beta - z.view(1, -1, 1) return torch.exp(prob).transpose(0, 1) + # pylint: disable=too-many-arguments def set_params(self, feat_type="hash", feat_num=50000, stratify_train_val_split=True, drop_input=0.2, batch_size=8, patience=3, number_of_epochs=100, dev_split_ratio=0.2, optimizer="sgd", - random_state=randint(1, 10000001)): + random_state=None): self.feat_type = feat_type # ["hash", "dict"] self.feat_num = feat_num @@ -310,7 +341,7 @@ def set_params(self, feat_type="hash", feat_num=50000, stratify_train_val_split= self.number_of_epochs = number_of_epochs self.dev_split_ratio = dev_split_ratio self.optimizer = optimizer # ["sgd", "adam"] - self.random_state = random_state + self.random_state = random_state or randint(1, 10000001) self.validate_params() @@ -331,7 +362,7 @@ def fit(self, X, y): self.encoder = Encoder(feature_extractor=self.feat_type, num_feats=self.feat_num) stratify_tuples = None if self.stratify_train_val_split: - stratify_tuples = self.stratify_input(X, y) + X, y, stratify_tuples = stratify_input(X, y) # TODO: Rewrite our own train_test_split function to handle FileBackedList and avoid duplicating unique labels train_X, dev_X, train_y, dev_y = train_test_split(X, y, test_size=self.dev_split_ratio, @@ -370,26 +401,9 @@ def training_loop(self, train_dataloader, dev_dataloader): else: _patience_counter = 0 best_dev_score, best_dev_epoch = dev_f1_score, epoch - torch.save(self.state_dict(), self.best_model_save_path) + torch.save(self.state_dict(), self.tmp_save_path) logger.debug("Model weights saved for best dev epoch %s.", best_dev_epoch) - def stratify_input(self, X, y): 
- def get_unique_tuple(label): - return tuple(sorted(list(set(label)))) - - stratify_tuples = [get_unique_tuple(label) for label in y] - # If we have a label class that is only 1 in number, duplicate it, otherwise train_test_split throws error when using stratify! - cnt = Counter(stratify_tuples) - - for label, count in cnt.most_common()[::-1]: - if count > 1: - break - idx = stratify_tuples.index(label) - X.append(copy(X[idx])) - y.append(copy(y[idx])) - stratify_tuples.append(label) - return stratify_tuples - def train_one_epoch(self, train_dataloader): self.train() train_loss = 0 @@ -423,8 +437,10 @@ def run_predictions(self, dataloader, calc_f1=False): return predictions def predict_marginals(self, X): - self.load_state_dict(torch.load(self.best_model_save_path)) - + if self.best_model_save_path: + self.load_state_dict(torch.load(self.best_model_save_path)) + else: + self.load_state_dict(torch.load(self.tmp_save_path)) dataloader = self.get_dataloader(X, None, is_train=False) marginals_dict = [] self.eval() @@ -445,7 +461,10 @@ def predict_marginals(self, X): return marginals_dict def predict(self, X): - self.load_state_dict(torch.load(self.best_model_save_path)) + if self.best_model_save_path: + self.load_state_dict(torch.load(self.best_model_save_path)) + else: + self.load_state_dict(torch.load(self.tmp_save_path)) dataloader = self.get_dataloader(X, None, is_train=False) preds = self.run_predictions(dataloader, calc_f1=False) From b314ad0a3cafec2a87a6c49b9a11aca322e1645a Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Wed, 15 Jun 2022 14:02:25 -0700 Subject: [PATCH 15/20] Fix isinstance error --- mindmeld/models/tagger_models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mindmeld/models/tagger_models.py b/mindmeld/models/tagger_models.py index 2253c811a..053164afb 100644 --- a/mindmeld/models/tagger_models.py +++ b/mindmeld/models/tagger_models.py @@ -248,8 +248,9 @@ def fit(self, examples, labels, params=None): self._clf = self._fit(X, y, params) self._current_params = params else: + non_supported_classes = (TorchCrfTagger, LstmModel) if LstmModel is not None else TorchCrfTagger # run cross validation to select params - if isinstance(self._clf, (TorchCrfTagger, LstmModel)): + if isinstance(self._clf, non_supported_classes): raise MindMeldError(f"The {type(self._clf).__name__} model does not support cross-validation") _, best_params = self._fit_cv(X, y, groups) From 1d458495bcc48cd7f11589a9d37af2b36c36a539 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Thu, 16 Jun 2022 15:32:21 -0700 Subject: [PATCH 16/20] Updated doscstrings and fixed all ending comments --- mindmeld/models/taggers/crf.py | 1 + mindmeld/models/taggers/pytorch_crf.py | 246 ++++++++++++++++++++++--- 2 files changed, 217 insertions(+), 30 deletions(-) diff --git a/mindmeld/models/taggers/crf.py b/mindmeld/models/taggers/crf.py index 9b7aa7883..90c0f01df 100644 --- a/mindmeld/models/taggers/crf.py +++ b/mindmeld/models/taggers/crf.py @@ -190,6 +190,7 @@ def fit(self, X, y): self._clf.fit(X, y) return self + # TODO: Refactor to move initialization into init() or setup_model() def set_params(self, **parameters): self._clf = TorchCrfModel() self._clf.set_params(**parameters) diff --git a/mindmeld/models/taggers/pytorch_crf.py b/mindmeld/models/taggers/pytorch_crf.py index b75df4367..c69bf03e7 100644 --- a/mindmeld/models/taggers/pytorch_crf.py +++ b/mindmeld/models/taggers/pytorch_crf.py @@ -27,6 +27,8 @@ class TaggerDataset(Dataset): + """PyTorch Dataset class used to handle tagger 
inputs, labels and mask""" + def __init__(self, inputs, seq_lens, labels=None): self.inputs = inputs self.labels = labels @@ -47,7 +49,15 @@ def __getitem__(self, index): def diag_concat_coo_tensors(tensors): + """Concatenates sparse PyTorch COO tensors diagonally so that they can processed in batches. + + Args: + tensors (tuple of torch.Tensor): Tuple of sparse COO tensors to diagonally concatenate. + Returns: + stacked_tensor (torch.Tensor): A single sparse COO tensor that acts as a single batch. + """ assert len(tensors) > 0 + logger.debug("Concatenating %s tensors into a diagonal representation.", len(tensors)) rows = [] cols = [] @@ -84,13 +94,21 @@ def diag_concat_coo_tensors(tensors): return torch.sparse_coo_tensor(indices=torch.stack([row, col]), values=value, size=sparse_sizes).coalesce() -def init_weights(m): - if isinstance(m, nn.Linear): - torch.nn.init.xavier_normal_(m.weight) - m.bias.data.fill_(0.01) - - def stratify_input(X, y): + """Gets the input and labels ready for stratification into train and dev data. Stratification is done + based on the presence of unique labels for each sequence. It also duplicates the unique samples across input and labels + to ensure that it doesn't fail with scikit-learn's train_test_split. + + Args: + X (list): Generally a list of feature vectors, one for each training example + y (list): A list of classification labels (encoded by the label_encoder, NOT MindMeld + entity objects) + Returns: + str_X (list): List of feature vectors, ready for stratification. + str_y (list): List of labels, ready for stratification. + stratify_tuples (list): Unique label for each example which will be the value used for stratification.. + """ + def get_unique_tuple(label): return tuple(sorted(list(set(label)))) @@ -109,6 +127,13 @@ def get_unique_tuple(label): def collate_tensors_and_masks(sequence): + """Custom collate function that ensures proper batching of sparse tensors, labels and masks. + + Args: + sequence (list of tuples): Each tuple contains one input tensor, one mask tensor and one label tensor. + Returns: + Batched representation of input, label and mask sequences. + """ if len(sequence[0]) == 3: sparse_mats, masks, labels = zip(*sequence) return diag_concat_coo_tensors(sparse_mats), torch.stack(masks), torch.stack(labels) @@ -118,6 +143,16 @@ def collate_tensors_and_masks(sequence): class Encoder: + """Encoder class that is responsible for the feature extraction and label encoding for the PyTorch model. + + Args: + X (list): Generally a list of feature vectors, one for each training example + y (list): A list of classification labels (encoded by the label_encoder, NOT MindMeld + entity objects) + Returns: + self + """ + def __init__(self, feature_extractor="hash", num_feats=50000): if feature_extractor == "dict": @@ -130,16 +165,16 @@ def __init__(self, feature_extractor="hash", num_feats=50000): self.classes = None self.num_feats = num_feats - def get_input_tensors(self, feat_dicts, fit=False): - if fit: - if isinstance(self.feat_extractor, DictVectorizer): - comb_dict_list = [x for seq in feat_dicts for x in seq] - self.feat_extractor.fit(comb_dict_list) - self.num_feats = len(self.feat_extractor.get_feature_names()) - - pass - def get_padded_transformed_tensors(self, inputs_or_labels, seq_lens, is_label): + """Returns the encoded and padded sparse tensor representations of the inputs/labels. 
+ + Args: + X (list): Generally a list of feature vectors, one for each training example + y (list): A list of classification labels (encoded by the label_encoder, NOT MindMeld + entity objects) + Returns: + encoded_tensors (list of torch.Tensor): PyTorch tensor representation of padded input sequence/labels. + """ if inputs_or_labels is None: return None encoded_tensors = [] @@ -154,6 +189,19 @@ def get_padded_transformed_tensors(self, inputs_or_labels, seq_lens, is_label): return encoded_tensors def get_tensor_data(self, feat_dicts, labels=None, fit=False): + """Gets the feature dicts and labels transformed into padded PyTorch sparse tensor data. + + Args: + feat_dicts (list of list of dicts): Generally a list of feature vectors, one for each training example + y (list of lists): A list of classification labels + fit (bool): Flag to whether fit the Feature Extractor or Label Encoder. + Returns: + encoded_tensor_inputs (list of torch.Tensor): list of Sparse COO tensor representation of + encoded padded input sequence. + seq_lens (list of ints): List of actual length of each sequence. + encoded_tensor_labels (list of torch.Tensor): list of tensors representations of encoded + padded label sequence. + """ if fit: if isinstance(self.feat_extractor, DictVectorizer): flattened_feat_dicts = list(chain.from_iterable(feat_dicts)) @@ -173,6 +221,16 @@ def get_tensor_data(self, feat_dicts, labels=None, fit=False): return encoded_tensor_inputs, seq_lens, encoded_tensor_labels def encode_padded_input(self, current_seq_len, max_seq_len, x): + """Pads the input sequence feature vectors to the max sequence length and returns the sparse + torch tensor representation. + + Args: + current_seq_len (int): Number of tokens in the current example sequence. + max_seq_len (int): Max number of tokens in an example sequence in the current dataset. + x (list of dicts): List of feature vectors, one for each token in the example sequence. + Returns: + sparse_feat_tensor (torch.Tensor): Sparse COO tensor representation of padded input sequence + """ padded_x = x + [{}] * (max_seq_len - current_seq_len) sparse_feat = self.feat_extractor.transform(padded_x).tocoo() sparse_feat_tensor = torch.sparse_coo_tensor( @@ -181,6 +239,16 @@ def encode_padded_input(self, current_seq_len, max_seq_len, x): return sparse_feat_tensor def encode_padded_label(self, current_seq_len, max_seq_len, y): + """Pads the label sequences to the max sequence length and returns the + torch tensor representation. 
+ + Args: + X (list): Generally a list of feature vectors, one for each training example + y (list): A list of classification labels (encoded by the label_encoder, NOT MindMeld + entity objects) + Returns: + label_tensor (torch.Tensor): PyTorch tensor representation of padded label sequence + """ transformed_label = self.label_encoder.transform(y) transformed_label = np.pad(transformed_label, pad_width=(0, max_seq_len - current_seq_len), constant_values=(self.num_classes - 1)) @@ -190,6 +258,8 @@ def encode_padded_label(self, current_seq_len, max_seq_len, y): # pylint: disable=too-many-instance-attributes class TorchCrfModel(nn.Module): + """PyTorch Model Class for Conditional Random Fields""" + def __init__(self): super().__init__() self.optim = None @@ -211,24 +281,33 @@ def __init__(self): self.random_state = None self.best_model_save_path = None + self.ready = False self.tmp_save_path = os.path.join(USER_CONFIG_DIR, "tmp", str(uuid.uuid4()), "best_crf_model.pt") os.makedirs(os.path.dirname(self.tmp_save_path), exist_ok=True) def set_random_states(self): + """Sets the random seeds across all libraries used for deterministic output.""" torch.manual_seed(self.random_state) random.seed(self.random_state + 1) np.random.seed(self.random_state + 2) def save_best_weights_path(self, path): + """Saves the best weights of the model to a path in the .generated folder. + + Args: + path (str): Path to save the best model weights. + """ self.best_model_save_path = path if not os.path.exists(self.best_model_save_path): - best_weights = torch.load(self.tmp_save_path) - torch.save(best_weights, self.best_model_save_path) - shutil.rmtree(os.path.dirname(self.tmp_save_path)) - # else: - # raise MindMeldError("CRF weights not saved. Please re-train model from scratch.") + if os.path.exists(self.tmp_save_path): + best_weights = torch.load(self.tmp_save_path) + torch.save(best_weights, self.best_model_save_path) + shutil.rmtree(os.path.dirname(self.tmp_save_path)) + else: + raise MindMeldError("CRF weights not saved. Please re-train model from scratch.") def validate_params(self): + """Validate the argument values saved into the CRF model. """ if self.optimizer not in ["sgd", "adam"]: raise MindMeldError( f"Optimizer type {self.optimizer_type} not supported. Supported options are ['sgd', 'adam']") @@ -244,15 +323,32 @@ def validate_params(self): raise MindMeldError("Number of epochs should be am integer value.") def build_params(self, num_features, num_classes): + """Sets the parameters for the layers in the PyTorch CRF model. Naming convention is kept + consistent with the CRFSuite implementation. + + Args: + num_features (int): Number of features to use in a FeatureHasher feature extractor. + num_classes (int): Number of classes in the tagging model. + """ self.W = nn.Parameter(torch.nn.init.xavier_normal_(torch.empty(size=(num_features, num_classes))), requires_grad=True) self.b = nn.Parameter(torch.nn.init.constant_(torch.empty(size=(num_classes,)), val=0.01), requires_grad=True) self.crf_layer = CRF(num_classes, batch_first=True) - self.crf_layer.apply(init_weights) self.num_classes = num_classes def forward(self, inputs, targets, mask, drop_input=0.0): + """The forward pass of the PyTorch CRF model. Returns the predictions or loss depending on whether + labels are passed or not. + + Args: + inputs (torch.Tensor): Batch of input tensors to pass through the model. + targets (torch.Tensor): Batch of label tensors. + mask (torch.Tensor) : Batch of mask tensors to account for padded inputs. 
+ drop_input (float): Percentage of features to drop from the input. + Returns: + loss (torch.Tensor or list): Loss from training or predictions for input sequence. + """ if drop_input: dp_mask = (torch.FloatTensor(inputs.values().size()).uniform_() > drop_input) inputs.values()[:] = inputs.values() * dp_mask @@ -264,9 +360,17 @@ def forward(self, inputs, targets, mask, drop_input=0.0): loss = - self.crf_layer(crf_input, targets, mask=mask) return loss - # The below implementation is borrowed from https://github.com/kmkurn/pytorch-crf/pull/37 - def _compute_log_alpha(self, emissions, mask, run_backwards): + """Function used to calculate the alpha and beta probabilities of each token/tag probability. + Implementation is borrowed from https://github.com/kmkurn/pytorch-crf/pull/37. + + Args: + emissions (torch.Tensor): Emission probabilities of batched input sequence. + mask (torch.Tensor): Batch of mask tensors to account for padded inputs. + run_backwards (bool): Flag to decide whether to compute alpha or beta porbabilities + Returns: + log_prob (torch.Tensor): alpha or beta log probabilities of input batch. + """ # emissions: (seq_length, batch_size, num_tags) # mask: (seq_length, batch_size) @@ -315,6 +419,15 @@ def _compute_log_alpha(self, emissions, mask, run_backwards): return torch.stack(log_prob) def compute_marginal_probabilities(self, inputs, mask): + """Function used to calculate the marginal probabilities of each token per tag. + Implementation is borrowed from https://github.com/kmkurn/pytorch-crf/pull/37. + + Args: + inputs (torch.Tensor): Batch of padded input tensors. + mask (torch.Tensor): Batch of mask tensors to account for padded inputs. + Returns: + marginal probabilities for every tag for each token for every sequence. + """ # SWITCHING FOR BATCH FIRST DEFAULT dense_W = torch.tile(self.W, dims=(mask.shape[0], 1)) out_1 = torch.addmm(self.b, inputs, dense_W) @@ -329,8 +442,23 @@ def compute_marginal_probabilities(self, inputs, mask): # pylint: disable=too-many-arguments def set_params(self, feat_type="hash", feat_num=50000, stratify_train_val_split=True, drop_input=0.2, batch_size=8, - patience=3, number_of_epochs=100, dev_split_ratio=0.2, optimizer="sgd", + number_of_epochs=100, patience=3, dev_split_ratio=0.2, optimizer="sgd", random_state=None): + """Set the parameters for the PyTorch CRF model and also validates the parameters. + + Args: + feat_type (str): The type of feature extractor. Supported options are 'dict' and 'hash'. + feat_num (int): The number of features to be used by the FeatureHasher. Is not supported with the DictVectorizer + stratify_train_val_split (bool): Flag to check whether inputs should be stratified during train-dev split. + drop_input (float): The percentage at which to apply a dropout to the input features. + batch_size (int): Training batch size for the model. + number_of_epochs (int): The number of epochs (passes over the training data) to train the model for. + patience (int): Number of epochs to wait for before stopping training if dev score does not improve. + dev_split_ratio (float): Percentage of training data to be used for validation. + optimizer (str): Type of optimizer used for the model. Supported options are 'sgd' and 'adam'. + random_state (int): Integer value to set random seeds for deterministic output. 
+ + """ self.feat_type = feat_type # ["hash", "dict"] self.feat_num = feat_num @@ -351,6 +479,17 @@ def set_params(self, feat_type="hash", feat_num=50000, stratify_train_val_split= "WARNING: Number of features is compatible with only `hash` feature type. This value is ignored with `dict` setting") def get_dataloader(self, X, y, is_train): + """Creates and returns the PyTorch dataloader instance for the training/test data. + + Args: + X (list of list of dicts): Generally a list of feature vectors, one for each training example + y (list of lists or None): A list of classification labels (encoded by the label_encoder, NOT MindMeld + entity objects) + is_train (bool): Whether the dataloader returned is going to be used for training. + Returns: + torch_dataloader (torch.utils.data.dataloader.DataLoader): returns PyTorch dataloader object that can be + used to iterate across the data. + """ tensor_inputs, input_seq_lens, tensor_labels = self.encoder.get_tensor_data(X, y, fit=is_train) tensor_dataset = TaggerDataset(tensor_inputs, input_seq_lens, tensor_labels) torch_dataloader = DataLoader(tensor_dataset, batch_size=self.batch_size if is_train else 512, shuffle=is_train, @@ -358,6 +497,13 @@ def get_dataloader(self, X, y, is_train): return torch_dataloader def fit(self, X, y): + """Trains the entire PyTorch CRF model. + + Args: + X (list of list of dicts): Generally a list of feature vectors, one for each training example + y (list of lists): A list of classification labels (encoded by the label_encoder, NOT MindMeld + entity objects) + """ self.set_random_states() self.encoder = Encoder(feature_extractor=self.feat_type, num_feats=self.feat_num) stratify_tuples = None @@ -383,8 +529,15 @@ def fit(self, X, y): self.optim = optim.Adam(self.parameters(), weight_decay=1e-5) self.training_loop(train_dataloader, dev_dataloader) + self.ready = True def training_loop(self, train_dataloader, dev_dataloader): + """Contains the training loop process where we train the model for specified number of epochs. + + Args: + train_dataloader (torch.utils.data.dataloader.DataLoader): Dataloader for training data + dev_dataloader (torch.utils.data.dataloader.DataLoader): Dataloader for validation data + """ best_dev_score, best_dev_epoch = -np.inf, -1 _patience_counter = 0 @@ -405,6 +558,11 @@ def training_loop(self, train_dataloader, dev_dataloader): logger.debug("Model weights saved for best dev epoch %s.", best_dev_epoch) def train_one_epoch(self, train_dataloader): + """Contains the training code for one epoch. + + Args: + train_dataloader (torch.utils.data.dataloader.DataLoader): Dataloader for training data + """ self.train() train_loss = 0 for batch_idx, (inputs, mask, labels) in enumerate(train_dataloader): @@ -418,6 +576,14 @@ def train_one_epoch(self, train_dataloader): (train_loss / (batch_idx + 1))) def run_predictions(self, dataloader, calc_f1=False): + """Get predictions for the data by running a inference pass of the model. + + Args: + dataloader (torch.utils.data.dataloader.DataLoader): Dataloader for test/validation data + calc_f1 (bool): Flag to return dev f1 score or return predictions for each token. + Returns: + Dev F1 score or predictions for each token in a sequence. 
+ """ self.eval() predictions = [] targets = [] @@ -437,10 +603,20 @@ def run_predictions(self, dataloader, calc_f1=False): return predictions def predict_marginals(self, X): - if self.best_model_save_path: - self.load_state_dict(torch.load(self.best_model_save_path)) + """Get marginal probabilites for each tag per token for each sequence. + + Args: + X (list of list of dicts): Feature vectors for data to predict marginal probabilities on. + Returns: + marginals_dict (list of list of dicts): Returns the probability of every tag for each token in a sequence. + """ + if self.ready: + if self.best_model_save_path: + self.load_state_dict(torch.load(self.best_model_save_path)) + else: + self.load_state_dict(torch.load(self.tmp_save_path)) else: - self.load_state_dict(torch.load(self.tmp_save_path)) + raise MindMeldError("PyTorch-CRF Model does not seem to be trained. Train before running predictions.") dataloader = self.get_dataloader(X, None, is_train=False) marginals_dict = [] self.eval() @@ -461,10 +637,20 @@ def predict_marginals(self, X): return marginals_dict def predict(self, X): - if self.best_model_save_path: - self.load_state_dict(torch.load(self.best_model_save_path)) + """Gets predicted labels for the data. + + Args: + X (list of list of dicts): Feature vectors for data to predict labels on. + Returns: + preds (list of lists): Predictions for each token in each sequence. + """ + if self.ready: + if self.best_model_save_path: + self.load_state_dict(torch.load(self.best_model_save_path)) + else: + self.load_state_dict(torch.load(self.tmp_save_path)) else: - self.load_state_dict(torch.load(self.tmp_save_path)) + raise MindMeldError("PyTorch-CRF Model does not seem to be trained. Train before running predictions.") dataloader = self.get_dataloader(X, None, is_train=False) preds = self.run_predictions(dataloader, calc_f1=False) From 92aa39a0ad81371e3076b22171eae653d0d64794 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Thu, 16 Jun 2022 19:56:42 -0700 Subject: [PATCH 17/20] Changed the way we store temp weights but found out a bug with incremental timestamp --- mindmeld/models/taggers/pytorch_crf.py | 46 ++++++++++---------------- 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/mindmeld/models/taggers/pytorch_crf.py b/mindmeld/models/taggers/pytorch_crf.py index c69bf03e7..b3e4e34a4 100644 --- a/mindmeld/models/taggers/pytorch_crf.py +++ b/mindmeld/models/taggers/pytorch_crf.py @@ -1,13 +1,11 @@ import logging import os import random -import uuid from collections import Counter from copy import copy from itertools import chain from random import randint import gc -import shutil import numpy as np import torch @@ -19,9 +17,9 @@ from torch import optim from torch.utils.data import Dataset, DataLoader from torchcrf import CRF +from tempfile import gettempdir from ...exceptions import MindMeldError -from ...path import USER_CONFIG_DIR logger = logging.getLogger(__name__) @@ -143,15 +141,7 @@ def collate_tensors_and_masks(sequence): class Encoder: - """Encoder class that is responsible for the feature extraction and label encoding for the PyTorch model. 
- - Args: - X (list): Generally a list of feature vectors, one for each training example - y (list): A list of classification labels (encoded by the label_encoder, NOT MindMeld - entity objects) - Returns: - self - """ + """Encoder class that is responsible for the feature extraction and label encoding for the PyTorch model.""" def __init__(self, feature_extractor="hash", num_feats=50000): @@ -169,9 +159,9 @@ def get_padded_transformed_tensors(self, inputs_or_labels, seq_lens, is_label): """Returns the encoded and padded sparse tensor representations of the inputs/labels. Args: - X (list): Generally a list of feature vectors, one for each training example - y (list): A list of classification labels (encoded by the label_encoder, NOT MindMeld - entity objects) + inputs_or_labels (list of list of dicts): Generally a list of feature vectors, one for each training example + seq_lens (list): A list of number of tokens in each sequence + is_label (bool): Flag to indicate whether we are encoding input features or labels. Returns: encoded_tensors (list of torch.Tensor): PyTorch tensor representation of padded input sequence/labels. """ @@ -243,9 +233,9 @@ def encode_padded_label(self, current_seq_len, max_seq_len, y): torch tensor representation. Args: - X (list): Generally a list of feature vectors, one for each training example - y (list): A list of classification labels (encoded by the label_encoder, NOT MindMeld - entity objects) + current_seq_len (int): Number of tokens in the current example sequence. + max_seq_len (int): Max number of tokens in an example sequence in the current dataset. + y (list of dicts): List of labels, one for each token in the example sequence. Returns: label_tensor (torch.Tensor): PyTorch tensor representation of padded label sequence """ @@ -282,8 +272,8 @@ def __init__(self): self.best_model_save_path = None self.ready = False - self.tmp_save_path = os.path.join(USER_CONFIG_DIR, "tmp", str(uuid.uuid4()), "best_crf_model.pt") - os.makedirs(os.path.dirname(self.tmp_save_path), exist_ok=True) + self.tmp_save_path = os.path.join(gettempdir(), "best_crf_wts.pt") + # os.makedirs(os.path.dirname(self.tmp_save_path), exist_ok=True) def set_random_states(self): """Sets the random seeds across all libraries used for deterministic output.""" @@ -298,13 +288,11 @@ def save_best_weights_path(self, path): path (str): Path to save the best model weights. """ self.best_model_save_path = path - if not os.path.exists(self.best_model_save_path): - if os.path.exists(self.tmp_save_path): - best_weights = torch.load(self.tmp_save_path) - torch.save(best_weights, self.best_model_save_path) - shutil.rmtree(os.path.dirname(self.tmp_save_path)) - else: - raise MindMeldError("CRF weights not saved. Please re-train model from scratch.") + if os.path.exists(self.tmp_save_path): + best_weights = torch.load(self.tmp_save_path) + torch.save(best_weights, self.best_model_save_path) + else: + raise MindMeldError("CRF weights not saved. Please re-train model from scratch.") def validate_params(self): """Validate the argument values saved into the CRF model. """ @@ -343,7 +331,7 @@ def forward(self, inputs, targets, mask, drop_input=0.0): Args: inputs (torch.Tensor): Batch of input tensors to pass through the model. - targets (torch.Tensor): Batch of label tensors. + targets (torch.Tensor or None): Batch of label tensors. mask (torch.Tensor) : Batch of mask tensors to account for padded inputs. drop_input (float): Percentage of features to drop from the input. 
Returns: @@ -367,7 +355,7 @@ def _compute_log_alpha(self, emissions, mask, run_backwards): Args: emissions (torch.Tensor): Emission probabilities of batched input sequence. mask (torch.Tensor): Batch of mask tensors to account for padded inputs. - run_backwards (bool): Flag to decide whether to compute alpha or beta porbabilities + run_backwards (bool): Flag to decide whether to compute alpha or beta probabilities. Returns: log_prob (torch.Tensor): alpha or beta log probabilities of input batch. """ From 84ae3166a377971562d0297f07a25dc10561632d Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Wed, 29 Jun 2022 12:00:16 +0530 Subject: [PATCH 18/20] Fixed np encoder and updated docs --- mindmeld/models/helpers.py | 2 ++ mindmeld/models/taggers/crf.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mindmeld/models/helpers.py b/mindmeld/models/helpers.py index cff744c3d..8c27f48dc 100644 --- a/mindmeld/models/helpers.py +++ b/mindmeld/models/helpers.py @@ -538,6 +538,8 @@ def add_resource(func): def np_encoder(val): if isinstance(val, np.generic): return val.item() + else: + raise TypeError(f"{type(val)} cannot be serialized by JSON.") class FileBackedList: diff --git a/mindmeld/models/taggers/crf.py b/mindmeld/models/taggers/crf.py index 90c0f01df..18b60183a 100644 --- a/mindmeld/models/taggers/crf.py +++ b/mindmeld/models/taggers/crf.py @@ -234,7 +234,7 @@ def predict_proba_distribution(self, examples, config, resources): resources (dict): Resources which may be used for this model's feature extraction Returns: - list of tuples of (mindmeld.core.QueryEntity): a list of predicted labels \ + list of list of ((list of str) and (list of float)): a list of predicted labels \ with confidence scores """ X, _, _ = self.extract_features(examples, config, resources, in_memory=True) From 8ad11816341171fab7dda14e378356782888e335 Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Wed, 29 Jun 2022 14:07:39 +0530 Subject: [PATCH 19/20] Fix linting issue --- mindmeld/models/taggers/pytorch_crf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mindmeld/models/taggers/pytorch_crf.py b/mindmeld/models/taggers/pytorch_crf.py index b3e4e34a4..11f493431 100644 --- a/mindmeld/models/taggers/pytorch_crf.py +++ b/mindmeld/models/taggers/pytorch_crf.py @@ -1,3 +1,4 @@ +import gc import logging import os import random @@ -5,7 +6,7 @@ from copy import copy from itertools import chain from random import randint -import gc +from tempfile import gettempdir import numpy as np import torch @@ -17,7 +18,6 @@ from torch import optim from torch.utils.data import Dataset, DataLoader from torchcrf import CRF -from tempfile import gettempdir from ...exceptions import MindMeldError From e2d3a2ec10a2cc56c2c24662b9f7160f063cd76a Mon Sep 17 00:00:00 2001 From: Vinay Damodaran Date: Fri, 8 Jul 2022 14:51:12 -0700 Subject: [PATCH 20/20] Final few nits --- mindmeld/models/helpers.py | 3 +-- mindmeld/models/tagger_models.py | 3 +-- mindmeld/models/taggers/crf.py | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/mindmeld/models/helpers.py b/mindmeld/models/helpers.py index 8c27f48dc..8d60ecb06 100644 --- a/mindmeld/models/helpers.py +++ b/mindmeld/models/helpers.py @@ -538,8 +538,7 @@ def add_resource(func): def np_encoder(val): if isinstance(val, np.generic): return val.item() - else: - raise TypeError(f"{type(val)} cannot be serialized by JSON.") + raise TypeError(f"{type(val)} cannot be serialized by JSON.") class FileBackedList: diff --git 
a/mindmeld/models/tagger_models.py b/mindmeld/models/tagger_models.py index 053164afb..6eb29dac6 100644 --- a/mindmeld/models/tagger_models.py +++ b/mindmeld/models/tagger_models.py @@ -396,14 +396,13 @@ def _dump(self, path): }) else: # underneath tagger dump for LSTM model, returned `model_dir` is None for MEMM & CRF + self._clf.dump(path) if isinstance(self._clf, TorchCrfTagger): - self._clf.dump(path) metadata.update({ "model": self, "model_type": "torch-crf" }) elif isinstance(self._clf, LstmModel): - self._clf.dump(path) metadata.update({ "current_params": self._current_params, "label_encoder": self._label_encoder, diff --git a/mindmeld/models/taggers/crf.py b/mindmeld/models/taggers/crf.py index 18b60183a..8abe5aa7d 100644 --- a/mindmeld/models/taggers/crf.py +++ b/mindmeld/models/taggers/crf.py @@ -245,7 +245,7 @@ def predict_proba_distribution(self, examples, config, resources): for query_index, query_seq in enumerate(seq): tags = [] preds = [] - for i in range(len(query_seq)): + for i, _ in enumerate(query_seq): tags.append(list(marginals_dict[query_index][i].keys())) preds.append(list(marginals_dict[query_index][i].values())) tag_maps.extend(tags)
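
The test parametrizations above exercise the torch-crf tagger with feat_type set to both "dict" and "hash". In an application the same knobs arrive through the params block of the entity recognizer config, whose keys mirror the arguments of TorchCrfModel.set_params. A minimal sketch of such a config; the feature block and the concrete values are illustrative assumptions, not taken from these patches:

# Hypothetical entity recognizer config using the new torch-crf classifier type.
ENTITY_RECOGNIZER_CONFIG = {
    "model_type": "tagger",
    "model_settings": {"classifier_type": "torch-crf"},
    "params": {
        "feat_type": "hash",              # or "dict" to use a DictVectorizer
        "feat_num": 50000,                # only honored when feat_type == "hash"
        "optimizer": "sgd",               # or "adam"
        "batch_size": 8,
        "number_of_epochs": 100,
        "patience": 3,
        "dev_split_ratio": 0.2,
        "drop_input": 0.2,
        "stratify_train_val_split": True,
        "random_state": 42,
    },
    "features": {
        "bag-of-words-seq": {"ngram_lengths_to_start_positions": {1: [-1, 0, 1]}},
        "in-gaz-span-seq": {},
    },
}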
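
Encoder.encode_padded_input pads each example's token feature dicts out to the longest sequence in the batch, featurizes them with a DictVectorizer or FeatureHasher, and wraps the resulting scipy COO matrix in a torch sparse tensor. A self-contained sketch of that conversion on toy feature dicts:

import numpy as np
import torch
from sklearn.feature_extraction import DictVectorizer

# Two real tokens padded with one empty dict up to a max sequence length of 3.
padded_example = [{"word|store": 1.0, "bias": 1.0}, {"word|hours": 1.0}, {}]

vectorizer = DictVectorizer(dtype=np.float32)
sparse_feat = vectorizer.fit_transform(padded_example).tocoo()

sparse_feat_tensor = torch.sparse_coo_tensor(
    indices=torch.as_tensor(np.stack([sparse_feat.row, sparse_feat.col])),
    values=torch.as_tensor(sparse_feat.data),
    size=sparse_feat.shape,
)
print(sparse_feat_tensor.to_dense())  # 3 tokens x 3 features; the padded row is all zeros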
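
stratify_input collapses every label sequence to the sorted tuple of its unique tags and duplicates any example whose tuple occurs only once, since scikit-learn's train_test_split refuses to stratify on a class with a single member. A standalone sketch of that preprocessing on toy labels; the duplicated X, y and stratify lists are what fit then hands to train_test_split:

from collections import Counter
from copy import copy

def stratify_input(X, y):
    # One hashable "class" per example: the sorted tuple of its unique tags.
    stratify_tuples = [tuple(sorted(set(label))) for label in y]
    cnt = Counter(stratify_tuples)
    # Walk the classes from rarest to most common and duplicate singletons.
    for label, count in cnt.most_common()[::-1]:
        if count > 1:
            break
        idx = stratify_tuples.index(label)
        X.append(copy(X[idx]))
        y.append(copy(y[idx]))
        stratify_tuples.append(label)
    return X, y, stratify_tuples

X = [["main", "st", "store", "hours"], ["store", "hours"], ["elm", "street"]]
y = [["B-loc", "I-loc", "O", "O"], ["O", "O"], ["B-loc", "I-loc"]]
X, y, strat = stratify_input(X, y)
print(Counter(strat))  # every label tuple now has at least two members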
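
TorchCrfModel delegates the sequence-level loss and Viterbi decoding to the third-party pytorch-crf package. A minimal sketch of that library's interface in isolation, with random emissions standing in for the addmm(b, inputs, W) scores the tagger actually computes:

import torch
from torchcrf import CRF

num_tags, batch_size, seq_len = 3, 2, 4
crf = CRF(num_tags, batch_first=True)

emissions = torch.randn(batch_size, seq_len, num_tags)     # per-token tag scores
tags = torch.zeros(batch_size, seq_len, dtype=torch.long)  # toy gold labels
mask = torch.ones(batch_size, seq_len, dtype=torch.bool)   # no padded positions in this toy batch

loss = -crf(emissions, tags, mask=mask)        # negative log-likelihood, as in forward()
best_paths = crf.decode(emissions, mask=mask)  # one list of tag indices per sequence
print(loss.item(), best_paths)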
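
The drop_input parameter is applied directly to the stored values of the sparse input batch rather than through an nn.Dropout layer, so the sparsity structure is kept while a random subset of feature values is zeroed. The same trick on a tiny sparse tensor:

import torch

torch.manual_seed(0)
x = torch.sparse_coo_tensor(indices=[[0, 1, 2, 3]], values=[1.0, 1.0, 1.0, 1.0], size=(4,)).coalesce()

drop_input = 0.5
# Bernoulli mask over the stored non-zero entries only; dropped entries remain
# in the index structure but contribute nothing to the emission scores.
dp_mask = torch.FloatTensor(x.values().size()).uniform_() > drop_input
x.values()[:] = x.values() * dp_mask
print(x.to_dense())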
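
predict_marginals returns one list per query and, for every unmasked token, a dict mapping each tag to its marginal probability. A toy example of consuming that nested structure to recover per-token best tags and confidences (the tag names are placeholders):

marginals = [
    [{"O": 0.10, "B-store_name": 0.90}, {"O": 0.70, "B-store_name": 0.30}],
    [{"O": 0.95, "B-store_name": 0.05}],
]

for query_probs in marginals:
    best = [(max(token_probs, key=token_probs.get), max(token_probs.values()))
            for token_probs in query_probs]
    print(best)
# [('B-store_name', 0.9), ('O', 0.7)]
# [('O', 0.95)]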
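
np_encoder exists because json.dumps cannot serialize numpy scalar types, which can show up in the feature values that FileBackedList writes out line by line. The failure mode and the fix in isolation (the feature key is a placeholder):

import json
import numpy as np

def np_encoder(val):
    # Unwrap numpy scalars (np.float32, np.int64, ...) into native Python types.
    if isinstance(val, np.generic):
        return val.item()
    raise TypeError(f"{type(val)} cannot be serialized by JSON.")

feature_row = {"bag-of-words|word:store": np.float32(1.0)}

# json.dumps(feature_row) alone raises: Object of type float32 is not JSON serializable
print(json.dumps(feature_row, default=np_encoder))  # {"bag-of-words|word:store": 1.0}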