diff --git a/finetune/config.py b/finetune/config.py index 8095a09f..1caa9fbf 100644 --- a/finetune/config.py +++ b/finetune/config.py @@ -116,7 +116,6 @@ class Settings(dict): :param num_layers_trained: How many layers to finetune. Specifying a value less than model's number of layers will train layers starting from model output. Defaults to `12`. :param train_embeddings: Should embedding layer be finetuned? Defaults to `True`. :param class_weights: One of 'log', 'linear', or 'sqrt'. Auto-scales gradient updates based on class frequency. Can also be a dictionary that maps from true class name to loss coefficient. Defaults to `None`. - :param oversample: Should rare classes be oversampled? Defaults to `False`. :param eval_acc: if True, calculates accuracy and writes it to the tensorboard summary files for valudation runs. :param save_dtype: specifies what precision to save model weights with. Defaults to `np.float32`. :param regression_loss: the loss to use for regression models. One of `L1` or `L2`, defaults to `L2`. @@ -244,7 +243,6 @@ def get_default_config(): # # Class Imbalance class_weights=None, - oversample=False, # # Optimization Params optimizer="AdamW", diff --git a/finetune/target_models/classifier.py b/finetune/target_models/classifier.py index 36410c45..bb4d2c22 100644 --- a/finetune/target_models/classifier.py +++ b/finetune/target_models/classifier.py @@ -4,7 +4,6 @@ import tensorflow as tf import tensorflow_addons as tfa import numpy as np -from imblearn.over_sampling import RandomOverSampler from sklearn.utils import shuffle from finetune.base import BaseModel @@ -16,21 +15,6 @@ class ClassificationPipeline(BasePipeline): - def resampling(self, Xs, Y, context=None): - if context is not None: - if self.config.oversample: - idxs, Ys, contexts = shuffle( - *RandomOverSampler().fit_sample([[i] for i in range(len(Xs))], Y, context) - ) - return [Xs[i[0]] for i in idxs], Ys, contexts - return Xs, Y, context - else: - if self.config.oversample: - idxs, Ys = shuffle( - *RandomOverSampler().fit_sample([[i] for i in range(len(Xs))], Y) - ) - return [Xs[i[0]] for i in idxs], Ys, None - return Xs, Y, None def _target_encoder(self): return OneHotLabelEncoder() diff --git a/requirements.txt b/requirements.txt index 2e06b6e0..f3899703 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,6 @@ pytest>=3.6.3 h5py>=2.8.0 joblib>=0.12.0 bs4>=0.0.1 -imbalanced-learn>=0.6.0,<0.7.0 nltk>=3.2.4 regex>=2019.03.12 lxml>=4.3.3 diff --git a/tests/test_classifier.py b/tests/test_classifier.py index e2d35e4f..d9b466f3 100644 --- a/tests/test_classifier.py +++ b/tests/test_classifier.py @@ -214,16 +214,6 @@ def test_fit_predict_low_memory(self): for proba in probabilities: self.assertIsInstance(proba, dict) - def test_oversample(self): - """ - Ensure model training does not error out when oversampling is set to True - """ - - model = Classifier(**self.default_config()) - model.config.oversample = True - train_sample = self.dataset.sample(n=self.n_sample) - model.fit(train_sample.Text.values, train_sample.Target.values) - def test_class_weights(self): # testing class weights train_sample = self.dataset.sample(n=self.n_sample * 3)