From 31fe2ef551a580f013f8852bb87e13efa44fd00e Mon Sep 17 00:00:00 2001
From: benleetownsend
Date: Thu, 18 Nov 2021 10:56:48 +0000
Subject: [PATCH] FIX: remove imblearn (#665)

* FIX: remove imblearn

* FIX: remove config option

* FIX: remove oversample test
---
 finetune/config.py                   |  2 --
 finetune/target_models/classifier.py | 16 ----------------
 requirements.txt                     |  1 -
 tests/test_classifier.py             | 10 ----------
 4 files changed, 29 deletions(-)

diff --git a/finetune/config.py b/finetune/config.py
index 8095a09f7..1caa9fbf9 100644
--- a/finetune/config.py
+++ b/finetune/config.py
@@ -116,7 +116,6 @@ class Settings(dict):
     :param num_layers_trained: How many layers to finetune. Specifying a value less than model's number of layers will train layers starting from model output. Defaults to `12`.
     :param train_embeddings: Should embedding layer be finetuned? Defaults to `True`.
     :param class_weights: One of 'log', 'linear', or 'sqrt'. Auto-scales gradient updates based on class frequency. Can also be a dictionary that maps from true class name to loss coefficient. Defaults to `None`.
-    :param oversample: Should rare classes be oversampled? Defaults to `False`.
     :param eval_acc: if True, calculates accuracy and writes it to the tensorboard summary files for valudation runs.
     :param save_dtype: specifies what precision to save model weights with. Defaults to `np.float32`.
     :param regression_loss: the loss to use for regression models. One of `L1` or `L2`, defaults to `L2`.
@@ -244,7 +243,6 @@ def get_default_config():
         #
         # Class Imbalance
         class_weights=None,
-        oversample=False,
         #
         # Optimization Params
         optimizer="AdamW",
diff --git a/finetune/target_models/classifier.py b/finetune/target_models/classifier.py
index 36410c456..bb4d2c224 100644
--- a/finetune/target_models/classifier.py
+++ b/finetune/target_models/classifier.py
@@ -4,7 +4,6 @@
 import tensorflow as tf
 import tensorflow_addons as tfa
 import numpy as np
-from imblearn.over_sampling import RandomOverSampler
 from sklearn.utils import shuffle
 
 from finetune.base import BaseModel
@@ -16,21 +15,6 @@
 
 
 class ClassificationPipeline(BasePipeline):
-    def resampling(self, Xs, Y, context=None):
-        if context is not None:
-            if self.config.oversample:
-                idxs, Ys, contexts = shuffle(
-                    *RandomOverSampler().fit_sample([[i] for i in range(len(Xs))], Y, context)
-                )
-                return [Xs[i[0]] for i in idxs], Ys, contexts
-            return Xs, Y, context
-        else:
-            if self.config.oversample:
-                idxs, Ys = shuffle(
-                    *RandomOverSampler().fit_sample([[i] for i in range(len(Xs))], Y)
-                )
-                return [Xs[i[0]] for i in idxs], Ys, None
-            return Xs, Y, None
     def _target_encoder(self):
         return OneHotLabelEncoder()
 
diff --git a/requirements.txt b/requirements.txt
index 2e06b6e03..f38997032 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,6 @@ pytest>=3.6.3
 h5py>=2.8.0
 joblib>=0.12.0
 bs4>=0.0.1
-imbalanced-learn>=0.6.0,<0.7.0
 nltk>=3.2.4
 regex>=2019.03.12
 lxml>=4.3.3
diff --git a/tests/test_classifier.py b/tests/test_classifier.py
index e2d35e4fe..d9b466f35 100644
--- a/tests/test_classifier.py
+++ b/tests/test_classifier.py
@@ -214,16 +214,6 @@ def test_fit_predict_low_memory(self):
         for proba in probabilities:
             self.assertIsInstance(proba, dict)
 
-    def test_oversample(self):
-        """
-        Ensure model training does not error out when oversampling is set to True
-        """
-
-        model = Classifier(**self.default_config())
-        model.config.oversample = True
-        train_sample = self.dataset.sample(n=self.n_sample)
-        model.fit(train_sample.Text.values, train_sample.Target.values)
-
     def test_class_weights(self):
        # testing class weights
        train_sample = self.dataset.sample(n=self.n_sample * 3)
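
With the built-in `oversample` option gone, callers that relied on it can either switch to the `class_weights` setting documented in finetune/config.py above, or oversample their training data themselves before calling `fit`. A minimal sketch of the manual route follows; it is only an assumed usage, and the `manual_oversample` helper plus the `texts`/`labels` variables are hypothetical, not part of this patch or the library.

    from collections import Counter

    from sklearn.utils import resample, shuffle
    from finetune import Classifier

    def manual_oversample(texts, labels, random_state=42):
        # Hypothetical helper: resample every class (with replacement)
        # up to the majority-class count.
        counts = Counter(labels)
        target_n = max(counts.values())
        out_texts, out_labels = [], []
        for label in counts:
            idxs = [i for i, lbl in enumerate(labels) if lbl == label]
            picked = resample(idxs, replace=True, n_samples=target_n,
                              random_state=random_state)
            out_texts.extend(texts[i] for i in picked)
            out_labels.extend(labels[i] for i in picked)
        # Shuffle so duplicates of one class are not grouped together.
        return shuffle(out_texts, out_labels, random_state=random_state)

    # Option 1: keep the data as-is and let finetune reweight the loss
    # by class frequency.
    model = Classifier(class_weights="log")

    # Option 2: oversample up front, then train as usual
    # (texts and labels are user-supplied, hypothetical data).
    # texts, labels = manual_oversample(texts, labels)
    # model.fit(texts, labels)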