Version 1.0.4 #12

Merged · 6 commits · Jan 31, 2024
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.0.4] - 2024-01-31
+
+### Added
+- Add a parameter to `artificial_ssl_dataset` to force a minimum number of instances of each class. Issue #11
+
+### Fixed
+- DeTriTraining is now vectorized and is faster than before.
+
 # [1.0.3.1] - 2023-04-01
 
 ### Changed
2 changes: 2 additions & 0 deletions pytest.ini
@@ -7,3 +7,5 @@ filterwarnings =
     ignore:invalid value encountered in divide
     ignore:Poolsize
     ignore:y contains no unlabeled samples
+    ignore::FutureWarning
+    ignore::DeprecationWarning
12 changes: 6 additions & 6 deletions requirements.txt
@@ -1,6 +1,6 @@
-joblib==1.2.0
-numpy==1.23.3
-pandas==1.4.3
-scikit_learn==1.2.0
-scipy==1.10.1
-statsmodels==0.13.2
+joblib>=1.2.0
+numpy>=1.23.3
+pandas>=1.4.3
+scikit_learn>=1.2.0
+scipy>=1.10.1
+statsmodels>=0.13.2
2 changes: 1 addition & 1 deletion sslearn/__init__.py
@@ -1,4 +1,4 @@
-__version__='1.0.3.1'
+__version__='1.0.4'
 __AUTHOR__="José Luis Garrido-Labrador" # Author of the package
 __AUTHOR_EMAIL__="jlgarrido@ubu.es" # Author's email
 __URL__="https://pypi.org/project/sslearn/"
11 changes: 8 additions & 3 deletions sslearn/base.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pandas as pd
 import scipy.sparse as sp
-from joblib import Parallel
+from joblib import Parallel, delayed
 from sklearn.base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin
 from sklearn.base import clone as skclone
 from sklearn.base import is_classifier
@@ -14,7 +14,6 @@
                              _predict_binary)
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils import check_X_y, check_array
-from sklearn.utils.fixes import delayed
 from sklearn.utils.validation import check_is_fitted
 from sklearn.utils.metaestimators import available_if
 from sklearn.ensemble._base import _set_random_states
@@ -61,9 +60,15 @@ def predict(self, X):
         Array with predicted labels.
         """
         predicted_probabilitiy = self.predict_proba(X)
-        return self.classes_.take((np.argmax(predicted_probabilitiy, axis=1)),
+        classes = self.classes_.take((np.argmax(predicted_probabilitiy, axis=1)),
                                    axis=0)
+
+        # If a label_encoder_ attribute exists, use it to transform the classes back
+        if hasattr(self, "label_encoder_"):
+            classes = self.label_encoder_.inverse_transform(classes)
+
+        return classes
 
 
 class FakedProbaClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
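Why the `predict` change is needed: `DeTriTraining.fit` (see the `sslearn/wrapper/_tritraining.py` diff below) now trains on `LabelEncoder`-encoded labels, so `predict` must map the encoded classes back to the caller's original labels. A minimal sketch of that round trip, with illustrative labels rather than the package's own code:

```python
# Sketch of the label_encoder_ round trip assumed by predict().
# The labels and the fake predictions are illustrative only.
import numpy as np
from sklearn.preprocessing import LabelEncoder

y = np.array(["cat", "dog", "cat", "bird"])
label_encoder_ = LabelEncoder()
y_enc = label_encoder_.fit_transform(y)   # classes sorted: bird=0, cat=1, dog=2

# Suppose a model trained on y_enc predicts these encoded classes:
classes = np.array([0, 2, 1])
print(label_encoder_.inverse_transform(classes))  # ['bird' 'dog' 'cat']
```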
34 changes: 33 additions & 1 deletion sslearn/model_selection/_split.py
@@ -1,4 +1,5 @@
 import sklearn.model_selection as ms
+from sklearn.utils import check_random_state
 import numpy as np
 
 
@@ -46,7 +47,7 @@ def split(self, X, y):
            yield X_, y_, label, unlabel
 
 
-def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, **kwards):
+def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, force_minimum=None, **kwards):
     """Create an artificial Semi-supervised dataset from a supervised dataset.
 
     Parameters
@@ -60,6 +61,8 @@ def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, **kwards):
        Proportion between labeled instances and unlabeled instances, by default 0.1
     random_state : int or RandomState, optional
        Controls the shuffling applied to the data before applying the split. Pass an int for reproducible output across multiple function calls, by default None
+    force_minimum: int, optional
+        Force a minimum number of instances of each class, by default None
     shuffle: bool, default=True
        Whether or not to shuffle the data before splitting. If shuffle=False then stratify must be None.
     stratify: array-like, default=None
@@ -80,11 +83,40 @@ def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, **kwards):
        "Label rate must be in (0, 1)."
     assert "test_size" not in kwards and "train_size" not in kwards,\
        "Test size and train size are illegal parameters in this method."
+
+    if force_minimum is not None:
+        try:
+            selected = __random_select_n_instances(y, force_minimum, random_state)
+        except ValueError:
+            raise ValueError("The number of instances of each class is less than force_minimum.")
+        X_selected = X[selected]
+        y_selected = y[selected]
+
+        # Remove the selected instances from X and y
+        X = np.delete(X, selected, axis=0)
+        y = np.delete(y, selected, axis=0)
+
     X_label, X_unlabel, y_label, true_label = \
         ms.train_test_split(X, y,
                             train_size=label_rate,
                             random_state=random_state, **kwards)
     X = np.concatenate((X_label, X_unlabel), axis=0)
     y = np.concatenate((y_label, np.array([-1] * len(true_label))), axis=0)
+
+    if force_minimum is not None:
+        X = np.concatenate((X, X_selected), axis=0)
+        y = np.concatenate((y, y_selected), axis=0)
+
     return X, y, X_unlabel, true_label
 
+def __random_select_n_instances(y, n, random_state):
+
+    # Select n instances of each class randomly
+    classes = np.unique(y)
+    selected = []
+    random_state = check_random_state(random_state)
+    for c in classes:
+        idx = np.where(y == c)[0]
+        selected.append(random_state.choice(idx, n, replace=False))
+    selected = np.concatenate(selected)
+    return selected
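Putting the new parameter together: `force_minimum=n` reserves `n` random instances of every class before the split and appends them, still labeled, afterwards, so each class keeps at least `n` labeled instances even at very low label rates. A usage sketch, assuming the function is re-exported from `sslearn.model_selection` (the dataset and rate are only an example):

```python
# Usage sketch for force_minimum; import path assumed from the test file below.
import numpy as np
from sklearn.datasets import load_iris
from sslearn.model_selection import artificial_ssl_dataset

X, y = load_iris(return_X_y=True)
X, y, X_unlabel, true_label = artificial_ssl_dataset(
    X, y, label_rate=0.05, random_state=0, force_minimum=2)

labeled = y[y != -1]                  # unlabeled instances are marked with -1
for c in np.unique(labeled):
    assert np.sum(labeled == c) >= 2  # every class keeps >= 2 labeled rows
```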
4 changes: 2 additions & 2 deletions sslearn/wrapper/_co.py
@@ -779,7 +779,8 @@ def __init__(
         random_state=None,
         n_jobs=None,
     ):
-        """Co-Training with relevant random subspaces
+        """
+        Co-Training with relevant random subspaces
 
         Yaslan, Y., & Cataltepe, Z. (2010).
         Co-training with relevant random subspaces.
@@ -1019,7 +1020,6 @@ def score(self, X, y, sample_weight=None):
 
         return self.ensemble_estimator.score(X, y, sample_weight)
 
-
 # Done and tested
 class CoForest(BaseCoTraining):
     def __init__(self, base_estimator=DecisionTreeClassifier(), n_estimators=7, threshold=0.75, bootstrap=True, n_jobs=None, random_state=None, version="1.0.3"):
58 changes: 37 additions & 21 deletions sslearn/wrapper/_tritraining.py
@@ -10,13 +10,16 @@
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.utils import check_random_state, resample
+from sklearn.preprocessing import LabelEncoder
 from sklearn.exceptions import ConvergenceWarning
 
 from ..base import get_dataset
 from ..restricted import WhoIsWhoClassifier, combine_predictions
 from ..utils import check_classifier, check_n_jobs, safe_division
 from ._co import BaseCoTraining
 
+import time
+
 
 class TriTraining(BaseCoTraining):
 
@@ -557,9 +560,10 @@ def _depure(self, S):
         tuple (X, y)
             Enlarged dataset with instances where at least k_neighbors/2+1 have the same class.
         """
-        # k_neighbors + 1 so that the instance's own vote is ignored.
-        knn = KNeighborsClassifier(n_neighbors=self.k_neighbors + 1, n_jobs=self.n_jobs)
+        init = time.time()
+        knn = KNeighborsClassifier(n_neighbors=self.k_neighbors, n_jobs=self.n_jobs)
         valid = knn.fit(*S).predict(S[0]) == S[1]
+        print(f"Depure time: {time.time() - init}")
         return S[0][valid], S[1][valid]
 
     def _clustering(self, S, X):
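A subtlety behind the dropped `+ 1`: `_depure` predicts on the same instances it fits, so each instance is its own nearest neighbor and effectively votes for itself, which is what the removed comment compensated for. A toy demonstration on illustrative data:

```python
# With n_neighbors=1 on the training set itself, every point is its own
# nearest neighbor (distance 0), so the "prediction" always matches.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])

knn = KNeighborsClassifier(n_neighbors=1).fit(X, y)
print(knn.predict(X) == y)  # [ True  True  True  True]
```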
@@ -589,22 +593,29 @@ class predicted for each instance
         for k in clusters:
             centroids[k] = np.mean(S[0][S[1] == k], axis=0)
 
-        def seeded(x):
-            min_ = np.inf
-            k_min = None
-            for k in centroids:
-                candidate = np.linalg.norm(x - centroids[k])
-                if candidate < min_ or k_min is None:
-                    min_ = candidate
-                    k_min = k
-            return k_min
-
-        def constrained(x):
-            candidate = S[1][(S[0] == x).sum(axis=1) == X.shape[1]]
-            if len(candidate) == 0:
-                return seeded(x)
-            else:
-                return candidate[0]
+        def seeded(X):
+            # For each instance, calculate the distance to each centroid
+            distances = np.linalg.norm(X[:, None, :] - np.array(list(centroids.values())), axis=2)
+            # Get the index of the nearest centroid
+            return np.argmin(distances, axis=1)
+
+        def constrained(X):
+            # Calculate the distances to centroids using broadcasting
+            distances = np.linalg.norm(X[:, None, :] - np.array(list(centroids.values())), axis=2)
+            # Get the index of the nearest centroid
+            nearest = np.argmin(distances, axis=1)
+            # Create a mask to find instances in X that belong to S[0]
+            mask = (S[0] == X[:, None])
+            # Find the row and column indices where all elements are True
+            i, j = np.where(mask.all(axis=2))
+            # Initialize cluster with -1
+            cluster = np.full(X.shape[0], -1, dtype=int)
+            # Update cluster for the instances found in S[0]
+            cluster[i] = S[1][j]
+            # Update cluster for instances not found in S[0]
+            cluster[cluster == -1] = nearest[cluster == -1]
+
+            return cluster
 
         if self.mode == "seeded":
             op = seeded
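This is the vectorization behind the changelog entry: instead of looping over centroids per instance in Python, the new `seeded` builds the full instance-to-centroid distance matrix with NumPy broadcasting and takes one `argmin` per row. A standalone sketch of the pattern (shapes are illustrative):

```python
# Broadcasted nearest-centroid assignment, as in the new seeded():
# X is (n, d) and centroids is (k, d); X[:, None, :] - centroids
# broadcasts to (n, k, d), the norm over the last axis gives an (n, k)
# distance matrix, and argmin over axis 1 picks each row's centroid.
import numpy as np

X = np.array([[0.0, 0.0], [9.0, 9.0], [1.0, 0.0]])
centroids = np.array([[0.0, 0.0], [10.0, 10.0]])

distances = np.linalg.norm(X[:, None, :] - centroids, axis=2)
print(distances.shape)               # (3, 2)
print(np.argmin(distances, axis=1))  # [0 1 0]
```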
@@ -617,7 +628,7 @@ def constrained(x):
             changes = False
             iterations += 1
             # Need to vectorize
-            new_clusters = np.apply_along_axis(op, 1, X)
+            new_clusters = op(X)
             new_centroids = dict()
             for k in clusters:
                 if np.any(new_clusters == k):
@@ -645,6 +656,10 @@ def fit(self, X, y, **kwards):
         """
         X_label, y_label, X_unlabel = get_dataset(X, y)
 
+        self.label_encoder_ = LabelEncoder()
+        self.label_encoder_.fit(y_label)
+        y_label = self.label_encoder_.transform(y_label)
+
         is_df = isinstance(X_label, pd.DataFrame)
 
         self.classes_ = np.unique(y_label)
@@ -687,9 +702,9 @@ def fit(self, X, y, **kwards):
 
             S_.append((X_sampled, y_sampled))
 
-        changes = True
+        changes = True
         last_addition = [0] * self._N_LEARNER
-        it = 0 if X_unlabel.shape[0] > 0 else self.max_iterations
+        it = 0 if X_unlabel.shape[0] > 0 else self.max_iterations
         while it < self.max_iterations:
             it += 1
             changes = False
@@ -704,6 +719,7 @@ def fit(self, X, y, **kwards):
                 L[i] = (X_unlabel[validx] if not is_df else X_unlabel.iloc[validx, :], y_p[validx])
 
             for i, _ in enumerate(L):
+
                 if len(L[i][0]) > 0:
                     S_[i] = np.concatenate((X_label, L[i][0])) if not is_df else pd.concat([X_label, L[i][0]]), np.concatenate((y_label, L[i][1]))
                     S_[i] = self._depure(S_[i])
21 changes: 20 additions & 1 deletion test/test_model_selection.py
@@ -1,6 +1,6 @@
 import os
 import sys
-
+import numpy as np
 import pytest
 
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
@@ -14,6 +14,25 @@ def test_artificial_ssl_dataset():
     assert X_unlabel.shape[0] == true_label.shape[0]
     assert X_unlabel.shape[0]/X.shape[0] == pytest.approx(0.9)
 
+def test_artificial_ssl_dataset_with_force_minimum():
+    X, y = load_iris(return_X_y=True)
+    # Reduce the first class to a single instance
+    first_class = np.unique(y)[0]
+    X_0 = X[y == first_class][0]
+    y_0 = y[y == first_class][0]
+    # Keep only one instance of the first class
+    X = X[y != first_class]
+    y = y[y != first_class]
+    X = np.concatenate((X, [X_0]), axis=0)
+    y = np.concatenate((y, [y_0]), axis=0)
+
+    X, y, X_unlabel, true_label = artificial_ssl_dataset(X, y, label_rate=0.02, force_minimum=1)
+    assert X_unlabel.shape[0] == true_label.shape[0]
+    for i in np.unique(y):
+        assert np.sum(y == i) >= 1
+
+    pytest.raises(ValueError, artificial_ssl_dataset, X, y, label_rate=0.02, force_minimum=2)
+
 def test_StratifiedKFoldSS():
     X, y = load_iris(return_X_y=True)
     splits = 5