diff --git a/CHANGELOG.md b/CHANGELOG.md
index e9ca384..32c43aa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.0.4] - 2024-01-31
+
+### Added
+- Add a `force_minimum` parameter to `artificial_ssl_dataset` to force a minimum number of labeled instances of each class. Issue #11
+
+### Fixed
+- DeTriTraining is now vectorized and runs faster than before.
+
 # [1.0.3.1] - 2023-04-01
 
 ### Changed
diff --git a/pytest.ini b/pytest.ini
index 9528b2e..21cdd60 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -7,3 +7,5 @@ filterwarnings =
     ignore:invalid value encountered in divide
     ignore:Poolsize
     ignore:y contains no unlabeled samples
+    ignore::FutureWarning
+    ignore::DeprecationWarning
diff --git a/requirements.txt b/requirements.txt
index af65cc9..8c3e48a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-joblib==1.2.0
-numpy==1.23.3
-pandas==1.4.3
-scikit_learn==1.2.0
-scipy==1.10.1
-statsmodels==0.13.2
+joblib>=1.2.0
+numpy>=1.23.3
+pandas>=1.4.3
+scikit_learn>=1.2.0
+scipy>=1.10.1
+statsmodels>=0.13.2
diff --git a/sslearn/__init__.py b/sslearn/__init__.py
index ce6460e..6b0785a 100644
--- a/sslearn/__init__.py
+++ b/sslearn/__init__.py
@@ -1,4 +1,4 @@
-__version__='1.0.3.1'
+__version__='1.0.4'
 __AUTHOR__="José Luis Garrido-Labrador" # Author of the package
 __AUTHOR_EMAIL__="jlgarrido@ubu.es" # Author's email
 __URL__="https://pypi.org/project/sslearn/"
diff --git a/sslearn/base.py b/sslearn/base.py
index cb5f722..158010d 100644
--- a/sslearn/base.py
+++ b/sslearn/base.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pandas as pd
 import scipy.sparse as sp
-from joblib import Parallel
+from joblib import Parallel, delayed
 from sklearn.base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin
 from sklearn.base import clone as skclone
 from sklearn.base import is_classifier
@@ -14,7 +14,6 @@
                                 _predict_binary)
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils import check_X_y, check_array
-from sklearn.utils.fixes import delayed
 from sklearn.utils.validation import check_is_fitted
 from sklearn.utils.metaestimators import available_if
 from sklearn.ensemble._base import _set_random_states
@@ -61,9 +60,15 @@ def predict(self, X):
             Array with predicted labels.
         """
         predicted_probabilitiy = self.predict_proba(X)
-        return self.classes_.take((np.argmax(predicted_probabilitiy, axis=1)),
+        classes = self.classes_.take((np.argmax(predicted_probabilitiy, axis=1)),
                                   axis=0)
+        # If a label_encoder_ attribute exists, use it to recover the original labels
+        if hasattr(self, "label_encoder_"):
+            classes = self.label_encoder_.inverse_transform(classes)
+
+        return classes
+
 
 class FakedProbaClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
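
For context, a minimal sketch of what the new `predict` branch in `sslearn/base.py` does when a fitted estimator carries a `label_encoder_` attribute: encoded class indices are mapped back to the original labels. The data below is illustrative only, not taken from the library.

```python
# Illustrative only: the same inverse mapping predict() now applies
# when the estimator exposes a fitted label_encoder_ attribute.
import numpy as np
from sklearn.preprocessing import LabelEncoder

y_train = np.array(["bird", "cat", "dog", "cat"])
encoder = LabelEncoder().fit(y_train)      # classes_: ['bird' 'cat' 'dog']

# Suppose argmax over predict_proba produced these encoded predictions
encoded = np.array([1, 0, 2])

print(encoder.inverse_transform(encoded))  # ['cat' 'bird' 'dog']
```
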
diff --git a/sslearn/model_selection/_split.py b/sslearn/model_selection/_split.py
index ca36084..581f6ee 100644
--- a/sslearn/model_selection/_split.py
+++ b/sslearn/model_selection/_split.py
@@ -1,4 +1,5 @@
 import sklearn.model_selection as ms
+from sklearn.utils import check_random_state
 import numpy as np
 
 
@@ -46,7 +47,7 @@ def split(self, X, y):
         yield X_, y_, label, unlabel
 
 
-def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, **kwards):
+def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, force_minimum=None, **kwards):
     """Create an artificial Semi-supervised dataset from a supervised dataset.
 
     Parameters
@@ -60,6 +61,8 @@ def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, **kwards):
         Proportion between labeled instances and unlabel instances, by default 0.1
     random_state : int or RandomState, optional
         Controls the shuffling applied to the data before applying the split. Pass an int for reproducible output across multiple function calls, by default None
+    force_minimum : int, optional
+        Force a minimum number of labeled instances of each class, by default None
     shuffle: bool, default=True
         Whether or not to shuffle the data before splitting. If shuffle=False then stratify must be None.
     stratify: array-like, default=None
@@ -80,6 +83,19 @@ def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, **kwards):
         "Label rate must be in (0, 1)."
     assert "test_size" not in kwards and "train_size" not in kwards,\
         "Test size and train size are illegal parameters in this method."
+
+    if force_minimum is not None:
+        try:
+            selected = __random_select_n_instances(y, force_minimum, random_state)
+        except ValueError:
+            raise ValueError("Some class has fewer instances than force_minimum.")
+        X_selected = X[selected]
+        y_selected = y[selected]
+
+        # Remove the selected instances from X and y
+        X = np.delete(X, selected, axis=0)
+        y = np.delete(y, selected, axis=0)
+
     X_label, X_unlabel, y_label, true_label = \
         ms.train_test_split(X, y,
                             train_size=label_rate,
@@ -87,4 +103,20 @@ def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, **kwards):
 
     X = np.concatenate((X_label, X_unlabel), axis=0)
     y = np.concatenate((y_label, np.array([-1] * len(true_label))), axis=0)
+    if force_minimum is not None:
+        X = np.concatenate((X, X_selected), axis=0)
+        y = np.concatenate((y, y_selected), axis=0)
+
     return X, y, X_unlabel, true_label
+
+
+def __random_select_n_instances(y, n, random_state):
+    # Select n instances of each class at random
+    classes = np.unique(y)
+    selected = []
+    random_state = check_random_state(random_state)
+    for c in classes:
+        idx = np.where(y == c)[0]
+        selected.append(random_state.choice(idx, n, replace=False))
+    selected = np.concatenate(selected)
+    return selected
\ No newline at end of file
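
A hedged usage sketch of the new parameter, assuming `artificial_ssl_dataset` is re-exported from `sslearn.model_selection` as the tests at the end of this patch suggest: `force_minimum` holds back n instances of each class before the split and appends them to the labeled portion, so every class keeps at least n labels.

```python
# Sketch under the assumption that sslearn.model_selection re-exports
# artificial_ssl_dataset (as the tests in this patch do).
import numpy as np
from sklearn.datasets import load_iris
from sslearn.model_selection import artificial_ssl_dataset

X, y = load_iris(return_X_y=True)
X, y, X_unlabel, true_label = artificial_ssl_dataset(
    X, y, label_rate=0.1, random_state=0, force_minimum=2
)

labeled = y[y != -1]  # unlabeled instances are marked with -1
for c in np.unique(labeled):
    assert (labeled == c).sum() >= 2  # every class keeps >= 2 labels
```
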
diff --git a/sslearn/wrapper/_co.py b/sslearn/wrapper/_co.py
index 70b12ca..7b266f1 100644
--- a/sslearn/wrapper/_co.py
+++ b/sslearn/wrapper/_co.py
@@ -779,7 +779,8 @@ def __init__(
         random_state=None,
         n_jobs=None,
     ):
-        """Co-Training with relevant random subspaces
+        """
+        Co-Training with relevant random subspaces
 
         Yaslan, Y., & Cataltepe, Z. (2010).
         Co-training with relevant random subspaces.
@@ -1019,7 +1020,6 @@ def score(self, X, y, sample_weight=None):
         return self.ensemble_estimator.score(X, y, sample_weight)
 
 
-# Done and tested
 class CoForest(BaseCoTraining):
     def __init__(self, base_estimator=DecisionTreeClassifier(), n_estimators=7, threshold=0.75, bootstrap=True, n_jobs=None, random_state=None, version="1.0.3"):
diff --git a/sslearn/wrapper/_tritraining.py b/sslearn/wrapper/_tritraining.py
index ce77809..4370c7d 100644
--- a/sslearn/wrapper/_tritraining.py
+++ b/sslearn/wrapper/_tritraining.py
@@ -10,6 +10,7 @@
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.utils import check_random_state, resample
+from sklearn.preprocessing import LabelEncoder
 from sklearn.exceptions import ConvergenceWarning
 
 from ..base import get_dataset
@@ -557,9 +558,8 @@ def _depure(self, S):
         tuple (X, y)
             Enlarged dataset with instances where at least k_neighbors/2+1 have the same class.
         """
-        # k_neighbors +1 to ignore the own instance.
-        knn = KNeighborsClassifier(n_neighbors=self.k_neighbors + 1, n_jobs=self.n_jobs)
+        knn = KNeighborsClassifier(n_neighbors=self.k_neighbors, n_jobs=self.n_jobs)
         valid = knn.fit(*S).predict(S[0]) == S[1]
         return S[0][valid], S[1][valid]
 
     def _clustering(self, S, X):
@@ -589,22 +589,29 @@ class predicted for each instance
         for k in clusters:
             centroids[k] = np.mean(S[0][S[1] == k], axis=0)
 
-        def seeded(x):
-            min_ = np.inf
-            k_min = None
-            for k in centroids:
-                candidate = np.linalg.norm(x - centroids[k])
-                if candidate < min_ or k_min is None:
-                    min_ = candidate
-                    k_min = k
-            return k_min
-
-        def constrained(x):
-            candidate = S[1][(S[0] == x).sum(axis=1) == X.shape[1]]
-            if len(candidate) == 0:
-                return seeded(x)
-            else:
-                return candidate[0]
+        def seeded(X):
+            # For each instance, calculate the distance to each centroid
+            distances = np.linalg.norm(X[:, None, :] - np.array(list(centroids.values())), axis=2)
+            # Get the index of the nearest centroid
+            return np.argmin(distances, axis=1)
+
+        def constrained(X):
+            # Calculate the distances to centroids using broadcasting
+            distances = np.linalg.norm(X[:, None, :] - np.array(list(centroids.values())), axis=2)
+            # Get the index of the nearest centroid
+            nearest = np.argmin(distances, axis=1)
+            # Create a mask to find instances in X that belong to S[0]
+            mask = (S[0] == X[:, None])
+            # Find the row and column indices where all elements are True
+            i, j = np.where(mask.all(axis=2))
+            # Initialize cluster with -1
+            cluster = np.full(X.shape[0], -1, dtype=int)
+            # Update cluster for the instances found in S[0]
+            cluster[i] = S[1][j]
+            # Update cluster for instances not found in S[0]
+            cluster[cluster == -1] = nearest[cluster == -1]
+
+            return cluster
 
         if self.mode == "seeded":
             op = seeded
@@ -617,7 +624,6 @@ def constrained(x):
             changes = False
             iterations += 1
-            # Need to vectorize
-            new_clusters = np.apply_along_axis(op, 1, X)
+            new_clusters = op(X)
             new_centroids = dict()
             for k in clusters:
                 if np.any(new_clusters == k):
@@ -645,6 +651,10 @@ def fit(self, X, y, **kwards):
         """
         X_label, y_label, X_unlabel = get_dataset(X, y)
 
+        self.label_encoder_ = LabelEncoder()
+        self.label_encoder_.fit(y_label)
+        y_label = self.label_encoder_.transform(y_label)
+
         is_df = isinstance(X_label, pd.DataFrame)
 
         self.classes_ = np.unique(y_label)
@@ -687,9 +697,9 @@ def fit(self, X, y, **kwards):
             S_.append((X_sampled, y_sampled))
 
-        changes = True
+        changes = True
         last_addition = [0] * self._N_LEARNER
-        it = 0 if X_unlabel.shape[0] > 0 else self.max_iterations
+        it = 0 if X_unlabel.shape[0] > 0 else self.max_iterations
         while it < self.max_iterations:
             it += 1
             changes = False
@@ -704,6 +714,7 @@ def fit(self, X, y, **kwards):
                 L[i] = (X_unlabel[validx] if not is_df else X_unlabel.iloc[validx, :], y_p[validx])
 
             for i, _ in enumerate(L):
-                S_[i] = np.concatenate((X_label, L[i][0])) if not is_df else pd.concat([X_label, L[i][0]]), np.concatenate((y_label, L[i][1]))
-                S_[i] = self._depure(S_[i])
+                if len(L[i][0]) > 0:
+                    S_[i] = np.concatenate((X_label, L[i][0])) if not is_df else pd.concat([X_label, L[i][0]]), np.concatenate((y_label, L[i][1]))
+                    S_[i] = self._depure(S_[i])
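
The core of the DeTriTraining speed-up is replacing the per-row `np.apply_along_axis` call with one broadcast distance computation. A standalone sketch of the pattern follows; the names and data here are illustrative, not the library's API.

```python
# Standalone sketch of the broadcasting trick used in seeded()/constrained().
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(6, 2)                              # 6 instances, 2 features
centroids = np.array([[0.2, 0.2], [0.8, 0.8]])  # one row per cluster

# (6, 1, 2) - (2, 2) broadcasts to (6, 2, 2); the norm over the last
# axis yields a (6, 2) instance-to-centroid distance matrix.
distances = np.linalg.norm(X[:, None, :] - centroids, axis=2)

# Nearest centroid per instance, in one shot instead of a Python loop.
nearest = np.argmin(distances, axis=1)
print(nearest)  # array of 0/1 cluster indices
```
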
diff --git a/test/test_model_selection.py b/test/test_model_selection.py
index 02dc99c..69331be 100644
--- a/test/test_model_selection.py
+++ b/test/test_model_selection.py
@@ -1,6 +1,6 @@
 import os
 import sys
-
+import numpy as np
 import pytest
 
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
@@ -14,6 +14,25 @@ def test_artificial_ssl_dataset():
     assert X_unlabel.shape[0] == true_label.shape[0]
     assert X_unlabel.shape[0]/X.shape[0] == pytest.approx(0.9)
 
+def test_artificial_ssl_dataset_with_force_minimum():
+    X, y = load_iris(return_X_y=True)
+    # Save one instance of the first class
+    first_class = np.unique(y)[0]
+    X_0 = X[y == first_class][0]
+    y_0 = y[y == first_class][0]
+    # Remove the first class, then add back the single saved instance
+    X = X[y != first_class]
+    y = y[y != first_class]
+    X = np.concatenate((X, [X_0]), axis=0)
+    y = np.concatenate((y, [y_0]), axis=0)
+
+    X, y, X_unlabel, true_label = artificial_ssl_dataset(X, y, label_rate=0.02, force_minimum=1)
+    assert X_unlabel.shape[0] == true_label.shape[0]
+    for i in np.unique(y):
+        assert np.sum(y == i) >= 1
+
+    pytest.raises(ValueError, artificial_ssl_dataset, X, y, label_rate=0.02, force_minimum=2)
+
 def test_StratifiedKFoldSS():
     X, y = load_iris(return_X_y=True)
     splits = 5
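
To complement the new test, a small sketch of the failure path, under the same import assumption as above: requesting more instances per class than exist makes the internal selection fail, and `artificial_ssl_dataset` re-raises a `ValueError`.

```python
# Sketch: force_minimum larger than the rarest class triggers ValueError.
import numpy as np
import pytest
from sslearn.model_selection import artificial_ssl_dataset

X = np.arange(10, dtype=float).reshape(5, 2)
y = np.array([0, 0, 0, 0, 1])  # class 1 has a single instance

with pytest.raises(ValueError):
    artificial_ssl_dataset(X, y, label_rate=0.5, force_minimum=2)
```
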