From cd7c7f3938291991c23f90936570021eb2339452 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Tue, 22 Feb 2022 20:40:35 +0100 Subject: [PATCH 1/5] First try to fix initialization issue --- odte/Odte.py | 73 +++++++++++++++++++++------------------- odte/_version.py | 2 +- odte/tests/Odte_tests.py | 12 +++++++ 3 files changed, 52 insertions(+), 35 deletions(-) diff --git a/odte/Odte.py b/odte/Odte.py index 036be0a..4b2809f 100644 --- a/odte/Odte.py +++ b/odte/Odte.py @@ -26,6 +26,35 @@ from ._version import __version__ +def _parallel_build_tree( + base_estimator_: Stree, + X: np.ndarray, + y: np.ndarray, + weights: np.ndarray, + random_box: np.random.mtrand.RandomState, + random_seed: int, + boot_samples: int, + max_features: int, + hyperparams: str, +) -> Tuple[BaseEstimator, Tuple[int, ...]]: + clf = base_estimator_ + hyperparams_ = json.loads(hyperparams) + hyperparams_.update(dict(random_state=random_seed)) + clf.set_params(**hyperparams_) + n_samples = X.shape[0] + # bootstrap + indices = random_box.randint(0, n_samples, boot_samples) + # update weights with the chosen samples + weights_update = np.bincount(indices, minlength=n_samples) + current_weights = weights * weights_update + # random subspace + features = Odte._get_random_subspace(X, y, max_features) + # train the classifier + bootstrap = X[indices, :] + clf.fit(bootstrap[:, features], y[indices], current_weights[indices]) + return (clf, features) + + class Odte(BaseEnsemble, ClassifierMixin): def __init__( self, @@ -109,45 +138,18 @@ def _compute_metrics(self) -> None: self.leaves_ = tleaves / self.n_estimators self.nodes_ = tnodes / self.n_estimators - @staticmethod - def _parallel_build_tree( - base_estimator_: Stree, - X: np.ndarray, - y: np.ndarray, - weights: np.ndarray, - random_box: np.random.mtrand.RandomState, - random_seed: int, - boot_samples: int, - max_features: int, - hyperparams: str, - ) -> Tuple[BaseEstimator, Tuple[int, ...]]: - clf = clone(base_estimator_) - 
hyperparams_ = json.loads(hyperparams) - hyperparams_.update(dict(random_state=random_seed)) - clf.set_params(**hyperparams_) - n_samples = X.shape[0] - # bootstrap - indices = random_box.randint(0, n_samples, boot_samples) - # update weights with the chosen samples - weights_update = np.bincount(indices, minlength=n_samples) - current_weights = weights * weights_update - # random subspace - features = Odte._get_random_subspace(X, y, max_features) - # train the classifier - bootstrap = X[indices, :] - clf.fit(bootstrap[:, features], y[indices], current_weights[indices]) - return (clf, features) - def _train( self, X: np.ndarray, y: np.ndarray, weights: np.ndarray ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]: random_box = self._initialize_random() n_samples = X.shape[0] boot_samples = self._get_bootstrap_n_samples(n_samples) - clf = clone(self.base_estimator_) + estimator = [] + for i in range(self.n_estimators): + estimator.append(clone(self.base_estimator_)) return Parallel(n_jobs=self.n_jobs, prefer="threads")( # type: ignore - delayed(Odte._parallel_build_tree)( - clf, + delayed(_parallel_build_tree)( + estimator[i], X, y, weights, @@ -157,8 +159,11 @@ def _train( self.max_features_, self.be_hyperparams, ) - for random_seed in range( - self.random_state, self.random_state + self.n_estimators + for random_seed, i in zip( + range( + self.random_state, self.random_state + self.n_estimators + ), + range(self.n_estimators), ) ) diff --git a/odte/_version.py b/odte/_version.py index 260c070..f9aa3e1 100644 --- a/odte/_version.py +++ b/odte/_version.py @@ -1 +1 @@ -__version__ = "0.3.1" +__version__ = "0.3.2" diff --git a/odte/tests/Odte_tests.py b/odte/tests/Odte_tests.py index f19f063..21abf07 100644 --- a/odte/tests/Odte_tests.py +++ b/odte/tests/Odte_tests.py @@ -257,3 +257,15 @@ def test_base_estimator_hyperparams(self): def test_version(self): tclf = Odte() self.assertEqual(__version__, tclf.version()) + + def test_parallel_score(self): + tclf_p = Odte( 
+ n_jobs=-1, random_state=self._random_state, n_estimators=30 + ) + tclf_s = Odte( + n_jobs=1, random_state=self._random_state, n_estimators=30 + ) + X, y = load_dataset(self._random_state, n_features=56, n_samples=1500) + tclf_p.fit(X, y) + tclf_s.fit(X, y) + self.assertAlmostEqual(tclf_p.score(X, y), tclf_s.score(X, y)) From 3766886190396cf6916fa8ba690ba26f79136a03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Wed, 23 Feb 2022 12:02:59 +0100 Subject: [PATCH 2/5] Fix np.random initialization --- odte/Odte.py | 19 +++-------- odte/tests/Odte_tests.py | 68 +++++++++++++++++++++------------------- 2 files changed, 40 insertions(+), 47 deletions(-) diff --git a/odte/Odte.py b/odte/Odte.py index 4b2809f..875da85 100644 --- a/odte/Odte.py +++ b/odte/Odte.py @@ -16,6 +16,7 @@ check_classification_targets, ) from sklearn.base import clone, BaseEstimator, ClassifierMixin # type: ignore +from sklearn.utils import check_random_state from sklearn.ensemble import BaseEnsemble # type: ignore from sklearn.utils.validation import ( # type: ignore check_is_fitted, @@ -31,7 +32,6 @@ def _parallel_build_tree( X: np.ndarray, y: np.ndarray, weights: np.ndarray, - random_box: np.random.mtrand.RandomState, random_seed: int, boot_samples: int, max_features: int, @@ -43,6 +43,7 @@ def _parallel_build_tree( clf.set_params(**hyperparams_) n_samples = X.shape[0] # bootstrap + random_box = check_random_state(random_seed) indices = random_box.randint(0, n_samples, boot_samples) # update weights with the chosen samples weights_update = np.bincount(indices, minlength=n_samples) @@ -83,12 +84,6 @@ def __init__( def version() -> str: return __version__ - def _initialize_random(self) -> np.random.mtrand.RandomState: - if self.random_state is None: - self.random_state = random.randint(0, sys.maxsize) - return np.random.mtrand._rand - return np.random.RandomState(self.random_state) - def _validate_estimator(self) -> None: """Check the estimator and set the base_estimator_ 
attribute.""" super()._validate_estimator( @@ -141,7 +136,7 @@ def _compute_metrics(self) -> None: def _train( self, X: np.ndarray, y: np.ndarray, weights: np.ndarray ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]: - random_box = self._initialize_random() + # np.random.RandomState(seed) n_samples = X.shape[0] boot_samples = self._get_bootstrap_n_samples(n_samples) estimator = [] @@ -153,17 +148,13 @@ def _train( X, y, weights, - random_box, random_seed, boot_samples, self.max_features_, self.be_hyperparams, ) - for random_seed, i in zip( - range( - self.random_state, self.random_state + self.n_estimators - ), - range(self.n_estimators), + for i, random_seed in enumerate( + range(self.random_state, self.random_state + self.n_estimators) ) ) diff --git a/odte/tests/Odte_tests.py b/odte/tests/Odte_tests.py index 21abf07..9945016 100644 --- a/odte/tests/Odte_tests.py +++ b/odte/tests/Odte_tests.py @@ -54,20 +54,6 @@ def test_initialize_max_feature(self): self.assertListEqual(expected, list(computed)) # print(f"{list(computed)},") - def test_initialize_random(self): - expected = [37, 235, 908] - tclf = Odte(random_state=self._random_state) - box = tclf._initialize_random() - computed = box.randint(0, 1000, 3) - self.assertListEqual(expected, computed.tolist()) - # test None - tclf = Odte(random_state=None) - box = tclf._initialize_random() - computed = box.randint(101, 1000, 3) - for value in computed.tolist(): - self.assertGreaterEqual(value, 101) - self.assertLessEqual(value, 1000) - def test_bogus_max_features(self): values = ["duck", -0.1, 0.0] for max_features in values: @@ -124,7 +110,7 @@ def test_predict(self): def test_score(self): X, y = load_dataset(self._random_state) - expected = 0.9513333333333334 + expected = 0.9533333333333334 tclf = Odte( random_state=self._random_state, max_features=None, @@ -136,19 +122,18 @@ def test_score(self): def test_score_splitter_max_features(self): X, y = load_dataset(self._random_state, n_features=16, n_samples=500) 
results = [ - 0.948, - 0.924, - 0.926, - 0.94, - 0.932, - 0.936, - 0.962, - 0.962, - 0.962, - 0.962, - 0.962, - 0.962, - 0.962, + 0.958, # best auto + 0.942, # random auto + 0.932, # trandom auto + 0.95, # mutual auto + 0.944, # iwss auto + 0.946, # cfs auto + 0.97, # best None + 0.97, # random None + 0.97, # trandom None + 0.97, # mutual None + 0.97, # iwss None + 0.97, # cfs None ] random.seed(self._random_state) for max_features in ["auto", None]: @@ -208,15 +193,32 @@ def test_nodes_leaves_depth(self): base_estimator=Stree(), random_state=self._random_state, n_estimators=3, + n_jobs=1, + ) + X, y = load_dataset(self._random_state, n_features=16, n_samples=500) + tclf.fit(X, y) + self.assertAlmostEqual(6.333333333333333, tclf.depth_) + self.assertAlmostEqual(10.0, tclf.leaves_) + self.assertAlmostEqual(19.0, tclf.nodes_) + nodes, leaves = tclf.nodes_leaves() + self.assertAlmostEqual(10.0, leaves) + self.assertAlmostEqual(19, nodes) + + def test_nodes_leaves_depth_parallel(self): + tclf = Odte( + base_estimator=Stree(), + random_state=self._random_state, + n_estimators=3, + n_jobs=-1, ) X, y = load_dataset(self._random_state, n_features=16, n_samples=500) tclf.fit(X, y) - self.assertAlmostEqual(6.0, tclf.depth_) - self.assertAlmostEqual(9.333333333333334, tclf.leaves_) - self.assertAlmostEqual(17.666666666666668, tclf.nodes_) + self.assertAlmostEqual(6.333333333333333, tclf.depth_) + self.assertAlmostEqual(10.0, tclf.leaves_) + self.assertAlmostEqual(19.0, tclf.nodes_) nodes, leaves = tclf.nodes_leaves() - self.assertAlmostEqual(9.333333333333334, leaves) - self.assertAlmostEqual(17.666666666666668, nodes) + self.assertAlmostEqual(10.0, leaves) + self.assertAlmostEqual(19, nodes) def test_nodes_leaves_SVC(self): tclf = Odte( From 9e5fe8c79138de669e400d3cbb890ec3b4ca616a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Wed, 23 Feb 2022 12:10:12 +0100 Subject: [PATCH 3/5] Fix flake req. 
remove unneeded sys mod --- odte/Odte.py | 1 - 1 file changed, 1 deletion(-) diff --git a/odte/Odte.py b/odte/Odte.py index 875da85..a396da7 100644 --- a/odte/Odte.py +++ b/odte/Odte.py @@ -7,7 +7,6 @@ """ from __future__ import annotations import random -import sys import json from math import factorial from typing import Union, Optional, Tuple, List, Set From 877c24f3f4e528c419f7ddbb706ea91a87ce7ee2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Fri, 25 Feb 2022 19:24:44 +0100 Subject: [PATCH 4/5] fix rc1 --- odte/Odte.py | 71 +++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/odte/Odte.py b/odte/Odte.py index a396da7..8c9c059 100644 --- a/odte/Odte.py +++ b/odte/Odte.py @@ -26,35 +26,6 @@ from ._version import __version__ -def _parallel_build_tree( - base_estimator_: Stree, - X: np.ndarray, - y: np.ndarray, - weights: np.ndarray, - random_seed: int, - boot_samples: int, - max_features: int, - hyperparams: str, -) -> Tuple[BaseEstimator, Tuple[int, ...]]: - clf = base_estimator_ - hyperparams_ = json.loads(hyperparams) - hyperparams_.update(dict(random_state=random_seed)) - clf.set_params(**hyperparams_) - n_samples = X.shape[0] - # bootstrap - random_box = check_random_state(random_seed) - indices = random_box.randint(0, n_samples, boot_samples) - # update weights with the chosen samples - weights_update = np.bincount(indices, minlength=n_samples) - current_weights = weights * weights_update - # random subspace - features = Odte._get_random_subspace(X, y, max_features) - # train the classifier - bootstrap = X[indices, :] - clf.fit(bootstrap[:, features], y[indices], current_weights[indices]) - return (clf, features) - - class Odte(BaseEnsemble, ClassifierMixin): def __init__( self, @@ -135,15 +106,12 @@ def _compute_metrics(self) -> None: def _train( self, X: np.ndarray, y: np.ndarray, weights: np.ndarray ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]: - # 
np.random.RandomState(seed) n_samples = X.shape[0] boot_samples = self._get_bootstrap_n_samples(n_samples) - estimator = [] - for i in range(self.n_estimators): - estimator.append(clone(self.base_estimator_)) + estimator = clone(self.base_estimator_) return Parallel(n_jobs=self.n_jobs, prefer="threads")( # type: ignore - delayed(_parallel_build_tree)( - estimator[i], + delayed(Odte._parallel_build_tree)( + estimator, X, y, weights, @@ -152,11 +120,40 @@ def _train( self.max_features_, self.be_hyperparams, ) - for i, random_seed in enumerate( - range(self.random_state, self.random_state + self.n_estimators) + for random_seed in range( + self.random_state, self.random_state + self.n_estimators ) ) + @staticmethod + def _parallel_build_tree( + base_estimator_: BaseEstimator, + X: np.ndarray, + y: np.ndarray, + weights: np.ndarray, + random_seed: int, + boot_samples: int, + max_features: int, + hyperparams: str, + ) -> Tuple[BaseEstimator, Tuple[int, ...]]: + clf = clone(base_estimator_) + hyperparams_ = json.loads(hyperparams) + hyperparams_.update(dict(random_state=random_seed)) + clf.set_params(**hyperparams_) + n_samples = X.shape[0] + # bootstrap + random_box = check_random_state(random_seed) + indices = random_box.randint(0, n_samples, boot_samples) + # update weights with the chosen samples + weights_update = np.bincount(indices, minlength=n_samples) + current_weights = weights * weights_update + # random subspace + features = Odte._get_random_subspace(X, y, max_features) + # train the classifier + bootstrap = X[indices, :] + clf.fit(bootstrap[:, features], y[indices], current_weights[indices]) + return (clf, features) + def _get_bootstrap_n_samples(self, n_samples: int) -> int: if self.max_samples is None: return n_samples From dda3517090460f0f33730f40d49c3e76ec3884b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sat, 26 Feb 2022 11:30:12 +0100 Subject: [PATCH 5/5] merge two tests parallel-sequential --- odte/tests/Odte_tests.py | 
30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/odte/tests/Odte_tests.py b/odte/tests/Odte_tests.py index 9945016..01ca3bc 100644 --- a/odte/tests/Odte_tests.py +++ b/odte/tests/Odte_tests.py @@ -192,33 +192,25 @@ def test_nodes_leaves_depth(self): tclf = Odte( base_estimator=Stree(), random_state=self._random_state, - n_estimators=3, + n_estimators=5, n_jobs=1, ) - X, y = load_dataset(self._random_state, n_features=16, n_samples=500) - tclf.fit(X, y) - self.assertAlmostEqual(6.333333333333333, tclf.depth_) - self.assertAlmostEqual(10.0, tclf.leaves_) - self.assertAlmostEqual(19.0, tclf.nodes_) - nodes, leaves = tclf.nodes_leaves() - self.assertAlmostEqual(10.0, leaves) - self.assertAlmostEqual(19, nodes) - - def test_nodes_leaves_depth_parallel(self): - tclf = Odte( + tclf_p = Odte( base_estimator=Stree(), random_state=self._random_state, - n_estimators=3, + n_estimators=5, n_jobs=-1, ) X, y = load_dataset(self._random_state, n_features=16, n_samples=500) tclf.fit(X, y) - self.assertAlmostEqual(6.333333333333333, tclf.depth_) - self.assertAlmostEqual(10.0, tclf.leaves_) - self.assertAlmostEqual(19.0, tclf.nodes_) - nodes, leaves = tclf.nodes_leaves() - self.assertAlmostEqual(10.0, leaves) - self.assertAlmostEqual(19, nodes) + tclf_p.fit(X, y) + for clf in [tclf, tclf_p]: + self.assertAlmostEqual(5.8, clf.depth_) + self.assertAlmostEqual(9.4, clf.leaves_) + self.assertAlmostEqual(17.8, clf.nodes_) + nodes, leaves = clf.nodes_leaves() + self.assertAlmostEqual(9.4, leaves) + self.assertAlmostEqual(17.8, nodes) def test_nodes_leaves_SVC(self): tclf = Odte(