diff --git a/odte/Odte.py b/odte/Odte.py
index 036be0a..8c9c059 100644
--- a/odte/Odte.py
+++ b/odte/Odte.py
@@ -7,7 +7,6 @@
 """
 from __future__ import annotations
 import random
-import sys
 import json
 from math import factorial
 from typing import Union, Optional, Tuple, List, Set
@@ -16,6 +15,7 @@
     check_classification_targets,
 )
 from sklearn.base import clone, BaseEstimator, ClassifierMixin  # type: ignore
+from sklearn.utils import check_random_state
 from sklearn.ensemble import BaseEnsemble  # type: ignore
 from sklearn.utils.validation import (  # type: ignore
     check_is_fitted,
@@ -54,12 +54,6 @@ def __init__(
     def version() -> str:
         return __version__
 
-    def _initialize_random(self) -> np.random.mtrand.RandomState:
-        if self.random_state is None:
-            self.random_state = random.randint(0, sys.maxsize)
-            return np.random.mtrand._rand
-        return np.random.RandomState(self.random_state)
-
     def _validate_estimator(self) -> None:
         """Check the estimator and set the base_estimator_ attribute."""
         super()._validate_estimator(
@@ -109,13 +103,47 @@ def _compute_metrics(self) -> None:
         self.leaves_ = tleaves / self.n_estimators
         self.nodes_ = tnodes / self.n_estimators
 
+    def _train(
+        self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
+    ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
+        """Build the n_estimators members of the ensemble in parallel.
+
+        Every member gets its own integer seed: consecutive values starting
+        at random_state, or at a randomly drawn value if random_state is None.
+        """
+        n_samples = X.shape[0]
+        boot_samples = self._get_bootstrap_n_samples(n_samples)
+        estimator = clone(self.base_estimator_)
+        # random_state=None must stay usable: range() rejects None, so draw
+        # the first seed at random (kept inside the 32 bit range accepted by
+        # check_random_state)
+        first_seed = (
+            random.randint(0, 2**31 - 1)
+            if self.random_state is None
+            else self.random_state
+        )
+        return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
+            delayed(Odte._parallel_build_tree)(
+                estimator,
+                X,
+                y,
+                weights,
+                random_seed,
+                boot_samples,
+                self.max_features_,
+                self.be_hyperparams,
+            )
+            for random_seed in range(
+                first_seed, first_seed + self.n_estimators
+            )
+        )
+
     @staticmethod
     def _parallel_build_tree(
-        base_estimator_: Stree,
+        base_estimator_: BaseEstimator,
         X: np.ndarray,
         y: np.ndarray,
         weights: np.ndarray,
-        random_box: np.random.mtrand.RandomState,
         random_seed: int,
         boot_samples: int,
         max_features: int,
@@ -127,6 +155,7 @@ def _parallel_build_tree(
         clf.set_params(**hyperparams_)
         n_samples = X.shape[0]
         # bootstrap
+        random_box = check_random_state(random_seed)
         indices = random_box.randint(0, n_samples, boot_samples)
         # update weights with the chosen samples
         weights_update = np.bincount(indices, minlength=n_samples)
@@ -138,30 +167,6 @@ def _parallel_build_tree(
         clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
         return (clf, features)
 
-    def _train(
-        self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
-    ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
-        random_box = self._initialize_random()
-        n_samples = X.shape[0]
-        boot_samples = self._get_bootstrap_n_samples(n_samples)
-        clf = clone(self.base_estimator_)
-        return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
-            delayed(Odte._parallel_build_tree)(
-                clf,
-                X,
-                y,
-                weights,
-                random_box,
-                random_seed,
-                boot_samples,
-                self.max_features_,
-                self.be_hyperparams,
-            )
-            for random_seed in range(
-                self.random_state, self.random_state + self.n_estimators
-            )
-        )
-
     def _get_bootstrap_n_samples(self, n_samples: int) -> int:
         if self.max_samples is None:
             return n_samples
diff --git a/odte/_version.py b/odte/_version.py
index 260c070..f9aa3e1 100644
--- a/odte/_version.py
+++ b/odte/_version.py
@@ -1 +1 @@
-__version__ = "0.3.1"
+__version__ = "0.3.2"
diff --git a/odte/tests/Odte_tests.py b/odte/tests/Odte_tests.py
index f19f063..01ca3bc 100644
--- a/odte/tests/Odte_tests.py
+++ b/odte/tests/Odte_tests.py
@@ -54,20 +54,6 @@ def test_initialize_max_feature(self):
             self.assertListEqual(expected, list(computed))
             # print(f"{list(computed)},")
 
-    def test_initialize_random(self):
-        expected = [37, 235, 908]
-        tclf = Odte(random_state=self._random_state)
-        box = tclf._initialize_random()
-        computed = box.randint(0, 1000, 3)
-        self.assertListEqual(expected, computed.tolist())
-        # test None
-        tclf = Odte(random_state=None)
-        box = tclf._initialize_random()
-        computed = box.randint(101, 1000, 3)
-        for value in computed.tolist():
-            self.assertGreaterEqual(value, 101)
-            self.assertLessEqual(value, 1000)
-
     def test_bogus_max_features(self):
         values = ["duck", -0.1, 0.0]
         for max_features in values:
@@ -124,7 +110,7 @@ def test_predict(self):
 
     def test_score(self):
         X, y = load_dataset(self._random_state)
-        expected = 0.9513333333333334
+        expected = 0.9533333333333334
         tclf = Odte(
             random_state=self._random_state,
             max_features=None,
@@ -136,19 +122,18 @@ def test_score(self):
     def test_score_splitter_max_features(self):
         X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
         results = [
-            0.948,
-            0.924,
-            0.926,
-            0.94,
-            0.932,
-            0.936,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
+            0.958,  # best auto
+            0.942,  # random auto
+            0.932,  # trandom auto
+            0.95,  # mutual auto
+            0.944,  # iwss auto
+            0.946,  # cfs auto
+            0.97,  # best None
+            0.97,  # random None
+            0.97,  # trandom None
+            0.97,  # mutual None
+            0.97,  # iwss None
+            0.97,  # cfs None
         ]
         random.seed(self._random_state)
         for max_features in ["auto", None]:
@@ -207,16 +192,25 @@ def test_nodes_leaves_depth(self):
         tclf = Odte(
             base_estimator=Stree(),
             random_state=self._random_state,
-            n_estimators=3,
+            n_estimators=5,
+            n_jobs=1,
+        )
+        tclf_p = Odte(
+            base_estimator=Stree(),
+            random_state=self._random_state,
+            n_estimators=5,
+            n_jobs=-1,
         )
         X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
         tclf.fit(X, y)
-        self.assertAlmostEqual(6.0, tclf.depth_)
-        self.assertAlmostEqual(9.333333333333334, tclf.leaves_)
-        self.assertAlmostEqual(17.666666666666668, tclf.nodes_)
-        nodes, leaves = tclf.nodes_leaves()
-        self.assertAlmostEqual(9.333333333333334, leaves)
-        self.assertAlmostEqual(17.666666666666668, nodes)
+        tclf_p.fit(X, y)
+        for clf in [tclf, tclf_p]:
+            self.assertAlmostEqual(5.8, clf.depth_)
+            self.assertAlmostEqual(9.4, clf.leaves_)
+            self.assertAlmostEqual(17.8, clf.nodes_)
+            nodes, leaves = clf.nodes_leaves()
+            self.assertAlmostEqual(9.4, leaves)
+            self.assertAlmostEqual(17.8, nodes)
 
     def test_nodes_leaves_SVC(self):
         tclf = Odte(
@@ -257,3 +251,15 @@ def test_base_estimator_hyperparams(self):
     def test_version(self):
         tclf = Odte()
         self.assertEqual(__version__, tclf.version())
+
+    def test_parallel_score(self):
+        tclf_p = Odte(
+            n_jobs=-1, random_state=self._random_state, n_estimators=30
+        )
+        tclf_s = Odte(
+            n_jobs=1, random_state=self._random_state, n_estimators=30
+        )
+        X, y = load_dataset(self._random_state, n_features=56, n_samples=1500)
+        tclf_p.fit(X, y)
+        tclf_s.fit(X, y)
+        self.assertAlmostEqual(tclf_p.score(X, y), tclf_s.score(X, y))