Skip to content

Commit

Permalink
Merge pull request #6 from Doctorado-ML/parallel_init
Browse files Browse the repository at this point in the history
Parallel init error
  • Loading branch information
rmontanana authored Mar 2, 2022
2 parents aff96bb + dda3517 commit 98cadc7
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 69 deletions.
58 changes: 25 additions & 33 deletions odte/Odte.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
"""
from __future__ import annotations
import random
import sys
import json
from math import factorial
from typing import Union, Optional, Tuple, List, Set
Expand All @@ -16,6 +15,7 @@
check_classification_targets,
)
from sklearn.base import clone, BaseEstimator, ClassifierMixin # type: ignore
from sklearn.utils import check_random_state
from sklearn.ensemble import BaseEnsemble # type: ignore
from sklearn.utils.validation import ( # type: ignore
check_is_fitted,
Expand Down Expand Up @@ -54,12 +54,6 @@ def __init__(
def version() -> str:
return __version__

def _initialize_random(self) -> np.random.mtrand.RandomState:
    """Return the random generator matching ``self.random_state``.

    With an explicit ``random_state`` a fresh ``RandomState`` seeded
    with it is returned. With ``None`` the attribute is overwritten by
    a random integer in ``[0, sys.maxsize]`` and numpy's module-level
    generator is returned instead.
    """
    if self.random_state is not None:
        return np.random.RandomState(self.random_state)
    # no seed supplied: remember a concrete integer on the instance and
    # hand back numpy's shared global generator
    self.random_state = random.randint(0, sys.maxsize)
    return np.random.mtrand._rand

def _validate_estimator(self) -> None:
"""Check the estimator and set the base_estimator_ attribute."""
super()._validate_estimator(
Expand Down Expand Up @@ -109,13 +103,34 @@ def _compute_metrics(self) -> None:
self.leaves_ = tleaves / self.n_estimators
self.nodes_ = tnodes / self.n_estimators

def _train(
    self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
    """Build ``n_estimators`` trees through joblib, one seed per tree.

    Each job is a call to ``Odte._parallel_build_tree`` and receives
    its own integer seed, taken from ``n_estimators`` consecutive
    values starting at ``self.random_state``.

    Parameters
    ----------
    X : np.ndarray
        Training samples; ``X.shape[0]`` is used as the sample count.
    y : np.ndarray
        Targets, forwarded unchanged to every job.
    weights : np.ndarray
        Sample weights, forwarded unchanged to every job.

    Returns
    -------
    The value produced by the ``Parallel`` call: one
    ``_parallel_build_tree`` result per seed.
    """
    n_samples = X.shape[0]
    boot_samples = self._get_bootstrap_n_samples(n_samples)
    estimator = clone(self.base_estimator_)
    first_seed = self.random_state
    if first_seed is None:
        # range() needs an integer start; without this a None
        # random_state raises TypeError. The upper bound leaves room
        # for first_seed + n_estimators to stay a valid 32-bit seed.
        first_seed = random.randint(0, 2**31 - 1)
    return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
        delayed(Odte._parallel_build_tree)(
            estimator,
            X,
            y,
            weights,
            random_seed,
            boot_samples,
            self.max_features_,
            self.be_hyperparams,
        )
        for random_seed in range(
            first_seed, first_seed + self.n_estimators
        )
    )

@staticmethod
def _parallel_build_tree(
base_estimator_: Stree,
base_estimator_: BaseEstimator,
X: np.ndarray,
y: np.ndarray,
weights: np.ndarray,
random_box: np.random.mtrand.RandomState,
random_seed: int,
boot_samples: int,
max_features: int,
Expand All @@ -127,6 +142,7 @@ def _parallel_build_tree(
clf.set_params(**hyperparams_)
n_samples = X.shape[0]
# bootstrap
random_box = check_random_state(random_seed)
indices = random_box.randint(0, n_samples, boot_samples)
# update weights with the chosen samples
weights_update = np.bincount(indices, minlength=n_samples)
Expand All @@ -138,30 +154,6 @@ def _parallel_build_tree(
clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
return (clf, features)

def _train(
    self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
    """Build ``n_estimators`` trees through joblib with a shared generator.

    Every job is a call to ``Odte._parallel_build_tree`` and receives
    both the generator returned by ``_initialize_random`` and its own
    integer seed, taken from consecutive values starting at
    ``self.random_state``.
    """
    # must run before the seeds are computed: _initialize_random
    # replaces a None random_state with an integer
    random_box = self._initialize_random()
    boot_samples = self._get_bootstrap_n_samples(X.shape[0])
    template = clone(self.base_estimator_)
    runner = Parallel(n_jobs=self.n_jobs, prefer="threads")  # type: ignore
    seeds = range(self.random_state, self.random_state + self.n_estimators)
    build = delayed(Odte._parallel_build_tree)
    return runner(
        build(
            template,
            X,
            y,
            weights,
            random_box,
            random_seed,
            boot_samples,
            self.max_features_,
            self.be_hyperparams,
        )
        for random_seed in seeds
    )

def _get_bootstrap_n_samples(self, n_samples: int) -> int:
if self.max_samples is None:
return n_samples
Expand Down
2 changes: 1 addition & 1 deletion odte/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.3.1"
__version__ = "0.3.2"
76 changes: 41 additions & 35 deletions odte/tests/Odte_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,20 +54,6 @@ def test_initialize_max_feature(self):
self.assertListEqual(expected, list(computed))
# print(f"{list(computed)},")

def test_initialize_random(self):
    """Check the generator returned for a fixed seed and for ``None``."""
    seeded = Odte(random_state=self._random_state)
    drawn = seeded._initialize_random().randint(0, 1000, 3)
    self.assertListEqual([37, 235, 908], drawn.tolist())
    # test None
    unseeded = Odte(random_state=None)
    sample = unseeded._initialize_random().randint(101, 1000, 3)
    for value in sample.tolist():
        self.assertGreaterEqual(value, 101)
        self.assertLessEqual(value, 1000)

def test_bogus_max_features(self):
values = ["duck", -0.1, 0.0]
for max_features in values:
Expand Down Expand Up @@ -124,7 +110,7 @@ def test_predict(self):

def test_score(self):
X, y = load_dataset(self._random_state)
expected = 0.9513333333333334
expected = 0.9533333333333334
tclf = Odte(
random_state=self._random_state,
max_features=None,
Expand All @@ -136,19 +122,18 @@ def test_score(self):
def test_score_splitter_max_features(self):
X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
results = [
0.948,
0.924,
0.926,
0.94,
0.932,
0.936,
0.962,
0.962,
0.962,
0.962,
0.962,
0.962,
0.962,
0.958, # best auto
0.942, # random auto
0.932, # trandom auto
0.95, # mutual auto
0.944, # iwss auto
0.946, # cfs auto
0.97, # best None
0.97, # random None
0.97, # trandom None
0.97, # mutual None
0.97, # iwss None
0.97, # cfs None
]
random.seed(self._random_state)
for max_features in ["auto", None]:
Expand Down Expand Up @@ -207,16 +192,25 @@ def test_nodes_leaves_depth(self):
tclf = Odte(
base_estimator=Stree(),
random_state=self._random_state,
n_estimators=3,
n_estimators=5,
n_jobs=1,
)
tclf_p = Odte(
base_estimator=Stree(),
random_state=self._random_state,
n_estimators=5,
n_jobs=-1,
)
X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
tclf.fit(X, y)
self.assertAlmostEqual(6.0, tclf.depth_)
self.assertAlmostEqual(9.333333333333334, tclf.leaves_)
self.assertAlmostEqual(17.666666666666668, tclf.nodes_)
nodes, leaves = tclf.nodes_leaves()
self.assertAlmostEqual(9.333333333333334, leaves)
self.assertAlmostEqual(17.666666666666668, nodes)
tclf_p.fit(X, y)
for clf in [tclf, tclf_p]:
self.assertAlmostEqual(5.8, clf.depth_)
self.assertAlmostEqual(9.4, clf.leaves_)
self.assertAlmostEqual(17.8, clf.nodes_)
nodes, leaves = clf.nodes_leaves()
self.assertAlmostEqual(9.4, leaves)
self.assertAlmostEqual(17.8, nodes)

def test_nodes_leaves_SVC(self):
tclf = Odte(
Expand Down Expand Up @@ -257,3 +251,15 @@ def test_base_estimator_hyperparams(self):
def test_version(self):
    """The classifier must report the package ``__version__``."""
    self.assertEqual(__version__, Odte().version())

def test_parallel_score(self):
    """Fitting with all cores and with one core must score the same."""
    shared = dict(random_state=self._random_state, n_estimators=30)
    parallel_clf = Odte(n_jobs=-1, **shared)
    serial_clf = Odte(n_jobs=1, **shared)
    X, y = load_dataset(self._random_state, n_features=56, n_samples=1500)
    parallel_clf.fit(X, y)
    serial_clf.fit(X, y)
    parallel_score = parallel_clf.score(X, y)
    serial_score = serial_clf.score(X, y)
    self.assertAlmostEqual(parallel_score, serial_score)

0 comments on commit 98cadc7

Please sign in to comment.