From cd7c7f3938291991c23f90936570021eb2339452 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= <rmontanana@gmail.com>
Date: Tue, 22 Feb 2022 20:40:35 +0100
Subject: [PATCH 1/5] First try to fix initialization issue

---
 odte/Odte.py             | 73 +++++++++++++++++++++-------------------
 odte/_version.py         |  2 +-
 odte/tests/Odte_tests.py | 12 +++++++
 3 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/odte/Odte.py b/odte/Odte.py
index 036be0a..4b2809f 100644
--- a/odte/Odte.py
+++ b/odte/Odte.py
@@ -26,6 +26,35 @@
 from ._version import __version__
 
 
+def _parallel_build_tree(
+    base_estimator_: Stree,
+    X: np.ndarray,
+    y: np.ndarray,
+    weights: np.ndarray,
+    random_box: np.random.mtrand.RandomState,
+    random_seed: int,
+    boot_samples: int,
+    max_features: int,
+    hyperparams: str,
+) -> Tuple[BaseEstimator, Tuple[int, ...]]:
+    clf = base_estimator_
+    hyperparams_ = json.loads(hyperparams)
+    hyperparams_.update(dict(random_state=random_seed))
+    clf.set_params(**hyperparams_)
+    n_samples = X.shape[0]
+    # bootstrap
+    indices = random_box.randint(0, n_samples, boot_samples)
+    # update weights with the chosen samples
+    weights_update = np.bincount(indices, minlength=n_samples)
+    current_weights = weights * weights_update
+    # random subspace
+    features = Odte._get_random_subspace(X, y, max_features)
+    # train the classifier
+    bootstrap = X[indices, :]
+    clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
+    return (clf, features)
+
+
 class Odte(BaseEnsemble, ClassifierMixin):
     def __init__(
         self,
@@ -109,45 +138,18 @@ def _compute_metrics(self) -> None:
         self.leaves_ = tleaves / self.n_estimators
         self.nodes_ = tnodes / self.n_estimators
 
-    @staticmethod
-    def _parallel_build_tree(
-        base_estimator_: Stree,
-        X: np.ndarray,
-        y: np.ndarray,
-        weights: np.ndarray,
-        random_box: np.random.mtrand.RandomState,
-        random_seed: int,
-        boot_samples: int,
-        max_features: int,
-        hyperparams: str,
-    ) -> Tuple[BaseEstimator, Tuple[int, ...]]:
-        clf = clone(base_estimator_)
-        hyperparams_ = json.loads(hyperparams)
-        hyperparams_.update(dict(random_state=random_seed))
-        clf.set_params(**hyperparams_)
-        n_samples = X.shape[0]
-        # bootstrap
-        indices = random_box.randint(0, n_samples, boot_samples)
-        # update weights with the chosen samples
-        weights_update = np.bincount(indices, minlength=n_samples)
-        current_weights = weights * weights_update
-        # random subspace
-        features = Odte._get_random_subspace(X, y, max_features)
-        # train the classifier
-        bootstrap = X[indices, :]
-        clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
-        return (clf, features)
-
     def _train(
         self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
     ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
         random_box = self._initialize_random()
         n_samples = X.shape[0]
         boot_samples = self._get_bootstrap_n_samples(n_samples)
-        clf = clone(self.base_estimator_)
+        estimator = []
+        for i in range(self.n_estimators):
+            estimator.append(clone(self.base_estimator_))
         return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
-            delayed(Odte._parallel_build_tree)(
-                clf,
+            delayed(_parallel_build_tree)(
+                estimator[i],
                 X,
                 y,
                 weights,
@@ -157,8 +159,11 @@ def _train(
                 self.max_features_,
                 self.be_hyperparams,
             )
-            for random_seed in range(
-                self.random_state, self.random_state + self.n_estimators
+            for random_seed, i in zip(
+                range(
+                    self.random_state, self.random_state + self.n_estimators
+                ),
+                range(self.n_estimators),
             )
         )
 
diff --git a/odte/_version.py b/odte/_version.py
index 260c070..f9aa3e1 100644
--- a/odte/_version.py
+++ b/odte/_version.py
@@ -1 +1 @@
-__version__ = "0.3.1"
+__version__ = "0.3.2"
diff --git a/odte/tests/Odte_tests.py b/odte/tests/Odte_tests.py
index f19f063..21abf07 100644
--- a/odte/tests/Odte_tests.py
+++ b/odte/tests/Odte_tests.py
@@ -257,3 +257,15 @@ def test_base_estimator_hyperparams(self):
     def test_version(self):
         tclf = Odte()
         self.assertEqual(__version__, tclf.version())
+
+    def test_parallel_score(self):
+        tclf_p = Odte(
+            n_jobs=-1, random_state=self._random_state, n_estimators=30
+        )
+        tclf_s = Odte(
+            n_jobs=1, random_state=self._random_state, n_estimators=30
+        )
+        X, y = load_dataset(self._random_state, n_features=56, n_samples=1500)
+        tclf_p.fit(X, y)
+        tclf_s.fit(X, y)
+        self.assertAlmostEqual(tclf_p.score(X, y), tclf_s.score(X, y))

From 3766886190396cf6916fa8ba690ba26f79136a03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= <rmontanana@gmail.com>
Date: Wed, 23 Feb 2022 12:02:59 +0100
Subject: [PATCH 2/5] Fix np.random initialization

---
 odte/Odte.py             | 19 +++--------
 odte/tests/Odte_tests.py | 68 +++++++++++++++++++++-------------------
 2 files changed, 40 insertions(+), 47 deletions(-)

diff --git a/odte/Odte.py b/odte/Odte.py
index 4b2809f..875da85 100644
--- a/odte/Odte.py
+++ b/odte/Odte.py
@@ -16,6 +16,7 @@
     check_classification_targets,
 )
 from sklearn.base import clone, BaseEstimator, ClassifierMixin  # type: ignore
+from sklearn.utils import check_random_state
 from sklearn.ensemble import BaseEnsemble  # type: ignore
 from sklearn.utils.validation import (  # type: ignore
     check_is_fitted,
@@ -31,7 +32,6 @@ def _parallel_build_tree(
     X: np.ndarray,
     y: np.ndarray,
     weights: np.ndarray,
-    random_box: np.random.mtrand.RandomState,
     random_seed: int,
     boot_samples: int,
     max_features: int,
@@ -43,6 +43,7 @@ def _parallel_build_tree(
     clf.set_params(**hyperparams_)
     n_samples = X.shape[0]
     # bootstrap
+    random_box = check_random_state(random_seed)
     indices = random_box.randint(0, n_samples, boot_samples)
     # update weights with the chosen samples
     weights_update = np.bincount(indices, minlength=n_samples)
@@ -83,12 +84,6 @@ def __init__(
     def version() -> str:
         return __version__
 
-    def _initialize_random(self) -> np.random.mtrand.RandomState:
-        if self.random_state is None:
-            self.random_state = random.randint(0, sys.maxsize)
-            return np.random.mtrand._rand
-        return np.random.RandomState(self.random_state)
-
     def _validate_estimator(self) -> None:
         """Check the estimator and set the base_estimator_ attribute."""
         super()._validate_estimator(
@@ -141,7 +136,7 @@ def _compute_metrics(self) -> None:
     def _train(
         self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
     ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
-        random_box = self._initialize_random()
+        # np.random.RandomState(seed)
         n_samples = X.shape[0]
         boot_samples = self._get_bootstrap_n_samples(n_samples)
         estimator = []
@@ -153,17 +148,13 @@ def _train(
                 X,
                 y,
                 weights,
-                random_box,
                 random_seed,
                 boot_samples,
                 self.max_features_,
                 self.be_hyperparams,
             )
-            for random_seed, i in zip(
-                range(
-                    self.random_state, self.random_state + self.n_estimators
-                ),
-                range(self.n_estimators),
+            for i, random_seed in enumerate(
+                range(self.random_state, self.random_state + self.n_estimators)
             )
         )
 
diff --git a/odte/tests/Odte_tests.py b/odte/tests/Odte_tests.py
index 21abf07..9945016 100644
--- a/odte/tests/Odte_tests.py
+++ b/odte/tests/Odte_tests.py
@@ -54,20 +54,6 @@ def test_initialize_max_feature(self):
             self.assertListEqual(expected, list(computed))
             # print(f"{list(computed)},")
 
-    def test_initialize_random(self):
-        expected = [37, 235, 908]
-        tclf = Odte(random_state=self._random_state)
-        box = tclf._initialize_random()
-        computed = box.randint(0, 1000, 3)
-        self.assertListEqual(expected, computed.tolist())
-        # test None
-        tclf = Odte(random_state=None)
-        box = tclf._initialize_random()
-        computed = box.randint(101, 1000, 3)
-        for value in computed.tolist():
-            self.assertGreaterEqual(value, 101)
-            self.assertLessEqual(value, 1000)
-
     def test_bogus_max_features(self):
         values = ["duck", -0.1, 0.0]
         for max_features in values:
@@ -124,7 +110,7 @@ def test_predict(self):
 
     def test_score(self):
         X, y = load_dataset(self._random_state)
-        expected = 0.9513333333333334
+        expected = 0.9533333333333334
         tclf = Odte(
             random_state=self._random_state,
             max_features=None,
@@ -136,19 +122,18 @@ def test_score(self):
     def test_score_splitter_max_features(self):
         X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
         results = [
-            0.948,
-            0.924,
-            0.926,
-            0.94,
-            0.932,
-            0.936,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
+            0.958,  # best auto
+            0.942,  # random auto
+            0.932,  # trandom auto
+            0.95,  # mutual auto
+            0.944,  # iwss auto
+            0.946,  # cfs auto
+            0.97,  # best None
+            0.97,  # random None
+            0.97,  # trandom None
+            0.97,  # mutual None
+            0.97,  # iwss None
+            0.97,  # cfs None
         ]
         random.seed(self._random_state)
         for max_features in ["auto", None]:
@@ -208,15 +193,32 @@ def test_nodes_leaves_depth(self):
             base_estimator=Stree(),
             random_state=self._random_state,
             n_estimators=3,
+            n_jobs=1,
+        )
+        X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
+        tclf.fit(X, y)
+        self.assertAlmostEqual(6.333333333333333, tclf.depth_)
+        self.assertAlmostEqual(10.0, tclf.leaves_)
+        self.assertAlmostEqual(19.0, tclf.nodes_)
+        nodes, leaves = tclf.nodes_leaves()
+        self.assertAlmostEqual(10.0, leaves)
+        self.assertAlmostEqual(19, nodes)
+
+    def test_nodes_leaves_depth_parallel(self):
+        tclf = Odte(
+            base_estimator=Stree(),
+            random_state=self._random_state,
+            n_estimators=3,
+            n_jobs=-1,
         )
         X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
         tclf.fit(X, y)
-        self.assertAlmostEqual(6.0, tclf.depth_)
-        self.assertAlmostEqual(9.333333333333334, tclf.leaves_)
-        self.assertAlmostEqual(17.666666666666668, tclf.nodes_)
+        self.assertAlmostEqual(6.333333333333333, tclf.depth_)
+        self.assertAlmostEqual(10.0, tclf.leaves_)
+        self.assertAlmostEqual(19.0, tclf.nodes_)
         nodes, leaves = tclf.nodes_leaves()
-        self.assertAlmostEqual(9.333333333333334, leaves)
-        self.assertAlmostEqual(17.666666666666668, nodes)
+        self.assertAlmostEqual(10.0, leaves)
+        self.assertAlmostEqual(19, nodes)
 
     def test_nodes_leaves_SVC(self):
         tclf = Odte(

From 9e5fe8c79138de669e400d3cbb890ec3b4ca616a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= <rmontanana@gmail.com>
Date: Wed, 23 Feb 2022 12:10:12 +0100
Subject: [PATCH 3/5] Fix flake requirement: remove unneeded sys import

---
 odte/Odte.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/odte/Odte.py b/odte/Odte.py
index 875da85..a396da7 100644
--- a/odte/Odte.py
+++ b/odte/Odte.py
@@ -7,7 +7,6 @@
 """
 from __future__ import annotations
 import random
-import sys
 import json
 from math import factorial
 from typing import Union, Optional, Tuple, List, Set

From 877c24f3f4e528c419f7ddbb706ea91a87ce7ee2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= <rmontanana@gmail.com>
Date: Fri, 25 Feb 2022 19:24:44 +0100
Subject: [PATCH 4/5] Fix rc1: restore static _parallel_build_tree, clone inside it

---
 odte/Odte.py | 71 +++++++++++++++++++++++++---------------------------
 1 file changed, 34 insertions(+), 37 deletions(-)

diff --git a/odte/Odte.py b/odte/Odte.py
index a396da7..8c9c059 100644
--- a/odte/Odte.py
+++ b/odte/Odte.py
@@ -26,35 +26,6 @@
 from ._version import __version__
 
 
-def _parallel_build_tree(
-    base_estimator_: Stree,
-    X: np.ndarray,
-    y: np.ndarray,
-    weights: np.ndarray,
-    random_seed: int,
-    boot_samples: int,
-    max_features: int,
-    hyperparams: str,
-) -> Tuple[BaseEstimator, Tuple[int, ...]]:
-    clf = base_estimator_
-    hyperparams_ = json.loads(hyperparams)
-    hyperparams_.update(dict(random_state=random_seed))
-    clf.set_params(**hyperparams_)
-    n_samples = X.shape[0]
-    # bootstrap
-    random_box = check_random_state(random_seed)
-    indices = random_box.randint(0, n_samples, boot_samples)
-    # update weights with the chosen samples
-    weights_update = np.bincount(indices, minlength=n_samples)
-    current_weights = weights * weights_update
-    # random subspace
-    features = Odte._get_random_subspace(X, y, max_features)
-    # train the classifier
-    bootstrap = X[indices, :]
-    clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
-    return (clf, features)
-
-
 class Odte(BaseEnsemble, ClassifierMixin):
     def __init__(
         self,
@@ -135,15 +106,12 @@ def _compute_metrics(self) -> None:
     def _train(
         self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
     ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
-        # np.random.RandomState(seed)
         n_samples = X.shape[0]
         boot_samples = self._get_bootstrap_n_samples(n_samples)
-        estimator = []
-        for i in range(self.n_estimators):
-            estimator.append(clone(self.base_estimator_))
+        estimator = clone(self.base_estimator_)
         return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
-            delayed(_parallel_build_tree)(
-                estimator[i],
+            delayed(Odte._parallel_build_tree)(
+                estimator,
                 X,
                 y,
                 weights,
@@ -152,11 +120,40 @@ def _train(
                 self.max_features_,
                 self.be_hyperparams,
             )
-            for i, random_seed in enumerate(
-                range(self.random_state, self.random_state + self.n_estimators)
+            for random_seed in range(
+                self.random_state, self.random_state + self.n_estimators
             )
         )
 
+    @staticmethod
+    def _parallel_build_tree(
+        base_estimator_: BaseEstimator,
+        X: np.ndarray,
+        y: np.ndarray,
+        weights: np.ndarray,
+        random_seed: int,
+        boot_samples: int,
+        max_features: int,
+        hyperparams: str,
+    ) -> Tuple[BaseEstimator, Tuple[int, ...]]:
+        clf = clone(base_estimator_)
+        hyperparams_ = json.loads(hyperparams)
+        hyperparams_.update(dict(random_state=random_seed))
+        clf.set_params(**hyperparams_)
+        n_samples = X.shape[0]
+        # bootstrap
+        random_box = check_random_state(random_seed)
+        indices = random_box.randint(0, n_samples, boot_samples)
+        # update weights with the chosen samples
+        weights_update = np.bincount(indices, minlength=n_samples)
+        current_weights = weights * weights_update
+        # random subspace
+        features = Odte._get_random_subspace(X, y, max_features)
+        # train the classifier
+        bootstrap = X[indices, :]
+        clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
+        return (clf, features)
+
     def _get_bootstrap_n_samples(self, n_samples: int) -> int:
         if self.max_samples is None:
             return n_samples

From dda3517090460f0f33730f40d49c3e76ec3884b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= <rmontanana@gmail.com>
Date: Sat, 26 Feb 2022 11:30:12 +0100
Subject: [PATCH 5/5] Merge the parallel and sequential nodes/leaves/depth tests

---
 odte/tests/Odte_tests.py | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/odte/tests/Odte_tests.py b/odte/tests/Odte_tests.py
index 9945016..01ca3bc 100644
--- a/odte/tests/Odte_tests.py
+++ b/odte/tests/Odte_tests.py
@@ -192,33 +192,25 @@ def test_nodes_leaves_depth(self):
         tclf = Odte(
             base_estimator=Stree(),
             random_state=self._random_state,
-            n_estimators=3,
+            n_estimators=5,
             n_jobs=1,
         )
-        X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
-        tclf.fit(X, y)
-        self.assertAlmostEqual(6.333333333333333, tclf.depth_)
-        self.assertAlmostEqual(10.0, tclf.leaves_)
-        self.assertAlmostEqual(19.0, tclf.nodes_)
-        nodes, leaves = tclf.nodes_leaves()
-        self.assertAlmostEqual(10.0, leaves)
-        self.assertAlmostEqual(19, nodes)
-
-    def test_nodes_leaves_depth_parallel(self):
-        tclf = Odte(
+        tclf_p = Odte(
             base_estimator=Stree(),
             random_state=self._random_state,
-            n_estimators=3,
+            n_estimators=5,
             n_jobs=-1,
         )
         X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
         tclf.fit(X, y)
-        self.assertAlmostEqual(6.333333333333333, tclf.depth_)
-        self.assertAlmostEqual(10.0, tclf.leaves_)
-        self.assertAlmostEqual(19.0, tclf.nodes_)
-        nodes, leaves = tclf.nodes_leaves()
-        self.assertAlmostEqual(10.0, leaves)
-        self.assertAlmostEqual(19, nodes)
+        tclf_p.fit(X, y)
+        for clf in [tclf, tclf_p]:
+            self.assertAlmostEqual(5.8, clf.depth_)
+            self.assertAlmostEqual(9.4, clf.leaves_)
+            self.assertAlmostEqual(17.8, clf.nodes_)
+            nodes, leaves = clf.nodes_leaves()
+            self.assertAlmostEqual(9.4, leaves)
+            self.assertAlmostEqual(17.8, nodes)
 
     def test_nodes_leaves_SVC(self):
         tclf = Odte(