From 064ffa4077803ec6f4bfdc9e40dd96d737fd17a2 Mon Sep 17 00:00:00 2001
From: Gareth Jones
Date: Fri, 22 May 2020 18:42:32 +0100
Subject: [PATCH] V0.4 (#7)

* Typo
* General tidy
* Address sonar issues
* Address sonar issues
* Update requirements
* General tidy
---
 README.md | 13 +-
 incremental_trees/__init__.py | 2 +-
 incremental_trees/add_ins/__init__.py | 0
 .../add_ins/classifier_additions.py | 27 +
 .../add_ins/classifier_overloads.py | 65 ++
 incremental_trees/add_ins/forest_additions.py | 83 +++
 incremental_trees/add_ins/forest_overloads.py | 46 ++
 .../add_ins/regressor_additions.py | 9 +
 .../add_ins/regressor_overloads.py | 8 +
 .../add_ins/sklearn_overloads.py | 46 ++
 incremental_trees/models/__init__.py | 0
 .../models/classification/__init__.py | 0
 .../models/classification/streaming_extc.py | 74 +++
 .../models/classification/streaming_rfc.py | 92 +++
 .../models/regression/__init__.py | 0
 .../models/regression/streaming_extr.py | 65 ++
 .../models/regression/streaming_rfr.py | 65 ++
 incremental_trees/trees.py | 565 +-----------------
 requirements.txt | 13 +-
 setup.py | 6 +-
 tests/common/__init__.py | 0
 tests/{data.py => common/data_fixture.py} | 5 +-
 tests/{params.py => common/param_fixtures.py} | 0
 tests/integration/base.py | 3 +-
 .../incremental_trees/test_trees.py | 103 ++--
 .../test_trees_benchmarks.py | 36 +-
 .../incremental_trees/test_trees_dask.py | 37 +-
 .../incremental_trees/test_trees_grids.py | 178 +++---
 .../test_trees_inconsistent_classes.py | 189 +++---
 29 files changed, 895 insertions(+), 835 deletions(-)
 create mode 100644 incremental_trees/add_ins/__init__.py
 create mode 100644 incremental_trees/add_ins/classifier_additions.py
 create mode 100644 incremental_trees/add_ins/classifier_overloads.py
 create mode 100644 incremental_trees/add_ins/forest_additions.py
 create mode 100644 incremental_trees/add_ins/forest_overloads.py
 create mode 100644 incremental_trees/add_ins/regressor_additions.py
 create mode 100644 incremental_trees/add_ins/regressor_overloads.py
 create mode 100644 incremental_trees/add_ins/sklearn_overloads.py
 create mode 100644 incremental_trees/models/__init__.py
 create mode 100644 incremental_trees/models/classification/__init__.py
 create mode 100644 incremental_trees/models/classification/streaming_extc.py
 create mode 100644 incremental_trees/models/classification/streaming_rfc.py
 create mode 100644 incremental_trees/models/regression/__init__.py
 create mode 100644 incremental_trees/models/regression/streaming_extr.py
 create mode 100644 incremental_trees/models/regression/streaming_rfr.py
 create mode 100644 tests/common/__init__.py
 rename tests/{data.py => common/data_fixture.py} (98%)
 rename tests/{params.py => common/param_fixtures.py} (100%)

diff --git a/README.md b/README.md
index 5d5f773..7fa730a 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Incremental trees v0.3.3
+# Incremental trees v0.4.0
 ![The overcomplicated tests are...](https://github.com/garethjns/IncrementalTrees/workflows/The%20overcomplicated%20tests%20are.../badge.svg)

 Adds a partial fit method to sklearn's forest estimators (currently RandomForestClassifier/Regressor and ExtraTreesClassifier/Regressor) to allow [incremental training](https://scikit-learn.org/0.15/modules/scaling_strategies.html) without being limited to a linear model. Works with or without [Dask-ml's Incremental](http://ml.dask.org/incremental.html).

 Quick start:

 1) Install from PyPI.
 ````bash
- git clone https://github.com/garethjns/IncrementalTrees.git
- python -m pip install --upgrade pip setuptools wheel
- cd IncrementalTrees
- pip install .
+ pip install incremental_trees
 ````
@@ -39,7 +36,7 @@ Feeds .partial_fit() with randomly sampled rows.
 ````python
 import numpy as np
 from sklearn.datasets import make_blobs
-from incremental_trees.trees import StreamingRFC
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC

 # Generate some data in memory
 x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40,
@@ -65,7 +62,7 @@ import dask_ml.datasets
 from dask_ml.wrappers import Incremental
 from dask.distributed import Client, LocalCluster
 from dask import delayed
-from incremental_trees.trees import StreamingRFC
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC

 # Generate some data out-of-core
 x, y = dask_ml.datasets.make_blobs(n_samples=2e5, chunks=1e4, random_state=0,
@@ -138,6 +135,8 @@ srfc = StreamingRFC(n_estimators_per_chunk=1,
 ````

 # Version history
+## v0.4
+ - Refactor and tidy, try with new versions of Dask/sklearn
 ## v0.3.1-3
 - Update Dask versions
 ## v0.3
diff --git a/incremental_trees/__init__.py b/incremental_trees/__init__.py
index 03174f4..222c11c 100644
--- a/incremental_trees/__init__.py
+++ b/incremental_trees/__init__.py
@@ -1 +1 @@
-__version__ = '0.3.3'
\ No newline at end of file
+__version__ = '0.4.0'
\ No newline at end of file
diff --git a/incremental_trees/add_ins/__init__.py b/incremental_trees/add_ins/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/incremental_trees/add_ins/classifier_additions.py b/incremental_trees/add_ins/classifier_additions.py
new file mode 100644
index 0000000..b6f0661
--- /dev/null
+++ b/incremental_trees/add_ins/classifier_additions.py
@@ -0,0 +1,27 @@
+from typing import List
+
+import numpy as np
+
+from incremental_trees.add_ins.forest_additions import ForestAdditions
+from incremental_trees.add_ins.sklearn_overloads import _check_partial_fit_first_call
+
+
+class ClassifierAdditions(ForestAdditions):
+    """
+    Additional functions specific to classifiers.
+    """
+
+    def _check_classes(self, classes: List[int]):
+        """Set classes if they haven't been set yet, otherwise do nothing."""
+
+        # Set classes for the forest (this only needs to be done once, not for each individual tree -
+        # the trees' classes_ are set by .fit() using the classes available in each subset).
+        # Checks classes_ is set or provided; returns False if there's nothing to do.
+        classes_need_setting = _check_partial_fit_first_call(self, classes)
+
+        # If the classes aren't set yet, set them.
+        # The call above will error if they're not set and classes=None.
+        if classes_need_setting:
+            self.classes_ = np.array(classes)
+            self.n_classes_ = len(classes)
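The effect of `_check_classes` is easiest to see from the outside: the forest-level `classes_` is pinned by the first `partial_fit` call, while each underlying tree only knows the labels present in its own chunk. A minimal sketch, using the StreamingRFC this patch adds further down (illustrative values only):

````python
import numpy as np
from sklearn.datasets import make_blobs
from incremental_trees.models.classification.streaming_rfc import StreamingRFC

x, y = make_blobs(n_samples=1000, centers=3, n_features=10, random_state=0)

srfc = StreamingRFC(n_estimators_per_chunk=1, max_n_estimators=np.inf)

# The first chunk deliberately contains only classes 0 and 1; passing the
# full class list up front pins the forest's output shape to all 3 classes.
first_chunk = np.isin(y, [0, 1])
srfc.partial_fit(x[first_chunk], y[first_chunk], classes=[0, 1, 2])

print(srfc.classes_)                 # [0 1 2] - forest level, fixed once
print(srfc.estimators_[0].classes_)  # [0 1] - tree level, from its own chunk
print(srfc.predict_proba(x).shape)   # (1000, 3)
````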
diff --git a/incremental_trees/add_ins/classifier_overloads.py b/incremental_trees/add_ins/classifier_overloads.py
new file mode 100644
index 0000000..eb0920d
--- /dev/null
+++ b/incremental_trees/add_ins/classifier_overloads.py
@@ -0,0 +1,65 @@
+import warnings
+from typing import Union
+
+import numpy as np
+import pandas as pd
+
+from incremental_trees.add_ins.forest_overloads import ForestOverloads
+
+
+class ClassifierOverloads(ForestOverloads):
+    """
+    Overloaded methods specific to classifiers.
+    """
+
+    def predict_proba(self, x: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
+        """
+        Call predict_proba on each tree and accumulate. This handles possibly inconsistent shapes, but isn't
+        parallel.
+
+        Cases where not all classes are present in the first or subsequent subsets need to be
+        handled. For the RandomForestClassifier, tree predictions are averaged in the
+        sklearn.ensemble.forest.accumulate_prediction function. This sums the output matrix with dimensions
+        n rows x n classes and fails if the class dimension differs.
+        The class dimension is defined at the individual estimator level during the .fit() call, which sets the
+        following attributes:
+          - self.n_outputs_ = y.shape[1], which is then used by _validate_y_class_weight(), always called in .fit(),
+            to set:
+            - self.classes_
+            - self.n_classes_
+
+        The .predict() method (sklearn.tree.tree.BaseDecisionTree.predict()) sets the output shape using:
+            # Classification
+            if is_classifier(self):
+                if self.n_outputs_ == 1:
+                    return self.classes_.take(np.argmax(proba, axis=1), axis=0)
+                else:
+                    [Not considering this yet]
+
+        :param x: Data to predict on.
+        :return: Normalised probability predictions with shape (n rows, n classes).
+        """
+        # Prepare expected output shape
+        preds = np.zeros(shape=(x.shape[0], self.n_classes_),
+                         dtype=np.float32)
+        counts = np.zeros(shape=(x.shape[0], self.n_classes_),
+                          dtype=np.int16)
+
+        for e in self.estimators_:
+            # Get the prediction from the tree
+            est_preds = e.predict_proba(x)
+            # Get the indexes of the classes present
+            present_classes = e.classes_.astype(int)
+            # Sum these into the correct array columns
+            preds[:, present_classes] += est_preds
+            counts[:, present_classes] += 1
+
+        # Normalise predictions against counts
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", RuntimeWarning)
+            norm_prob = preds / counts
+
+        # And remove nans (0/0) and infs (n/0)
+        norm_prob[np.isnan(norm_prob) | np.isinf(norm_prob)] = 0
+
+        return norm_prob
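The accumulation logic above can be demonstrated standalone with plain numpy. Here two hypothetical estimator outputs (stand-ins for the real trees) that saw different class subsets are summed into a fixed three-class output, with the never-seen class falling back to zero:

````python
import numpy as np

n_rows, n_classes = 4, 3
preds = np.zeros(shape=(n_rows, n_classes), dtype=np.float32)
counts = np.zeros(shape=(n_rows, n_classes), dtype=np.int16)

# (classes present in the estimator, its predict_proba output) pairs.
est_outputs = [(np.array([0, 1]), np.full((n_rows, 2), 0.5)),  # saw classes 0 and 1
               (np.array([1]), np.ones((n_rows, 1)))]          # only ever saw class 1

for present_classes, est_preds in est_outputs:
    preds[:, present_classes] += est_preds  # columns aligned by class label
    counts[:, present_classes] += 1

with np.errstate(divide='ignore', invalid='ignore'):
    norm_prob = preds / counts  # class 2 was never seen: 0/0 -> nan

norm_prob[np.isnan(norm_prob) | np.isinf(norm_prob)] = 0
print(norm_prob)  # every row is [0.5, 0.75, 0.0]
````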
diff --git a/incremental_trees/add_ins/forest_additions.py b/incremental_trees/add_ins/forest_additions.py
new file mode 100644
index 0000000..e0a49f4
--- /dev/null
+++ b/incremental_trees/add_ins/forest_additions.py
@@ -0,0 +1,83 @@
+import time
+from typing import Union
+
+import numpy as np
+import pandas as pd
+
+
+class ForestAdditions:
+    def partial_fit(self, X: Union[np.array, pd.DataFrame], y: Union[np.array, pd.Series],
+                    classes: Union[list, np.ndarray] = None):
+        """
+        Fit a single DTC using the given subset of x and y.
+
+        This calls .fit(), which is overloaded, but flags pf_call=True so that .fit() handles calling super().fit().
+
+        For classifiers:
+          - The first call needs to be supplied with the expected classes (similar to existing models with
+            .partial_fit()) in case not all classes are present in the first subset.
+
+        This object sets classes_ and n_classes_ depending on the supplied classes. The individual trees set theirs
+        depending on the data available in the subset. The predict_proba method is modified to standardise shape to
+        the dimensions defined in this object.
+
+        For regressors:
+          - self._check_classes is overloaded with a dummy method.
+
+        :param X: Data subset to fit on.
+        :param y: Labels for the subset.
+        :param classes: Expected classes across all data; required on the first call for classifiers.
+        :return:
+        """
+        if self.verbose > 1:
+            print(f"PF Call with set classes: "
+                  f"{getattr(self, 'classes_', '[no classes attr]')} and input classes {classes}")
+
+        self._check_classes(classes=classes)
+
+        # Fit the next estimator, if not done
+        if self._fit_estimators < self.max_n_estimators:
+            t0 = time.time()
+            self.fit(X, y,
+                     pf_call=True,
+                     classes_=getattr(self, 'classes_', None))  # Pass classes for enforcement, if classifier.
+            t1 = time.time()
+
+            if self.verbose > 1:
+                print(f"Fit estimators {self._fit_estimators} - {self._fit_estimators + self.n_estimators_per_chunk} "
+                      f"/ {self.max_n_estimators}")
+                print(f"Model reports {len(self.estimators_)}")
+                print(f"Fit time: {round(t1 - t0, 2)}")
+                print(len(self.estimators_))
+            self._fit_estimators += self.n_estimators_per_chunk
+
+            # If still not done, prep to fit next
+            if self._fit_estimators < self.max_n_estimators:
+                self.n_estimators += self.n_estimators_per_chunk
+
+        else:
+            if self.verbose > 0:
+                print('Done')
+
+        return self
+
+    def _sampled_partial_fit(self,
+                             x: Union[np.array, pd.DataFrame], y: Union[np.ndarray, pd.Series]):
+        """
+        This feeds partial_fit with random samples based on the spf_ parameters. Used by .fit() when not using dask.
+
+        :param x: Data.
+        :param y: Labels.
+        :return:
+        """
+
+        n_samples = int(self.spf_sample_prop * x.shape[0])
+
+        for _ in range(self.spf_n_fits):
+            idx = np.random.randint(0, x.shape[0], n_samples)
+
+            if self.verbose > 0:
+                print(f"_sampled_partial_fit size: {idx.shape}")
+
+            self.partial_fit(x[idx, :], y[idx],
+                             classes=np.unique(y))
+
+        return self
diff --git a/incremental_trees/add_ins/forest_overloads.py b/incremental_trees/add_ins/forest_overloads.py
new file mode 100644
index 0000000..e746fee
--- /dev/null
+++ b/incremental_trees/add_ins/forest_overloads.py
@@ -0,0 +1,46 @@
+import numpy as np
+
+
+class ForestOverloads:
+    def set_params(self,
+                   **kwargs):
+        """
+        Ensure warm_start is set to True, then set the other params as usual.
+
+        :param kwargs: Params to set.
+        """
+        # Warm start should be True to get .fit() to keep existing estimators.
+        kwargs['warm_start'] = True
+
+        for key, value in kwargs.items():
+            setattr(self, key, value)
+
+        return self
+
+    def fit(self, *args,
+            pf_call: bool = False,
+            classes_: np.ndarray = None):
+        """
+        This fit handles calling either super().fit() or _sampled_partial_fit(), depending on the caller.
+
+        :param pf_call: True if called from partial_fit, in which case super().fit() is called rather than getting
+            stuck in a recursive loop.
+        :param classes_: On partial_fit calls, classes are passed from self.classes_, which will already have been
+            set. These are re-set after the call to super()'s fit, which would otherwise change them based on the
+            observed data.
+        """
+
+        if not self.dask_feeding and not pf_call:
+            if self.verbose > 0:
+                print('Feeding with spf')
+            self._sampled_partial_fit(*args)
+
+        else:
+
+            if self.verbose > 0:
+                print('Fitting from a partial_fit call')
+            super().fit(*args)
+            if classes_ is not None:
+                self.classes_ = classes_
+                self.n_classes_ = len(classes_)
+
+        return self
diff --git a/incremental_trees/add_ins/regressor_additions.py b/incremental_trees/add_ins/regressor_additions.py
new file mode 100644
index 0000000..7018f89
--- /dev/null
+++ b/incremental_trees/add_ins/regressor_additions.py
@@ -0,0 +1,9 @@
+from incremental_trees.add_ins.forest_additions import ForestAdditions
+
+
+class RegressorAdditions(ForestAdditions):
+    def _check_classes(self, **kwargs) -> None:
+        """
+        No need to check classes with the regressors.
+        """
+        pass
diff --git a/incremental_trees/add_ins/regressor_overloads.py b/incremental_trees/add_ins/regressor_overloads.py
new file mode 100644
index 0000000..020c3cc
--- /dev/null
+++ b/incremental_trees/add_ins/regressor_overloads.py
@@ -0,0 +1,8 @@
+from incremental_trees.add_ins.forest_overloads import ForestOverloads
+
+
+class RegressorOverloads(ForestOverloads):
+    """
+    Nothing specific to overload for the regressors; predict doesn't need to deal with classes.
+    """
+    pass
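When `dask_feeding=False`, a single `.fit()` call drives the whole sampled feeding loop defined above. A sketch of that path, using the StreamingRFC added later in this patch (parameter values are illustrative only):

````python
import numpy as np
from sklearn.datasets import make_blobs
from incremental_trees.models.classification.streaming_rfc import StreamingRFC

x, y = make_blobs(n_samples=int(1e4), n_features=20, centers=2, random_state=0)

srfc = StreamingRFC(n_estimators_per_chunk=5,
                    max_n_estimators=100,
                    dask_feeding=False,   # .fit() feeds .partial_fit() itself
                    spf_n_fits=20,        # 20 partial_fit calls...
                    spf_sample_prop=0.1)  # ...each on a random 10% of the rows

srfc.fit(x, y)
print(len(srfc.estimators_), srfc.score(x, y))  # 100 estimators
````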
diff --git a/incremental_trees/add_ins/sklearn_overloads.py b/incremental_trees/add_ins/sklearn_overloads.py
new file mode 100644
index 0000000..fd2ca7c
--- /dev/null
+++ b/incremental_trees/add_ins/sklearn_overloads.py
@@ -0,0 +1,46 @@
+import warnings
+
+import numpy as np
+from sklearn.utils.multiclass import unique_labels
+
+
+def _check_partial_fit_first_call(clf,
+                                  classes=None):
+    """
+    Modified sklearn function: if classes are inconsistent on a later call, warn and reuse the previous classes,
+    on the assumption that the first call's specification was correct, rather than raising an error.
+
+    Private helper function for factorizing common classes param logic.
+
+    Estimators that implement the ``partial_fit`` API need to be provided with
+    the list of possible classes at the first call to partial_fit.
+
+    Modification:
+    Subsequent calls to partial_fit do not check that ``classes`` is still
+    consistent with a previous value of ``clf.classes_`` when provided.
+
+    This function returns True if it detects that this was the first call to
+    ``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also
+    set on ``clf``.
+    """
+
+    if getattr(clf, 'classes_', None) is None and classes is None:
+        raise ValueError("classes must be passed on the first call "
+                         "to partial_fit.")
+
+    elif classes is not None:
+        if getattr(clf, 'classes_', None) is not None:
+            if not np.array_equal(clf.classes_, unique_labels(classes)):
+                # Don't error here:
+                # Instead, use the previous classes setting, which must be correct from the first setting
+                warnings.warn("Classes differ on this call; ignoring on the assumption the first call was correct.")
+                return False
+
+        else:
+            # This is the first call to partial_fit
+            clf.classes_ = unique_labels(classes)
+            return True
+
+    # classes is None and clf.classes_ has already previously been set:
+    # nothing to do
+    return False
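The modified helper's contract, shown against a bare stand-in object (a sketch; any object with a settable `classes_` attribute works for illustration):

````python
import types

from incremental_trees.add_ins.sklearn_overloads import _check_partial_fit_first_call

clf = types.SimpleNamespace(classes_=None)

print(_check_partial_fit_first_call(clf, classes=[0, 1, 2]))  # True: first call, sets clf.classes_
print(clf.classes_)                                           # [0 1 2]
print(_check_partial_fit_first_call(clf, classes=[0, 1, 2]))  # False: consistent, nothing to do
print(_check_partial_fit_first_call(clf, classes=[0, 1]))     # False: warns, keeps first-call classes
print(_check_partial_fit_first_call(clf, classes=None))       # False: classes_ already set
````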
diff --git a/incremental_trees/models/__init__.py b/incremental_trees/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/incremental_trees/models/classification/__init__.py b/incremental_trees/models/classification/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/incremental_trees/models/classification/streaming_extc.py b/incremental_trees/models/classification/streaming_extc.py
new file mode 100644
index 0000000..9f82686
--- /dev/null
+++ b/incremental_trees/models/classification/streaming_extc.py
@@ -0,0 +1,74 @@
+import numpy as np
+from sklearn.ensemble import ExtraTreesClassifier
+from sklearn.tree import ExtraTreeClassifier
+
+from incremental_trees.add_ins.classifier_additions import ClassifierAdditions
+from incremental_trees.add_ins.classifier_overloads import ClassifierOverloads
+
+
+class StreamingEXTC(ClassifierAdditions, ClassifierOverloads, ExtraTreesClassifier):
+    """Overload sklearn.ensemble.ExtraTreesClassifier to add a partial fit method and new params."""
+
+    def __init__(self,
+                 n_estimators_per_chunk: int = 1,
+                 n_estimators: int = None,
+                 max_n_estimators=np.inf,
+                 criterion="gini",
+                 max_depth=None,
+                 min_samples_split=2,
+                 min_samples_leaf=1,
+                 min_weight_fraction_leaf=0.,
+                 max_features="auto",
+                 max_leaf_nodes=None,
+                 min_impurity_decrease=0.,
+                 min_impurity_split=None,
+                 bootstrap=False,
+                 oob_score=False,
+                 n_jobs=None,
+                 random_state=None,
+                 verbose=0,
+                 warm_start=True,
+                 class_weight=None,
+                 dask_feeding: bool = True,
+                 spf_n_fits=100,
+                 spf_sample_prop: float = 0.1):
+        super(ExtraTreesClassifier, self).__init__(
+            base_estimator=ExtraTreeClassifier(),
+            n_estimators=n_estimators_per_chunk,
+            estimator_params=("criterion", "max_depth", "min_samples_split",
+                              "min_samples_leaf", "min_weight_fraction_leaf",
+                              "max_features", "max_leaf_nodes",
+                              "min_impurity_decrease", "min_impurity_split",
+                              "random_state"),
+            bootstrap=bootstrap,
+            oob_score=oob_score,
+            n_jobs=n_jobs,
+            random_state=random_state,
+            verbose=verbose,
+            warm_start=warm_start,
+            class_weight=class_weight)
+
+        self.max_n_estimators: int = None
+        self._fit_estimators: int = 0
+        self.classes_: np.array = None  # NB: Needs to be array, not list.
+        self.n_classes_: int = None
+
+        self._fit_estimators = 0
+        self.max_n_estimators = max_n_estimators
+        self.n_estimators_per_chunk = n_estimators
+        self.criterion = criterion
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_features = max_features
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_impurity_decrease = min_impurity_decrease
+        self.min_impurity_split = min_impurity_split
+
+        # Set additional params.
+        self.set_params(n_estimators_per_chunk=n_estimators_per_chunk,
+                        max_n_estimators=max_n_estimators,
+                        spf_n_fits=spf_n_fits,
+                        spf_sample_prop=spf_sample_prop,
+                        dask_feeding=dask_feeding)
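Usage mirrors the README's StreamingRFC example; a brief sketch of feeding StreamingEXTC chunk by chunk (illustrative parameter values):

````python
import numpy as np
from sklearn.datasets import make_blobs
from incremental_trees.models.classification.streaming_extc import StreamingEXTC

x, y = make_blobs(n_samples=int(1e4), n_features=20, centers=3, random_state=0)

sext = StreamingEXTC(n_estimators_per_chunk=4, max_n_estimators=40)

chunk_size = 1000
for _ in range(10):
    idx = np.random.randint(0, x.shape[0], chunk_size)
    sext.partial_fit(x[idx, :], y[idx], classes=np.unique(y))

print(len(sext.estimators_), sext.score(x, y))  # 40 estimators
````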
diff --git a/incremental_trees/models/classification/streaming_rfc.py b/incremental_trees/models/classification/streaming_rfc.py
new file mode 100644
index 0000000..54199b3
--- /dev/null
+++ b/incremental_trees/models/classification/streaming_rfc.py
@@ -0,0 +1,92 @@
+import warnings
+
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+
+from incremental_trees.add_ins.classifier_additions import ClassifierAdditions
+from incremental_trees.add_ins.classifier_overloads import ClassifierOverloads
+
+
+class StreamingRFC(ClassifierAdditions, ClassifierOverloads, RandomForestClassifier):
+    """
+    Overload sklearn.ensemble.RandomForestClassifier to add a partial fit method and new params.
+
+    Note this init has a slightly different structure to ExtraTreesClassifier/Regressor and RandomForestRegressor.
+    """
+
+    def __init__(self,
+                 bootstrap=True,
+                 class_weight=None,
+                 criterion='gini',
+                 max_depth=None,
+                 max_features='auto',
+                 max_leaf_nodes=None,
+                 min_impurity_decrease=0.0,
+                 min_impurity_split=None,
+                 min_samples_leaf=1,
+                 min_samples_split=2,
+                 min_weight_fraction_leaf=0.0,
+                 n_estimators_per_chunk: int = 1,
+                 n_jobs=None,
+                 oob_score=False,
+                 random_state=None,
+                 verbose=0,
+                 warm_start: bool = True,
+                 dask_feeding: bool = True,
+                 max_n_estimators=10,
+                 spf_n_fits=100,
+                 spf_sample_prop=0.1) -> None:
+        """
+        :param bootstrap:
+        :param class_weight:
+        :param criterion:
+        :param max_depth:
+        :param max_features:
+        :param max_leaf_nodes:
+        :param min_impurity_decrease:
+        :param min_impurity_split:
+        :param min_samples_leaf:
+        :param min_samples_split:
+        :param min_weight_fraction_leaf:
+        :param n_estimators_per_chunk: Estimators per chunk to fit.
+        :param n_jobs:
+        :param oob_score:
+        :param random_state:
+        :param verbose:
+        :param warm_start:
+        :param max_n_estimators: Total max number of estimators to fit.
+        :param verb: If > 0, display debugging info during fit.
+        """
+
+        # Run the super init, which also calls other parent inits to handle other params (like base estimator)
+        super().__init__()
+
+        self.max_n_estimators: int = None
+        self._fit_estimators: int = 0
+        self.classes_: np.array = None  # NB: Needs to be array, not list.
+        self.n_classes_: int = None
+
+        self.set_params(bootstrap=bootstrap,
+                        class_weight=class_weight,
+                        criterion=criterion,
+                        max_depth=max_depth,
+                        max_features=max_features,
+                        max_leaf_nodes=max_leaf_nodes,
+                        min_impurity_decrease=min_impurity_decrease,
+                        min_impurity_split=min_impurity_split,
+                        min_samples_leaf=min_samples_leaf,
+                        min_samples_split=min_samples_split,
+                        min_weight_fraction_leaf=min_weight_fraction_leaf,
+                        n_estimators_per_chunk=n_estimators_per_chunk,
+                        n_estimators=n_estimators_per_chunk,
+                        n_jobs=n_jobs,
+                        oob_score=oob_score,
+                        random_state=random_state,
+                        verbose=verbose,
+                        warm_start=warm_start,
+                        _fit_estimators=0,
+                        dask_feeding=dask_feeding,
+                        max_n_estimators=max_n_estimators,
+                        verb=0,
+                        spf_n_fits=spf_n_fits,
+                        spf_sample_prop=spf_sample_prop)
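The regressors that follow work the same way, minus the classes bookkeeping. A sketch with the StreamingRFR defined just below (illustrative values):

````python
import numpy as np
from sklearn.datasets import make_regression
from incremental_trees.models.regression.streaming_rfr import StreamingRFR

x, y = make_regression(n_samples=int(1e4), random_state=0, n_features=40)

srfr = StreamingRFR(n_estimators_per_chunk=5, max_n_estimators=100)

chunk_size = 2000
for _ in range(20):
    idx = np.random.randint(0, x.shape[0], chunk_size)
    srfr.partial_fit(x[idx], y[idx])  # no classes needed for regressors

print(len(srfr.estimators_), srfr.score(x, y))  # 100 estimators
````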
diff --git a/incremental_trees/models/regression/__init__.py b/incremental_trees/models/regression/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/incremental_trees/models/regression/streaming_extr.py b/incremental_trees/models/regression/streaming_extr.py
new file mode 100644
index 0000000..6d565d2
--- /dev/null
+++ b/incremental_trees/models/regression/streaming_extr.py
@@ -0,0 +1,65 @@
+import numpy as np
+from sklearn.ensemble import ExtraTreesRegressor
+from sklearn.tree import ExtraTreeRegressor
+
+from incremental_trees.add_ins.regressor_additions import RegressorAdditions
+from incremental_trees.add_ins.regressor_overloads import RegressorOverloads
+
+
+class StreamingEXTR(RegressorAdditions, RegressorOverloads, ExtraTreesRegressor):
+    def __init__(self,
+                 n_estimators_per_chunk: int = 1,
+                 n_estimators='warn',
+                 max_n_estimators=np.inf,
+                 criterion="mse",
+                 max_depth=None,
+                 min_samples_split=2,
+                 min_samples_leaf=1,
+                 min_weight_fraction_leaf=0.,
+                 max_features="auto",
+                 max_leaf_nodes=None,
+                 min_impurity_decrease=0.,
+                 min_impurity_split=None,
+                 bootstrap=False,
+                 oob_score=False,
+                 n_jobs=None,
+                 random_state=None,
+                 verbose=0,
+                 warm_start=True,
+                 dask_feeding: bool = True,
+                 spf_n_fits: int = 100,
+                 spf_sample_prop: float = 0.1):
+        super(ExtraTreesRegressor, self).__init__(
+            base_estimator=ExtraTreeRegressor(),
+            n_estimators=n_estimators_per_chunk,
+            estimator_params=("criterion", "max_depth", "min_samples_split",
+                              "min_samples_leaf", "min_weight_fraction_leaf",
+                              "max_features", "max_leaf_nodes",
+                              "min_impurity_decrease", "min_impurity_split",
+                              "random_state"),
+            bootstrap=bootstrap,
+            oob_score=oob_score,
+            n_jobs=n_jobs,
+            random_state=random_state,
+            verbose=verbose,
+            warm_start=warm_start)
+
+        self._fit_estimators = 0
+        self.max_n_estimators = max_n_estimators
+        self.n_estimators_per_chunk = n_estimators
+        self.criterion = criterion
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_features = max_features
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_impurity_decrease = min_impurity_decrease
+        self.min_impurity_split = min_impurity_split
+
+        # Set additional params.
+        self.set_params(n_estimators_per_chunk=n_estimators_per_chunk,
+                        max_n_estimators=max_n_estimators,
+                        spf_n_fits=spf_n_fits,
+                        spf_sample_prop=spf_sample_prop,
+                        dask_feeding=dask_feeding)
diff --git a/incremental_trees/models/regression/streaming_rfr.py b/incremental_trees/models/regression/streaming_rfr.py
new file mode 100644
index 0000000..a028c93
--- /dev/null
+++ b/incremental_trees/models/regression/streaming_rfr.py
@@ -0,0 +1,65 @@
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.tree import DecisionTreeRegressor
+
+from incremental_trees.add_ins.regressor_additions import RegressorAdditions
+from incremental_trees.add_ins.regressor_overloads import RegressorOverloads
+
+
+class StreamingRFR(RegressorAdditions, RegressorOverloads, RandomForestRegressor):
+    """Overload sklearn.ensemble.RandomForestRegressor to add a partial fit method and new params."""
+
+    def __init__(self,
+                 n_estimators='warn',
+                 criterion="mse",
+                 max_depth=None,
+                 min_samples_split=2,
+                 min_samples_leaf=1,
+                 min_weight_fraction_leaf=0.,
+                 max_features="auto",
+                 max_leaf_nodes=None,
+                 min_impurity_decrease=0.,
+                 min_impurity_split=None,
+                 bootstrap=True,
+                 oob_score=False,
+                 n_jobs=None,
+                 random_state=None,
+                 verbose=0,
+                 n_estimators_per_chunk: int = 1,
+                 warm_start: bool = True,
+                 dask_feeding: bool = True,
+                 max_n_estimators=10,
+                 spf_n_fits=100,
+                 spf_sample_prop=0.1):
+        super(RandomForestRegressor, self).__init__(
+            base_estimator=DecisionTreeRegressor(),
+            n_estimators=n_estimators_per_chunk,
+            estimator_params=("criterion", "max_depth", "min_samples_split",
+                              "min_samples_leaf", "min_weight_fraction_leaf",
+                              "max_features", "max_leaf_nodes",
+                              "min_impurity_decrease", "min_impurity_split",
+                              "random_state"),
+            bootstrap=bootstrap,
+            oob_score=oob_score,
+            n_jobs=n_jobs,
+            random_state=random_state,
+            verbose=verbose,
+            warm_start=warm_start)
+
+        self._fit_estimators = 0
+        self.max_n_estimators = max_n_estimators
+        self.n_estimators_per_chunk = n_estimators
+        self.criterion = criterion
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_features = max_features
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_impurity_decrease = min_impurity_decrease
+        self.min_impurity_split = min_impurity_split
+
+        # Set additional params.
+        self.set_params(n_estimators_per_chunk=n_estimators_per_chunk,
+                        spf_n_fits=spf_n_fits,
+                        spf_sample_prop=spf_sample_prop,
+                        dask_feeding=dask_feeding)
diff --git a/incremental_trees/trees.py b/incremental_trees/trees.py
index d92539e..4d29adf 100644
--- a/incremental_trees/trees.py
+++ b/incremental_trees/trees.py
@@ -1,553 +1,12 @@
-import time
-import warnings
-from typing import Union, List
-
 import numpy as np
-import pandas as pd
-from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, ExtraTreesRegressor
-from sklearn.tree import ExtraTreeClassifier, ExtraTreeRegressor, DecisionTreeRegressor
-from sklearn.utils.multiclass import unique_labels
-
-
-def _check_partial_fit_first_call(clf,
-                                  classes=None):
-    """
-    Modified sklearn function. If classes are inconsistent on second call, warn and reuse previous
-    on assumption first call specification was correct. Don't raise error.
- - Private helper function for factorizing common classes param logic - - Estimators that implement the ``partial_fit`` API need to be provided with - the list of possible classes at the first call to partial_fit. - - Modification: - Subsequent calls to partial_fit do not check () that ``classes`` is still - consistent with a previous value of ``clf.classes_`` when provided. - - This function returns True if it detects that this was the first call to - ``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also - set on ``clf``. - """ - - if getattr(clf, 'classes_', None) is None and classes is None: - raise ValueError("classes must be passed on the first call " - "to partial_fit.") - - elif classes is not None: - if getattr(clf, 'classes_', None) is not None: - if not np.array_equal(clf.classes_, unique_labels(classes)): - # Don't error here: - # Instead, use the previous classes setting, which must be correct on first setting - warnings.warn(f"Classes differ on this call, ignoring on the assumption first call was correct.") - return False - # raise ValueError( - # "`classes=%r` is not the same as on last call " - # "to partial_fit, was: %r" % (classes, clf.classes_)) - - else: - # This is the first call to partial_fit - clf.classes_ = unique_labels(classes) - return True - - # classes is None and clf.classes_ has already previously been set: - # nothing to do - return False - - -class ForestAdditions: - def partial_fit(self, X: Union[np.array, pd.DataFrame], y: Union[np.array, pd.Series], - classes: Union[list, np.ndarray] = None): - """ - Fit a single DTC using the given subset of x and y. -​ - This calls .fit, which is overloaded. However flags pf_call=True, so .fit() will handle calling super .fit(). -​ - For classifiers; - - First call needs to be supplied with the expected classes (similar to existing models with .partial_fit()) - in case not all classes are present in the first subset. - - This object sets classes_ and n_classes_ depending on the supplied classes. The Individual trees set theirs - depending on the data available in the subset. The predict_proba method is modified to standardise shape to the - dimensions defined in this object. - - For regressors: - - self._check_classes is overloaded with dummy method. -​ - :param x: - :param y: - :return: - """ - if self.verbose > 1: - print(f"PF Call with set classes: " - f"{getattr(self, 'classes_', '[no classes attr]')} and input classes {classes}") - - self._check_classes(classes=classes) - - # Fit the next estimator, if not done - if self._fit_estimators < self.max_n_estimators: - t0 = time.time() - self.fit(X, y, - pf_call=True, - classes_=getattr(self, 'classes_', None)) # Pass classes for enforcement, if classifier. - t1 = time.time() - - if self.verbose > 1: - print(f"Fit estimators {self._fit_estimators} - {self._fit_estimators + self.n_estimators_per_chunk} " - f"/ {self.max_n_estimators}") - print(f"Model reports {len(self.estimators_)}") - print(f"Fit time: {round(t1 - t0, 2)}") - print(len(self.estimators_)) - self._fit_estimators += self.n_estimators_per_chunk - - # If still not done, prep to fit next - if self._fit_estimators < self.max_n_estimators: - self.n_estimators += self.n_estimators_per_chunk - - else: - if self.verbose > 0: - print('Done') - - return self - - def _sampled_partial_fit(self, - x: Union[np.array, pd.DataFrame], y: [np.ndarray, pd.Series]): - """ - This feeds partial_fit with random samples based on the spf_ parameters. Used by .fit() when not using dask. - :param x: Data. 
- :param y: Labels. - :return: - """ - - n_samples = int(self.spf_sample_prop * x.shape[0]) - - for _ in range(self.spf_n_fits): - idx = np.random.randint(0, x.shape[0], n_samples) - - if self.verbose > 0: - print(f"_sampled_partial_fit size: {idx.shape}") - - self.partial_fit(x[idx, :], y[idx], - classes=np.unique(y)) - - return self - - -class ClassifierAdditions(ForestAdditions): - """ - Additional functions specific to classifiers. - """ - - def _check_classes(self, classes: List[int]): - """Set classes if they haven't been set yet, otherwise do nothing.""" - - # Set classes for forest (this only needs to be done once). - # Not for each individual tree, these will be set by .fit() using the classes available in the subset. - # Check classes_ is set, or provided - # Returns false if nothing to do - classes_need_setting = _check_partial_fit_first_call(self, classes) - - # If classes not set, set - # Above will error if not set and classes = None - if classes_need_setting: - self.classes_ = np.array(classes) - self.n_classes_ = len(classes) - - -class RegressorAdditions(ForestAdditions): - def _check_classes(self, **kwargs) -> None: - """ - Don't need to check classes with the regressor. - """ - pass - - -class ForestOverloads: - def set_params(self, - **kwargs): - """ - Ensure warm_Start is set to true, otherwise set other params as usual. - - :param kwargs: Params to set. - """ - # Warm start should be true to get .fit() to keep existing estimators. - kwargs['warm_start'] = True - - for key, value in kwargs.items(): - setattr(self, key, value) - - return self - - def fit(self, *args, - pf_call: bool = False, - classes_: np.ndarray = None): - """ - This fit handles calling either super().fit or partial_fit depending on the caller. - - :param pf_call: True if called from partial fit, in this case super.fit() is called, instead of getting stuck in - a recursive loop. - :param classes_: On pf calls, classes is passed from self.classes which will have already been set. These are - re-set after the call to super's fit, which will change them based on observed data. - """ - if not self.dask_feeding and not pf_call: - if self.verbose > 0: - print('Feeding with spf') - self._sampled_partial_fit(*args) +from incremental_trees.models.classification.streaming_extc import StreamingEXTC +from incremental_trees.models.classification.streaming_rfc import StreamingRFC +from incremental_trees.models.regression.streaming_extr import StreamingEXTR +from incremental_trees.models.regression.streaming_rfr import StreamingRFR - else: - if self.verbose > 0: - print('Fitting from a partial_fit call') - super().fit(*args) - if classes_ is not None: - self.classes_ = classes_ - self.n_classes_ = len(classes_) - - return self - - -class ClassifierOverloads(ForestOverloads): - """ - Overloaded methods specific to classifiers. - """ - - def predict_proba(self, x: Union[np.ndarray, pd.DataFrame]) -> np.ndarray: - """ - Call each predict proba from tree, and accumulate. This handle possibly inconsistent shapes, but isn't - parallel? - ​ - Cases where not all classes are presented in the first or subsequent subsets needs to be - handled. For the RandomForestClassifier, tree predictions are averaged in - sklearn.ensemble.forest.accumulate_prediction function. This sums the output matrix with dimensions - n rows x n classes and fails if the class dimension differs. 
- The class dimension is defined at the individual estimator level during the .fit() call, which sets the - following attributes: - - self.n_outputs_ = y.shape[1], which is then used by _validate_y_class_weight()), always called in .fit() - to set: - - self.classes_ - - self.n_classes_ - - The .predict() method (sklearn.tree.tree.BaseDecisionTree.predict()) sets the output shape using: - # Classification - if is_classifier(self): - if self.n_outputs_ == 1: - return self.classes_.take(np.argmax(proba, axis=1), axis=0) - else: - [Not considering this yet] - - :param x: - :return: - """ - # Prepare expected output shape - preds = np.zeros(shape=(x.shape[0], self.n_classes_), - dtype=np.float32) - counts = np.zeros(shape=(x.shape[0], self.n_classes_), - dtype=np.int16) - - for e in self.estimators_: - # Get the prediction from the tree - est_preds = e.predict_proba(x) - # Get the indexes of the classes present - present_classes = e.classes_.astype(int) - # Sum these in to the correct array columns - preds[:, present_classes] += est_preds - counts[:, present_classes] += 1 - - # Normalise predictions against counts - with warnings.catch_warnings(): - warnings.simplefilter("ignore", RuntimeWarning) - norm_prob = preds / counts - - # And remove nans (0/0) and infs (n/0) - norm_prob[np.isnan(norm_prob) | np.isinf(norm_prob)] = 0 - - return norm_prob - - -class RegressorOverloads(ForestOverloads): - """ - Nothing specific to overload for the Regressors. Predict doesn't need to deal with classes. - """ - pass - - -class StreamingRFR(RegressorAdditions, RegressorOverloads, RandomForestRegressor): - """Overload sklearn.ensemble.RandomForestClassifier to add partial fit method and new params.""" - - def __init__(self, - n_estimators='warn', - criterion="mse", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - n_estimators_per_chunk: int = 1, - warm_start: bool = True, - dask_feeding: bool = True, - max_n_estimators=10, - spf_n_fits=100, - spf_sample_prop=0.1): - super(RandomForestRegressor, self).__init__( - base_estimator=DecisionTreeRegressor(), - n_estimators=n_estimators_per_chunk, - estimator_params=("criterion", "max_depth", "min_samples_split", - "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", - "min_impurity_decrease", "min_impurity_split", - "random_state"), - bootstrap=bootstrap, - oob_score=oob_score, - n_jobs=n_jobs, - random_state=random_state, - verbose=verbose, - warm_start=warm_start) - - self._fit_estimators = 0 - self.max_n_estimators = max_n_estimators - self.n_estimators_per_chunk = n_estimators - self.criterion = criterion - self.max_depth = max_depth - self.min_samples_split = min_samples_split - self.min_samples_leaf = min_samples_leaf - self.min_weight_fraction_leaf = min_weight_fraction_leaf - self.max_features = max_features - self.max_leaf_nodes = max_leaf_nodes - self.min_impurity_decrease = min_impurity_decrease - self.min_impurity_split = min_impurity_split - - # Set additional params. 
- self.set_params(n_estimators_per_chunk=n_estimators_per_chunk, - spf_n_fits=spf_n_fits, - spf_sample_prop=spf_sample_prop, - dask_feeding=dask_feeding) - - -class StreamingRFC(ClassifierAdditions, ClassifierOverloads, RandomForestClassifier): - """ - Overload sklearn.ensemble.RandomForestClassifier to add partial fit method and new params. - - Note this init is a slightly different structure to ExtraTressClassifier/Regressor and RandomForestRegressor. - """ - - def __init__(self, - bootstrap=True, - class_weight=None, - criterion='gini', - max_depth=None, - max_features='auto', - max_leaf_nodes=None, - min_impurity_decrease=0.0, - min_impurity_split=None, - min_samples_leaf=1, - min_samples_split=2, - min_weight_fraction_leaf=0.0, - n_estimators_per_chunk: int = 1, - n_estimators: bool = None, - n_jobs=None, - oob_score=False, - random_state=None, - verbose=0, - warm_start: bool = True, - dask_feeding: bool = True, - max_n_estimators=10, - spf_n_fits=100, - spf_sample_prop=0.1) -> None: - """ - :param bootstrap: - :param class_weight: - :param criterion: - :param max_depth: - :param max_features: - :param max_leaf_nodes: - :param min_impurity_decrease: - :param min_impurity_split: - :param min_samples_leaf: - :param min_samples_split: - :param min_weight_fraction_leaf: - :param n_estimators_per_chunk: Estimators per chunk to fit. - :param n_jobs: - :param oob_score: - :param random_state: - :param verbose: - :param warm_start: - :param max_n_estimators: Total max number of estimators to fit. - :param verb: If > 0 display debugging info during fit - """ - - # Run the super init, which also calls other parent inits to handle other params (like base estimator) - super().__init__() - - self.max_n_estimators: int = None - self._fit_estimators: int = 0 - self.classes_: np.array = None # NB: Needs to be array, not list. 
- self.n_classes_: int = None - - # n_estimators will be used by RFC to set how many ests are fit on each .fit() call - n_estimators = n_estimators_per_chunk - - self.set_params(bootstrap=bootstrap, - class_weight=class_weight, - criterion=criterion, - max_depth=max_depth, - max_features=max_features, - max_leaf_nodes=max_leaf_nodes, - min_impurity_decrease=min_impurity_decrease, - min_impurity_split=min_impurity_split, - min_samples_leaf=min_samples_leaf, - min_samples_split=min_samples_split, - min_weight_fraction_leaf=min_weight_fraction_leaf, - n_estimators_per_chunk=n_estimators_per_chunk, - n_estimators=n_estimators, - n_jobs=n_jobs, - oob_score=oob_score, - random_state=random_state, - verbose=verbose, - warm_start=warm_start, - _fit_estimators=0, - dask_feeding=dask_feeding, - max_n_estimators=max_n_estimators, - verb=0, - spf_n_fits=spf_n_fits, - spf_sample_prop=spf_sample_prop) - - -class StreamingEXTR(RegressorAdditions, RegressorOverloads, ExtraTreesRegressor): - def __init__(self, - n_estimators_per_chunk: int = 1, - n_estimators='warn', - max_n_estimators=np.inf, - criterion="mse", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=True, - dask_feeding: bool = True, - spf_n_fits: int = 100, - spf_sample_prop: float = 0.1): - super(ExtraTreesRegressor, self).__init__( - base_estimator=ExtraTreeRegressor(), - n_estimators=n_estimators_per_chunk, - estimator_params=("criterion", "max_depth", "min_samples_split", - "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", - "min_impurity_decrease", "min_impurity_split", - "random_state"), - bootstrap=bootstrap, - oob_score=oob_score, - n_jobs=n_jobs, - random_state=random_state, - verbose=verbose, - warm_start=warm_start) - - self._fit_estimators = 0 - self.max_n_estimators = max_n_estimators - self.n_estimators_per_chunk = n_estimators - self.criterion = criterion - self.max_depth = max_depth - self.min_samples_split = min_samples_split - self.min_samples_leaf = min_samples_leaf - self.min_weight_fraction_leaf = min_weight_fraction_leaf - self.max_features = max_features - self.max_leaf_nodes = max_leaf_nodes - self.min_impurity_decrease = min_impurity_decrease - self.min_impurity_split = min_impurity_split - - # Set additional params. 
- self.set_params(n_estimators_per_chunk=n_estimators_per_chunk, - max_n_estimators=max_n_estimators, - spf_n_fits=spf_n_fits, - spf_sample_prop=spf_sample_prop, - dask_feeding=dask_feeding) - - -class StreamingEXTC(ClassifierAdditions, ClassifierOverloads, ExtraTreesClassifier): - """Overload sklearn.ensemble.ExtraTreesClassifier to add partial fit method and new params.""" - - def __init__(self, - n_estimators_per_chunk: int = 1, - n_estimators: bool = None, - max_n_estimators=np.inf, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=True, - class_weight=None, - dask_feeding: bool = True, - spf_n_fits=100, - spf_sample_prop: float = 0.1): - super(ExtraTreesClassifier, self).__init__( - base_estimator=ExtraTreeClassifier(), - n_estimators=n_estimators_per_chunk, - estimator_params=("criterion", "max_depth", "min_samples_split", - "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", - "min_impurity_decrease", "min_impurity_split", - "random_state"), - bootstrap=bootstrap, - oob_score=oob_score, - n_jobs=n_jobs, - random_state=random_state, - verbose=verbose, - warm_start=warm_start, - class_weight=class_weight) - - self.max_n_estimators: int = None - self._fit_estimators: int = 0 - self.classes_: np.array = None # NB: Needs to be array, not list. - self.n_classes_: int = None - - self._fit_estimators = 0 - self.max_n_estimators = max_n_estimators - self.n_estimators_per_chunk = n_estimators - self.criterion = criterion - self.max_depth = max_depth - self.min_samples_split = min_samples_split - self.min_samples_leaf = min_samples_leaf - self.min_weight_fraction_leaf = min_weight_fraction_leaf - self.max_features = max_features - self.max_leaf_nodes = max_leaf_nodes - self.min_impurity_decrease = min_impurity_decrease - self.min_impurity_split = min_impurity_split - - # Set additional params. 
- self.set_params(n_estimators_per_chunk=n_estimators_per_chunk, - max_n_estimators=max_n_estimators, - spf_n_fits=spf_n_fits, - spf_sample_prop=spf_sample_prop, - dask_feeding=dask_feeding) - - -if __name__ == '__main__': +def bunch_of_examples(): from sklearn.datasets import make_blobs, make_regression x, y = make_regression(n_samples=int(2e5), @@ -560,6 +19,8 @@ def __init__(self, verbose=0, n_jobs=2) + srfr.fit(x, y) + # Fit 10 regressors for _ in range(10): x, y = make_regression(n_samples=int(2e5), @@ -572,7 +33,7 @@ def __init__(self, n_jobs=5) chunk_size = int(2e3) - for i in range(20): + for _ in range(20): sample_idx = np.random.randint(0, x.shape[0], chunk_size) srfr.partial_fit(x[sample_idx], y[sample_idx], classes=np.unique(y)) @@ -584,7 +45,7 @@ def __init__(self, verbose=0, n_jobs=5) - for i in range(20): + for _ in range(20): sample_idx = np.random.randint(0, x.shape[0], chunk_size) sext.partial_fit(x[sample_idx], y[sample_idx], classes=np.unique(y)) @@ -605,7 +66,7 @@ def __init__(self, n_jobs=5) chunk_size = int(2e3) - for i in range(20): + for _ in range(20): sample_idx = np.random.randint(0, x.shape[0], chunk_size) srfc.partial_fit(x[sample_idx], y[sample_idx], classes=np.unique(y)) @@ -617,9 +78,13 @@ def __init__(self, verbose=0, n_jobs=5) - for i in range(20): + for _ in range(20): sample_idx = np.random.randint(0, x.shape[0], chunk_size) sext.partial_fit(x[sample_idx], y[sample_idx], classes=np.unique(y)) print(f"SEXTC: {sext.score(x, y)}") + + +if __name__ == '__main__': + bunch_of_examples() diff --git a/requirements.txt b/requirements.txt index cd38d29..0ce12d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,14 @@ -scikit-learn==0.22.2 -pandas==1.0.1 -dask==2.12.0 +scikit-learn>=0.22 +pandas +numpy +dask>=2 dask-glm==0.2.0 -dask-ml==1.2.0 -distributed==2.12.0 +dask-ml>=1 +distributed>=2 bokeh pytest jupyter jupyterlab ipykernel matplotlib -fsspec>=0.3.3 +fsspec diff --git a/setup.py b/setup.py index 2201664..60af819 100644 --- a/setup.py +++ b/setup.py @@ -14,8 +14,8 @@ long_description_content_type="text/markdown", packages=setuptools.find_packages(), url="https://github.com/garethjns/IncrementalTrees", - install_requires=["scikit-learn==0.22.2", "pandas==1.0.1", - "dask==2.12.0", + install_requires=["scikit-learn>=0.22", "pandas", + "dask>=2", "dask-glm==0.2.0", - "dask-ml==1.2.0", + "dask-ml>=1", "bokeh"]) diff --git a/tests/common/__init__.py b/tests/common/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/data.py b/tests/common/data_fixture.py similarity index 98% rename from tests/data.py rename to tests/common/data_fixture.py index 791fb9d..68bbd61 100644 --- a/tests/data.py +++ b/tests/common/data_fixture.py @@ -1,10 +1,10 @@ from sklearn.datasets import make_blobs from sklearn.metrics import roc_auc_score -from sklearn.model_selection import train_test_split from sklearn.metrics.classification import classification_report +from sklearn.model_selection import train_test_split -class Data: +class DataFixture: def _prep_data(self): x, y = make_blobs(n_samples=int(2e4), random_state=0, @@ -18,7 +18,6 @@ def _prep_data(self): return self def _mod_report(self, mod): - report = classification_report(self.y_test, mod.predict(self.x_test)) train_auc = roc_auc_score(self.y_train, mod.predict_proba(self.x_train)[:, 1]) test_auc = roc_auc_score(self.y_test, mod.predict_proba(self.x_test)[:, 1]) diff --git a/tests/params.py b/tests/common/param_fixtures.py similarity index 100% rename from tests/params.py rename to 
tests/common/param_fixtures.py diff --git a/tests/integration/base.py b/tests/integration/base.py index b39e250..80135b2 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -1,7 +1,6 @@ -import dask_ml -from dask_ml.datasets import make_blobs, make_regression import numpy as np import sklearn +from dask_ml.datasets import make_blobs, make_regression from distributed import LocalCluster, Client from sklearn import clone from sklearn.model_selection import RandomizedSearchCV diff --git a/tests/integration/incremental_trees/test_trees.py b/tests/integration/incremental_trees/test_trees.py index d0b179b..ad7c2d9 100644 --- a/tests/integration/incremental_trees/test_trees.py +++ b/tests/integration/incremental_trees/test_trees.py @@ -4,16 +4,20 @@ import sklearn import sklearn.datasets -from incremental_trees.trees import StreamingRFC, StreamingRFR, StreamingEXTC, StreamingEXTR +from incremental_trees.models.classification.streaming_extc import StreamingEXTC +from incremental_trees.models.regression.streaming_extr import StreamingEXTR +from incremental_trees.models.regression.streaming_rfr import StreamingRFR +from incremental_trees.trees import StreamingRFC from tests.integration.base import PartialFitTests, FitTests -class TestStreamingRFC_1(PartialFitTests, unittest.TestCase): +class TestStreamingRFC1(PartialFitTests, unittest.TestCase): """ Test SRFC with single estimator per chunk with "random forest style" max features. ie, subset. No limit on the total number of trees. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -35,12 +39,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingRFC_2(PartialFitTests, unittest.TestCase): +class TestStreamingRFC2(PartialFitTests, unittest.TestCase): """ Test SRFC with single estimator per chunk with "random forest style" max features. ie, subset. Total models limited to 39. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -61,12 +66,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingRFC_3(PartialFitTests, unittest.TestCase): +class TestStreamingRFC3(PartialFitTests, unittest.TestCase): """ Test SRFC with multiple estimators per chunk with "random forest style" max features. ie, subset. No limit on total models, 3 estimators per row subset (each with different feature subset) """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -88,12 +94,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingRFC_4(PartialFitTests, unittest.TestCase): +class TestStreamingRFC4(PartialFitTests, unittest.TestCase): """ Test SRFC with single estimator per chunk with "decision tree style" max features. ie, all available to each tree. No limit on total models, 1 estimators per row subset. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -115,12 +122,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingRFC_5(PartialFitTests, unittest.TestCase): +class TestStreamingRFC5(PartialFitTests, unittest.TestCase): """ Test SRFC with single estimator per chunk with "decision tree style" max features. ie, all available to each tree. No limit on total models, 3 estimators per row subset. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -143,12 +151,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingRFC_6(PartialFitTests, unittest.TestCase): +class TestStreamingRFC6(PartialFitTests, unittest.TestCase): """ Test SRFC with single estimator per chunk with "decision tree style" max features. 
ie, all available to each tree. No limit on total models, 3 estimators per row subset. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -171,7 +180,7 @@ def setUpClass(cls): super().setUpClass() -class TestStreaming_RFC_7(FitTests, unittest.TestCase): +class TestStreamingRFC7(FitTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -188,11 +197,10 @@ def setUpClass(cls): spf_sample_prop=cls.spf_sample_prop, spf_n_fits=cls.spf_n_fits) - super().setUpClass() -class TestStreaming_RFC_8(FitTests, unittest.TestCase): +class TestStreamingRFC8(FitTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -208,11 +216,10 @@ def setUpClass(cls): spf_sample_prop=cls.spf_sample_prop, spf_n_fits=cls.spf_n_fits) - super().setUpClass() -class TestStreaming_RFC_9(FitTests, unittest.TestCase): +class TestStreamingRFC9(FitTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -232,12 +239,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingRFR_1(PartialFitTests, unittest.TestCase): +class TestStreamingRFR1(PartialFitTests, unittest.TestCase): """ Test SRFC with single estimator per chunk with "random forest style" max features. ie, subset. No limit on the total number of trees. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -257,12 +265,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingRFR_2(PartialFitTests, unittest.TestCase): +class TestStreamingRFR2(PartialFitTests, unittest.TestCase): """ Test SRFC with single estimator per chunk with "random forest style" max features. ie, subset. Total models limited to 39. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -281,12 +290,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingRFR_3(PartialFitTests, unittest.TestCase): +class TestStreamingRFR3(PartialFitTests, unittest.TestCase): """ Test SRFC with multiple estimators per chunk with "random forest style" max features. ie, subset. No limit on total models, 3 estimators per row subset (each with different feature subset) """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -306,12 +316,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingRFR_4(PartialFitTests, unittest.TestCase): +class TestStreamingRFR4(PartialFitTests, unittest.TestCase): """ Test SRFC with single estimator per chunk with "decision tree style" max features. ie, all available to each tree. No limit on total models, 1 estimators per row subset. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -331,12 +342,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingRFR_5(PartialFitTests, unittest.TestCase): +class TestStreamingRFR5(PartialFitTests, unittest.TestCase): """ Test SRFC with single estimator per chunk with "decision tree style" max features. ie, all available to each tree. No limit on total models, 3 estimators per row subset. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -357,12 +369,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingRFR_6(PartialFitTests, unittest.TestCase): +class TestStreamingRFR6(PartialFitTests, unittest.TestCase): """ Test SRFC with single estimator per chunk with "decision tree style" max features. ie, all available to each tree. No limit on total models, 3 estimators per row subset. 
""" + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -383,7 +396,7 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingRFR_7(FitTests, unittest.TestCase): +class TestStreamingRFR7(FitTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -403,7 +416,7 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingRFR_8(FitTests, unittest.TestCase): +class TestStreamingRFR8(FitTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -422,7 +435,7 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingRFR_9(FitTests, unittest.TestCase): +class TestStreamingRFR9(FitTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -442,12 +455,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTC_1(PartialFitTests, unittest.TestCase): +class TestStreamingEXTC1(PartialFitTests, unittest.TestCase): """ Test SEXT with single estimator per chunk with "random forest style" max features. ie, subset. No limit on the total number of trees. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -468,12 +482,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTC_2(PartialFitTests, unittest.TestCase): +class TestStreamingEXTC2(PartialFitTests, unittest.TestCase): """ Test SEXT with single estimator per chunk with "random forest style" max features. ie, subset. Total models limited to 39. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -494,12 +509,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTC_3(PartialFitTests, unittest.TestCase): +class TestStreamingEXTC3(PartialFitTests, unittest.TestCase): """ Test SEXT with multiple estimators per chunk with "random forest style" max features. ie, subset. No limit on total models, 3 estimators per row subset (each with different feature subset) """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -521,12 +537,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTC_4(PartialFitTests, unittest.TestCase): +class TestStreamingEXTC4(PartialFitTests, unittest.TestCase): """ Test SEXT with single estimator per chunk with "decision tree style" max features. ie, all available to each tree. No limit on total models, 1 estimators per row subset. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -548,12 +565,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTC_5(PartialFitTests, unittest.TestCase): +class TestStreamingEXTC5(PartialFitTests, unittest.TestCase): """ Test SEXT with single estimator per chunk with "decision tree style" max features. ie, all available to each tree. No limit on total models, 3 estimators per row subset. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -576,12 +594,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTC_6(PartialFitTests, unittest.TestCase): +class TestStreamingEXTC6(PartialFitTests, unittest.TestCase): """ Test SEXT with single estimator per chunk with "decision tree style" max features. ie, all available to each tree. No limit on total models, 3 estimators per row subset. 
""" + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -604,7 +623,7 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTC_7(FitTests, unittest.TestCase): +class TestStreamingEXTC7(FitTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -624,7 +643,7 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTC_8(FitTests, unittest.TestCase): +class TestStreamingEXTC8(FitTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -643,7 +662,7 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTC_9(FitTests, unittest.TestCase): +class TestStreamingEXTC9(FitTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -663,12 +682,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTR_1(PartialFitTests, unittest.TestCase): +class TestStreamingEXTR1(PartialFitTests, unittest.TestCase): """ Test SEXT with single estimator per chunk with "random forest style" max features. ie, subset. No limit on the total number of trees. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -687,12 +707,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTR_2(PartialFitTests, unittest.TestCase): +class TestStreamingEXTR2(PartialFitTests, unittest.TestCase): """ Test SEXT with single estimator per chunk with "random forest style" max features. ie, subset. Total models limited to 39. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -711,12 +732,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTR_3(PartialFitTests, unittest.TestCase): +class TestStreamingEXTR3(PartialFitTests, unittest.TestCase): """ Test SEXT with multiple estimators per chunk with "random forest style" max features. ie, subset. No limit on total models, 3 estimators per row subset (each with different feature subset) """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -736,12 +758,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTR_4(PartialFitTests, unittest.TestCase): +class TestStreamingEXTR4(PartialFitTests, unittest.TestCase): """ Test SEXT with single estimator per chunk with "decision tree style" max features. ie, all available to each tree. No limit on total models, 1 estimators per row subset. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -761,12 +784,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTR_5(PartialFitTests, unittest.TestCase): +class TestStreamingEXTR5(PartialFitTests, unittest.TestCase): """ Test SEXT with single estimator per chunk with "decision tree style" max features. ie, all available to each tree. No limit on total models, 3 estimators per row subset. """ + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -787,12 +811,13 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTR_6(PartialFitTests, unittest.TestCase): +class TestStreamingEXTR6(PartialFitTests, unittest.TestCase): """ Test SEXT with single estimator per chunk with "decision tree style" max features. ie, all available to each tree. No limit on total models, 3 estimators per row subset. 
""" + @classmethod def setUpClass(cls): """Set up model to test.""" @@ -813,7 +838,7 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTR_7(FitTests, unittest.TestCase): +class TestStreamingEXTR7(FitTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -833,7 +858,7 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTR_8(FitTests, unittest.TestCase): +class TestStreamingEXTR8(FitTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -852,7 +877,7 @@ def setUpClass(cls): super().setUpClass() -class TestStreamingEXTR_9(FitTests, unittest.TestCase): +class TestStreamingEXTR9(FitTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" diff --git a/tests/integration/incremental_trees/test_trees_benchmarks.py b/tests/integration/incremental_trees/test_trees_benchmarks.py index dc4f0cc..33e9b7f 100644 --- a/tests/integration/incremental_trees/test_trees_benchmarks.py +++ b/tests/integration/incremental_trees/test_trees_benchmarks.py @@ -3,20 +3,21 @@ import numpy as np from distributed import LocalCluster, Client from sklearn import clone -from sklearn.linear_model import LogisticRegression from sklearn.ensemble.forest import RandomForestClassifier +from sklearn.linear_model import LogisticRegression from incremental_trees.trees import StreamingRFC -from tests.data import Data +from tests.common.data_fixture import DataFixture -class PerformanceComparisons(Data): +class PerformanceComparisons(DataFixture): """ Compare srfc to benchmark rfc and logistic regression. TODO: Generalise naming, report functions. TODO: Set sensible parameters and add performance assets in child tests. """ + @classmethod def setUpClass(cls): """Prepare comparable models""" @@ -55,7 +56,6 @@ def _assert_same_n_rows(self): Assert that given the training settings, the rfc and srfc will, overall, see the same number of rows of data. This is a more direct comparison than len(mod.estimators_) as the individual estimators see much less data in the srfc case. - :return: """ # Will be available in actual test. n_rows = self.x_train.shape[0] @@ -66,8 +66,8 @@ def _assert_same_n_rows(self): int(n_rows / self.srfc_n_partial_fit_calls))) def _fit_srfc(self, - sequential: bool=True, - n_prop: float=0.1) -> StreamingRFC: + sequential: bool = True, + n_prop: float = 0.1) -> StreamingRFC: """ Fit the streaming RFC. Total number of rows used in training varies depending on sequential. 
@@ -100,7 +100,7 @@ def _fit_srfc(self, n_sample_rows = int(n_rows / self.srfc_n_partial_fit_calls) sidx = 0 eidx = n_sample_rows - for i in range(self.srfc_n_partial_fit_calls): + for _ in range(self.srfc_n_partial_fit_calls): idx = np.arange(sidx, eidx) srfc.partial_fit(self.x_train[idx, :], self.y_train[idx], classes=[0, 1]) @@ -109,7 +109,7 @@ def _fit_srfc(self, else: # Sample n_prop of data self.srfc_n_partial_fit_calls times n_sample_rows = int(n_rows * n_prop) - for i in range(self.srfc_n_partial_fit_calls): + for _ in range(self.srfc_n_partial_fit_calls): # Sample indexes with replacement idx = np.random.randint(0, n_rows, n_sample_rows) srfc.partial_fit(self.x_train[idx, :], self.y_train[idx], @@ -143,13 +143,6 @@ def test_benchmark_manual_random(self): print(f"self.rfc score test AUC: {self.rfc_test_auc}") print(f"self.srfc_sam score test AUC: {self.srfc_sam_test_auc}") - # self.assertTrue(math.isclose(self.srfc_sam_test_auc, self.rfc_test_auc, - # rel_tol=0.05)) - # self.assertTrue(math.isclose(self.srfc_sam_test_auc, self.log_reg_test_auc, - # rel_tol=0.05)) - - # self._assert_same_n_rows() - def test_benchmark_auto_spf(self): self._fit_with_spf() self.srfc_spf_report, self.srfc_spf_train_auc, self.srfc_spf_test_auc = self._mod_report(mod=self.srfc_spf) @@ -162,7 +155,7 @@ def test_benchmark_auto_spf(self): def test_benchmark_auto_dask(self): self._fit_with_dask() self.srfc_dask_report, self.srfc_dask_train_auc, self.srfc_dask_test_auc = \ - self._mod_report(mod=self.srfc_dask) + self._mod_report(mod=self.srfc_dask) print("==Auto feeding partial_fit with dask==") print(f"self.log_reg score test AUC: {self.log_reg_test_auc}") @@ -181,18 +174,11 @@ def test_benchmark_manual_sequential(self): print(f"self.rfc_once score test AUC: {self.rfc_once_test_auc}") print(f"self.srfc score test AUC: {self.srfc_seq_test_auc}") - # self.assertTrue(math.isclose(self.srfc_seq_test_auc, self.rfc_test_auc, - # rel_tol=0.05)) - # self.assertTrue(math.isclose(self.srfc_seq_test_auc, self.log_reg_test_auc, - # rel_tol=0.05)) - - # self._assert_same_n_rows() - def _generate_comparable_models(self, srfc_n_estimators_per_chunk: int, srfc_n_partial_fit_calls: int, srfc_sample_prop: float, - n_jobs: int=4): + n_jobs: int = 4): """ Set values for streaming models and different set ups. Create two comparable rfcs designed to see equivalent numbers of rows. 
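The next hunk touches `_generate_comparable_models()`, which also builds an "auto-spf" model. As I read the spf parameters used throughout these tests, `dask_feeding=False` switches the estimator to feeding itself: a single `.fit()` call performs `spf_n_fits` internal `partial_fit` passes, each over `spf_sample_prop` of the rows, so no manual loop is needed. A hedged sketch under that assumption (data and values illustrative):

````python
from sklearn.datasets import make_blobs

from incremental_trees.trees import StreamingRFC

# Stand-in data, not the benchmark fixture.
x, y = make_blobs(n_samples=10000, centers=2, n_features=40, random_state=0)

# Self-feeding ("spf") set-up: .fit() samples the data spf_n_fits times
# internally instead of requiring an external partial_fit loop.
srfc_spf = StreamingRFC(dask_feeding=False,
                        n_estimators_per_chunk=10,
                        spf_n_fits=30,        # internal partial_fit calls
                        spf_sample_prop=0.3,  # proportion of rows per call
                        n_jobs=4)
srfc_spf.fit(x, y)

# Total rows seen is then roughly n_rows * spf_n_fits * spf_sample_prop,
# which is what makes it comparable to an RFC fit once on all rows.
````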
@@ -242,7 +228,7 @@ def _generate_comparable_models(self, # "Auto-spf" srfc self.srfc_spf = StreamingRFC(dask_feeding=False, - n_estimators_per_chunk=self.srfc_n_estimators_per_chunk,\ + n_estimators_per_chunk=self.srfc_n_estimators_per_chunk, \ spf_n_fits=self.srfc_n_partial_fit_calls, spf_sample_prop=self.srfc_sample_prop, n_jobs=n_jobs) diff --git a/tests/integration/incremental_trees/test_trees_dask.py b/tests/integration/incremental_trees/test_trees_dask.py index 6b48129..123313e 100644 --- a/tests/integration/incremental_trees/test_trees_dask.py +++ b/tests/integration/incremental_trees/test_trees_dask.py @@ -3,11 +3,14 @@ import numpy as np from dask_ml.wrappers import Incremental -from incremental_trees.trees import StreamingEXTC, StreamingEXTR, StreamingRFC, StreamingRFR +from incremental_trees.models.classification.streaming_extc import StreamingEXTC +from incremental_trees.models.regression.streaming_extr import StreamingEXTR +from incremental_trees.models.regression.streaming_rfr import StreamingRFR +from incremental_trees.trees import StreamingRFC from tests.integration.base import DaskTests -class TestDaskModel_1(DaskTests, unittest.TestCase): +class TestDaskModel1(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -24,7 +27,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskModel_2(DaskTests, unittest.TestCase): +class TestDaskModel2(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -41,7 +44,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskModel_3(DaskTests, unittest.TestCase): +class TestDaskModel3(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -58,7 +61,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskModel_4(DaskTests, unittest.TestCase): +class TestDaskModel4(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -76,7 +79,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskModel_5(DaskTests, unittest.TestCase): +class TestDaskModel5(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -94,7 +97,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskModel_6(DaskTests, unittest.TestCase): +class TestDaskModel6(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -112,7 +115,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskModel_7(DaskTests, unittest.TestCase): +class TestDaskModel7(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -130,7 +133,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskModel_8(DaskTests, unittest.TestCase): +class TestDaskModel8(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -149,7 +152,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskRFC_1(DaskTests, unittest.TestCase): +class TestDaskRFC1(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -166,7 +169,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskRFC_2(DaskTests, unittest.TestCase): +class TestDaskRFC2(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -183,7 +186,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskRFC_3(DaskTests, unittest.TestCase): +class TestDaskRFC3(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): 
"""Set up model to test.""" @@ -200,7 +203,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskRFC_4(DaskTests, unittest.TestCase): +class TestDaskRFC4(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -218,7 +221,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskRFR_1(DaskTests, unittest.TestCase): +class TestDaskRFR1(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -236,7 +239,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskRFR_2(DaskTests, unittest.TestCase): +class TestDaskRFR2(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -254,7 +257,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskRFR_3(DaskTests, unittest.TestCase): +class TestDaskRFR3(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" @@ -272,7 +275,7 @@ def setUpClass(cls): super().setUpClass() -class TestDaskRFR_4(DaskTests, unittest.TestCase): +class TestDaskRFR4(DaskTests, unittest.TestCase): @classmethod def setUpClass(cls): """Set up model to test.""" diff --git a/tests/integration/incremental_trees/test_trees_grids.py b/tests/integration/incremental_trees/test_trees_grids.py index e62117b..4a0ac0e 100644 --- a/tests/integration/incremental_trees/test_trees_grids.py +++ b/tests/integration/incremental_trees/test_trees_grids.py @@ -1,88 +1,90 @@ -# TODO: These tests aren't finished. Need to generalise, add EXTC, regressors, etc. - -import unittest -import warnings -from incremental_trees.trees import StreamingRFC, StreamingEXTC -from sklearn.ensemble.forest import RandomForestClassifier, ExtraTreesClassifier -from sklearn.model_selection import RandomizedSearchCV - -from tests.data import Data -from tests.params import RFCGRID, SRFCGRID - - -class GridBenchmarks: - def test_fit_all(self): - """ - Fit grids and compare. - - TODO: Generalise naming. 
- """ - - with warnings.catch_warnings(): - warnings.simplefilter('ignore', FutureWarning) - self.rfc_grid.fit(self.x_train, self.y_train) - self.srfc_grid.fit(self.x_train, self.y_train) - - self.rfc_report, self.rfc_train_auc, self.rfc_test_auc = self._mod_report(mod=self.rfc_grid.best_estimator_) - self.srfc_report, self.srfc_train_auc, self.srfc_test_auc = self._mod_report(mod=self.srfc_grid.best_estimator_) - - print("==Not-necessarily fair grid comparison==") - print(f"self.rfc grid score test AUC: {self.rfc_test_auc}") - print(f"self.srfc grid score test AUC: {self.srfc_test_auc}") - print(self.srfc_grid.get_params()) - - -class RFCBenchmarkGrid(GridBenchmarks, Data, unittest.TestCase): - """ - Check a grid runs, assert performance (not added yet) - """ - - @classmethod - def setUpClass(cls): - cls._prep_data(cls) - - n_iter = 3 - cls.srfc_grid = RandomizedSearchCV(StreamingRFC(n_jobs=2, - verbose=1), - param_distributions=SRFCGRID, - scoring='roc_auc', - n_iter=n_iter * 2, - verbose=2, - n_jobs=3, - cv=4) - - cls.rfc_grid = RandomizedSearchCV(RandomForestClassifier(n_jobs=2), - param_distributions=RFCGRID, - scoring='roc_auc', - n_iter=n_iter, - verbose=2, - n_jobs=3, - cv=4) - - -class EXTCBenchmarkGrid(GridBenchmarks, Data, unittest.TestCase): - """ - Check a grid runs, assert performance (not added yet) - """ - - @classmethod - def setUpClass(cls): - cls._prep_data(cls) - - n_iter = 2 - cls.srfc_grid = RandomizedSearchCV(StreamingEXTC(n_jobs=2, - verbose=1), - param_distributions=SRFCGRID, - scoring='roc_auc', - n_iter=n_iter * 10, - verbose=2, - n_jobs=3, - cv=4) - - cls.rfc_grid = RandomizedSearchCV(ExtraTreesClassifier(n_jobs=2), - param_distributions=RFCGRID, - scoring='roc_auc', - n_iter=n_iter, - verbose=2, - n_jobs=3, - cv=4) +# TODO: These tests aren't finished. Need to generalise, add EXTC, regressors, etc. + +import unittest +import warnings + +from sklearn.ensemble.forest import RandomForestClassifier, ExtraTreesClassifier +from sklearn.model_selection import RandomizedSearchCV + +from incremental_trees.models.classification.streaming_extc import StreamingEXTC +from incremental_trees.trees import StreamingRFC +from tests.common.data_fixture import DataFixture +from tests.common.param_fixtures import RFCGRID, SRFCGRID + + +class GridBenchmarks: + def test_fit_all(self): + """ + Fit grids and compare. + + TODO: Generalise naming. 
+ """ + + with warnings.catch_warnings(): + warnings.simplefilter('ignore', FutureWarning) + self.rfc_grid.fit(self.x_train, self.y_train) + self.srfc_grid.fit(self.x_train, self.y_train) + + self.rfc_report, self.rfc_train_auc, self.rfc_test_auc = self._mod_report(mod=self.rfc_grid.best_estimator_) + self.srfc_report, self.srfc_train_auc, self.srfc_test_auc = self._mod_report(mod=self.srfc_grid.best_estimator_) + + print("==Not-necessarily fair grid comparison==") + print(f"self.rfc grid score test AUC: {self.rfc_test_auc}") + print(f"self.srfc grid score test AUC: {self.srfc_test_auc}") + print(self.srfc_grid.get_params()) + + +class RFCBenchmarkGrid(GridBenchmarks, DataFixture, unittest.TestCase): + """ + Check a grid runs, assert performance (not added yet) + """ + + @classmethod + def setUpClass(cls): + cls._prep_data(cls) + + n_iter = 3 + cls.srfc_grid = RandomizedSearchCV(StreamingRFC(n_jobs=2, + verbose=1), + param_distributions=SRFCGRID, + scoring='roc_auc', + n_iter=n_iter * 2, + verbose=2, + n_jobs=3, + cv=4) + + cls.rfc_grid = RandomizedSearchCV(RandomForestClassifier(n_jobs=2), + param_distributions=RFCGRID, + scoring='roc_auc', + n_iter=n_iter, + verbose=2, + n_jobs=3, + cv=4) + + +class EXTCBenchmarkGrid(GridBenchmarks, DataFixture, unittest.TestCase): + """ + Check a grid runs, assert performance (not added yet) + """ + + @classmethod + def setUpClass(cls): + cls._prep_data(cls) + + n_iter = 2 + cls.srfc_grid = RandomizedSearchCV(StreamingEXTC(n_jobs=2, + verbose=1), + param_distributions=SRFCGRID, + scoring='roc_auc', + n_iter=n_iter * 10, + verbose=2, + n_jobs=3, + cv=4) + + cls.rfc_grid = RandomizedSearchCV(ExtraTreesClassifier(n_jobs=2), + param_distributions=RFCGRID, + scoring='roc_auc', + n_iter=n_iter, + verbose=2, + n_jobs=3, + cv=4) diff --git a/tests/integration/incremental_trees/test_trees_inconsistent_classes.py b/tests/integration/incremental_trees/test_trees_inconsistent_classes.py index 4b77a5d..86dfd35 100644 --- a/tests/integration/incremental_trees/test_trees_inconsistent_classes.py +++ b/tests/integration/incremental_trees/test_trees_inconsistent_classes.py @@ -1,94 +1,95 @@ -import unittest - -import numpy as np -import pandas as pd - -from incremental_trees.trees import StreamingRFC, StreamingEXTC - - -class ClassConsistencyTests: - @classmethod - def setUpClass(cls): - data = pd.DataFrame({'a': (1, 2, 3, 4, 5), - 'b': (1, 2, 3, 4, 5), - 'c': (1, 2, 3, 4, 5), - 'target': (1, 1, 2, 2, 3)}) - - data = pd.concat((data, data), - axis=0).sort_values('target').reset_index(drop=True) - - cls.x = data[[c for c in data if c != 'target']] - cls.y = data['target'] - - def test_none_on_second_call(self): - # Fit with 2 classes - self.mod.partial_fit(self.x[0:6], self.y[0:6], - classes=np.array([1, 2, 3])) - self.mod.predict(self.x[0:6]) - - self.assertEqual(self.mod.n_classes_, 3) - self.assertListEqual(list(self.mod.classes_), [1, 2, 3]) - - # Fit with 3 classes - self.mod.partial_fit(self.x, self.y) - self.mod.predict(self.x) - - self.assertEqual(self.mod.n_classes_, 3) - self.assertListEqual(list(self.mod.classes_), [1, 2, 3]) - - def test_correct_on_second_call(self): - # Fit with 2 classes - self.mod.partial_fit(self.x[0:6], self.y[0:6], - classes=np.array([1, 2, 3])) - self.mod.predict(self.x[0:6]) - - self.assertEqual(self.mod.n_classes_, 3) - self.assertListEqual(list(self.mod.classes_), [1, 2, 3]) - - # Fit with 3 classes - self.mod.partial_fit(self.x, self.y, - classes=np.array([1, 2, 3])) - self.mod.predict(self.x) - - 
self.assertEqual(self.mod.n_classes_, 3) - self.assertListEqual(list(self.mod.classes_), [1, 2, 3]) - - def test_incorrect_on_second_call(self): - """Incorrect on second call - can happen when dask passes classes.""" - - # Fit with 3 classes - self.mod.partial_fit(self.x, self.y, - classes=np.array([1, 2, 3])) - self.mod.predict(self.x) - - self.assertEqual(self.mod.n_classes_, 3) - self.assertListEqual(list(self.mod.classes_), [1, 2, 3]) - - # Fit with 2 classes - self.mod.partial_fit(self.x[0:6], self.y[0:6], - classes=np.array([1, 2])) - self.mod.predict(self.x[0:6]) - - self.assertEqual(self.mod.n_classes_, 3) - self.assertListEqual(list(self.mod.classes_), [1, 2, 3]) - - self.mod.partial_fit(self.x[5:7], self.y[5:7], - classes=np.array([2, 3])) - self.mod.predict(self.x) - - self.assertEqual(self.mod.n_classes_, 3) - self.assertListEqual(list(self.mod.classes_), [1, 2, 3]) - - -class TestInconsistentClassesRFC(ClassConsistencyTests, unittest.TestCase): - def setUp(self): - self.mod = StreamingRFC(n_estimators_per_chunk=1, - max_n_estimators=np.inf, - verbose=2) - - -class TestInconsistentClassesEXT(ClassConsistencyTests, unittest.TestCase): - def setUp(self): - self.mod = StreamingEXTC(n_estimators_per_chunk=1, - max_n_estimators=np.inf, - verbose=2) +import unittest + +import numpy as np +import pandas as pd + +from incremental_trees.models.classification.streaming_extc import StreamingEXTC +from incremental_trees.trees import StreamingRFC + + +class ClassConsistencyTests: + @classmethod + def setUpClass(cls): + data = pd.DataFrame({'a': (1, 2, 3, 4, 5), + 'b': (1, 2, 3, 4, 5), + 'c': (1, 2, 3, 4, 5), + 'target': (1, 1, 2, 2, 3)}) + + data = pd.concat((data, data), + axis=0).sort_values('target').reset_index(drop=True) + + cls.x = data[[c for c in data if c != 'target']] + cls.y = data['target'] + + def test_none_on_second_call(self): + # Fit with 2 classes + self.mod.partial_fit(self.x[0:6], self.y[0:6], + classes=np.array([1, 2, 3])) + self.mod.predict(self.x[0:6]) + + self.assertEqual(self.mod.n_classes_, 3) + self.assertListEqual(list(self.mod.classes_), [1, 2, 3]) + + # Fit with 3 classes + self.mod.partial_fit(self.x, self.y) + self.mod.predict(self.x) + + self.assertEqual(self.mod.n_classes_, 3) + self.assertListEqual(list(self.mod.classes_), [1, 2, 3]) + + def test_correct_on_second_call(self): + # Fit with 2 classes + self.mod.partial_fit(self.x[0:6], self.y[0:6], + classes=np.array([1, 2, 3])) + self.mod.predict(self.x[0:6]) + + self.assertEqual(self.mod.n_classes_, 3) + self.assertListEqual(list(self.mod.classes_), [1, 2, 3]) + + # Fit with 3 classes + self.mod.partial_fit(self.x, self.y, + classes=np.array([1, 2, 3])) + self.mod.predict(self.x) + + self.assertEqual(self.mod.n_classes_, 3) + self.assertListEqual(list(self.mod.classes_), [1, 2, 3]) + + def test_incorrect_on_second_call(self): + """Incorrect on second call - can happen when dask passes classes.""" + + # Fit with 3 classes + self.mod.partial_fit(self.x, self.y, + classes=np.array([1, 2, 3])) + self.mod.predict(self.x) + + self.assertEqual(self.mod.n_classes_, 3) + self.assertListEqual(list(self.mod.classes_), [1, 2, 3]) + + # Fit with 2 classes + self.mod.partial_fit(self.x[0:6], self.y[0:6], + classes=np.array([1, 2])) + self.mod.predict(self.x[0:6]) + + self.assertEqual(self.mod.n_classes_, 3) + self.assertListEqual(list(self.mod.classes_), [1, 2, 3]) + + self.mod.partial_fit(self.x[5:7], self.y[5:7], + classes=np.array([2, 3])) + self.mod.predict(self.x) + + self.assertEqual(self.mod.n_classes_, 3) + 
self.assertListEqual(list(self.mod.classes_), [1, 2, 3]) + + +class TestInconsistentClassesRFC(ClassConsistencyTests, unittest.TestCase): + def setUp(self): + self.mod = StreamingRFC(n_estimators_per_chunk=1, + max_n_estimators=np.inf, + verbose=2) + + +class TestInconsistentClassesEXT(ClassConsistencyTests, unittest.TestCase): + def setUp(self): + self.mod = StreamingEXTC(n_estimators_per_chunk=1, + max_n_estimators=np.inf, + verbose=2)
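For reference, a minimal sketch of the contract these class-consistency tests pin down (illustrative data, similar in shape to the fixture's frame): declare the full class set on the first `partial_fit` call, and later chunks may then carry only a subset of classes, or a stale `classes` argument as Dask sometimes passes, without shrinking `classes_` or `n_classes_`.

````python
import numpy as np
import pandas as pd

from incremental_trees.trees import StreamingRFC

# Illustrative data: rows 0:6 contain classes 1 and 2 only,
# rows 4:10 contain classes 2 and 3 only.
x = pd.DataFrame({'a': range(10), 'b': range(10)})
y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 3, 3])

mod = StreamingRFC(n_estimators_per_chunk=1, max_n_estimators=np.inf)

# First call: declare all classes up front, even though this chunk
# only contains classes 1 and 2.
mod.partial_fit(x[0:6], y[0:6], classes=np.array([1, 2, 3]))

# A later call passing a subset (or nothing) should leave the forest's
# class bookkeeping at the full set.
mod.partial_fit(x[4:10], y[4:10], classes=np.array([2, 3]))

assert mod.n_classes_ == 3
assert list(mod.classes_) == [1, 2, 3]
````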