V0.4 (#7)
* Typo

* General tidy

* Address sonar issues

* Address sonar issues

* Update requirements

* General tidy
garethjns authored May 22, 2020
1 parent d27a70d commit 064ffa4
Showing 29 changed files with 895 additions and 835 deletions.
13 changes: 6 additions & 7 deletions README.md
@@ -1,4 +1,4 @@
-# Incremental trees v0.3.3
+# Incremental trees v0.4.0
![The overcomplicated tests are...](https://github.com/garethjns/IncrementalTrees/workflows/The%20overcomplicated%20tests%20are.../badge.svg)

Adds partial fit method to sklearn's forest estimators (currently RandomForestClassifier/Regressor and ExtraTreesClassifier/Regressor) to allow [incremental training](https://scikit-learn.org/0.15/modules/scaling_strategies.html) without being limited to a linear model. Works with or without [Dask-ml's Incremental](http://ml.dask.org/incremental.html).
@@ -13,10 +13,7 @@ Quick start:

1) Clone repo and build pip installable package.
````bash
-git clone https://github.com/garethjns/IncrementalTrees.git
-python -m pip install --upgrade pip setuptools wheel
-cd IncrementalTrees
-pip install .
+pip install incremental_trees
````


@@ -39,7 +36,7 @@ Feeds .partial_fit() with randomly sampled rows.
````python
import numpy as np
from sklearn.datasets import make_blobs
-from incremental_trees.trees import StreamingRFC
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC

# Generate some data in memory
x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40,
@@ -65,7 +62,7 @@ import dask_ml.datasets
from dask_ml.wrappers import Incremental
from dask.distributed import Client, LocalCluster
from dask import delayed
-from incremental_trees.trees import StreamingRFC
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC

# Generate some data out-of-core
x, y = dask_ml.datasets.make_blobs(n_samples=2e5, chunks=1e4, random_state=0,
@@ -138,6 +135,8 @@ srfc = StreamingRFC(n_estimators_per_chunk=1,
````

# Version history
+## v0.4
+- Refactor and tidy, try with new versions of Dask/sklearn
## v0.3.1-3
- Update Dask versions
## v0.3
Expand Down
2 changes: 1 addition & 1 deletion incremental_trees/__init__.py
@@ -1 +1 @@
-__version__ = '0.3.3'
+__version__ = '0.4.0'
Empty file.
27 changes: 27 additions & 0 deletions incremental_trees/add_ins/classifier_additions.py
@@ -0,0 +1,27 @@
from typing import List

import numpy as np

from incremental_trees.add_ins.forest_additions import ForestAdditions
from incremental_trees.add_ins.sklearn_overloads import _check_partial_fit_first_call


class ClassifierAdditions(ForestAdditions):
    """
    Additional functions specific to classifiers.
    """

    def _check_classes(self, classes: List[int]) -> None:
        """Set classes if they haven't been set yet, otherwise do nothing."""

        # Set classes for the forest (this only needs to be done once, not for each
        # individual tree - those are set by .fit() using the classes available in each subset).
        # Returns False if there's nothing to do; raises if classes_ is unset and classes=None.
        classes_need_setting = _check_partial_fit_first_call(self, classes)

        # If classes aren't set yet, set them now.
        if classes_need_setting:
            self.classes_ = np.array(classes)
            self.n_classes_ = len(classes)
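For illustration, a minimal sketch of the first-call behaviour, driving the private helper directly. The StreamingRFC import path is the one used in the README changes above; the constructor argument shown is illustrative.

````python
# First call sets classes_ / n_classes_ once; later calls are no-ops.
from incremental_trees.models.classification.streaming_rfc import StreamingRFC

srfc = StreamingRFC(n_estimators_per_chunk=1)

srfc._check_classes(classes=[0, 1, 2])  # First call: sets the attributes
print(srfc.classes_, srfc.n_classes_)   # -> [0 1 2] 3
srfc._check_classes(classes=[0, 1, 2])  # Subsequent call: nothing to do
````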
65 changes: 65 additions & 0 deletions incremental_trees/add_ins/classifier_overloads.py
@@ -0,0 +1,65 @@
import warnings
from typing import Union

import numpy as np
import pandas as pd

from incremental_trees.add_ins.forest_overloads import ForestOverloads


class ClassifierOverloads(ForestOverloads):
    """
    Overloaded methods specific to classifiers.
    """

    def predict_proba(self, x: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Call predict_proba on each tree and accumulate the results. This handles possibly
        inconsistent shapes between trees, but isn't parallelised.

        Cases where not all classes are present in the first or subsequent subsets need to be
        handled. For the RandomForestClassifier, tree predictions are averaged in the
        sklearn.ensemble.forest.accumulate_prediction function. This sums the output matrix
        with dimensions n rows x n classes, and fails if the class dimension differs.

        The class dimension is defined at the individual estimator level during the .fit()
        call, which sets the following attributes:
          - self.n_outputs_ = y.shape[1], which is then used by _validate_y_class_weight()
            (always called in .fit()) to set:
          - self.classes_
          - self.n_classes_

        The .predict() method (sklearn.tree.tree.BaseDecisionTree.predict()) sets the output
        shape using:
            # Classification
            if is_classifier(self):
                if self.n_outputs_ == 1:
                    return self.classes_.take(np.argmax(proba, axis=1), axis=0)
                else:
                    [Not considering this yet]

        :param x: Features to predict for.
        :return: Normalised class probabilities with shape (n rows, n classes).
        """
        # Prepare the expected output shape
        preds = np.zeros(shape=(x.shape[0], self.n_classes_),
                         dtype=np.float32)
        counts = np.zeros(shape=(x.shape[0], self.n_classes_),
                          dtype=np.int16)

        for e in self.estimators_:
            # Get the prediction from the tree
            est_preds = e.predict_proba(x)
            # Get the indexes of the classes present in this tree
            present_classes = e.classes_.astype(int)
            # Sum these into the correct output columns
            preds[:, present_classes] += est_preds
            counts[:, present_classes] += 1

        # Normalise predictions against counts
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            norm_prob = preds / counts

        # And remove NaNs (0/0) and infs (n/0)
        norm_prob[np.isnan(norm_prob) | np.isinf(norm_prob)] = 0

        return norm_prob
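A standalone sketch of the accumulate-and-normalise idea above, with two hypothetical trees that saw different class subsets (all values illustrative):

````python
import numpy as np

n_rows, n_classes = 2, 3
preds = np.zeros((n_rows, n_classes), dtype=np.float32)
counts = np.zeros((n_rows, n_classes), dtype=np.int16)

# Tree 1 saw classes [0, 1]; tree 2 saw classes [1, 2].
per_tree = [(np.array([0, 1]), np.array([[0.9, 0.1], [0.2, 0.8]])),
            (np.array([1, 2]), np.array([[0.3, 0.7], [0.6, 0.4]]))]

for present_classes, est_preds in per_tree:
    preds[:, present_classes] += est_preds  # Sum into the full class space
    counts[:, present_classes] += 1

norm_prob = preds / np.maximum(counts, 1)  # Guard against 0/0 for never-seen classes
print(norm_prob)  # Class 1's column is averaged over both trees
````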
83 changes: 83 additions & 0 deletions incremental_trees/add_ins/forest_additions.py
@@ -0,0 +1,83 @@
import time
from typing import Union

import numpy as np
import pandas as pd


class ForestAdditions:
    def partial_fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series],
                    classes: Union[list, np.ndarray] = None):
        """
        Fit the next chunk of trees using the given subset of X and y.

        This calls .fit(), which is overloaded; it flags pf_call=True so .fit() handles calling
        super().fit() instead of recursing.

        For classifiers:
          - The first call needs to be supplied with the expected classes (similar to existing
            models with .partial_fit()), in case not all classes are present in the first
            subset. This object sets classes_ and n_classes_ from the supplied classes; the
            individual trees set theirs from the data available in each subset. The
            predict_proba method is modified to standardise the output shape to the dimensions
            defined on this object.

        For regressors:
          - self._check_classes is overloaded with a dummy method.

        :param X: Data.
        :param y: Labels.
        :param classes: Expected classes; required on the first call for classifiers.
        """
        if self.verbose > 1:
            print(f"PF call with set classes: "
                  f"{getattr(self, 'classes_', '[no classes attr]')} and input classes {classes}")

        self._check_classes(classes=classes)

        # Fit the next estimator(s), if not done
        if self._fit_estimators < self.max_n_estimators:
            t0 = time.time()
            self.fit(X, y,
                     pf_call=True,
                     classes_=getattr(self, 'classes_', None))  # Pass classes for enforcement, if classifier.
            t1 = time.time()

            if self.verbose > 1:
                print(f"Fit estimators {self._fit_estimators} - "
                      f"{self._fit_estimators + self.n_estimators_per_chunk} / {self.max_n_estimators}")
                print(f"Model reports {len(self.estimators_)}")
                print(f"Fit time: {round(t1 - t0, 2)}")
            self._fit_estimators += self.n_estimators_per_chunk

            # If still not done, prep to fit the next chunk
            if self._fit_estimators < self.max_n_estimators:
                self.n_estimators += self.n_estimators_per_chunk

        else:
            if self.verbose > 0:
                print('Done')

        return self
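As a usage sketch, a chunked loop that drives .partial_fit() directly; the data and parameter values are illustrative, and the StreamingRFC import path is the one used in the README above.

````python
import numpy as np
from sklearn.datasets import make_blobs
from incremental_trees.models.classification.streaming_rfc import StreamingRFC

x, y = make_blobs(n_samples=10000, random_state=0, centers=2)
srfc = StreamingRFC(n_estimators_per_chunk=5)

# Feed each chunk to .partial_fit(), always passing the full expected classes.
for idx in np.array_split(np.arange(x.shape[0]), 10):
    srfc.partial_fit(x[idx, :], y[idx], classes=np.unique(y))
````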

    def _sampled_partial_fit(self,
                             x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]):
        """
        Feed partial_fit with random samples, based on the spf_ parameters. Used by .fit()
        when not using Dask.

        :param x: Data.
        :param y: Labels.
        """
        n_samples = int(self.spf_sample_prop * x.shape[0])

        for _ in range(self.spf_n_fits):
            idx = np.random.randint(0, x.shape[0], n_samples)

            if self.verbose > 0:
                print(f"_sampled_partial_fit size: {idx.shape}")

            self.partial_fit(x[idx, :], y[idx],
                             classes=np.unique(y))

        return self
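And a sketch of the sampled feeding route via .fit(), assuming spf_n_fits, spf_sample_prop and dask_feeding are exposed as constructor arguments (the attribute names above suggest so, but this is an assumption):

````python
from sklearn.datasets import make_blobs
from incremental_trees.models.classification.streaming_rfc import StreamingRFC

x, y = make_blobs(n_samples=10000, random_state=0, centers=2)

# With dask_feeding=False, .fit() delegates to _sampled_partial_fit(), which
# draws spf_n_fits random subsets of spf_sample_prop * n_rows rows each.
srfc = StreamingRFC(dask_feeding=False, spf_n_fits=10, spf_sample_prop=0.1)
srfc.fit(x, y)
````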
46 changes: 46 additions & 0 deletions incremental_trees/add_ins/forest_overloads.py
@@ -0,0 +1,46 @@
import numpy as np


class ForestOverloads:
    def set_params(self,
                   **kwargs):
        """
        Ensure warm_start is set to True; otherwise set params as usual.

        :param kwargs: Params to set.
        """
        # warm_start needs to be True for .fit() to keep existing estimators.
        kwargs['warm_start'] = True

        for key, value in kwargs.items():
            setattr(self, key, value)

        return self
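A quick illustration of the warm_start enforcement, reusing the StreamingRFC import path from elsewhere in this diff:

````python
from incremental_trees.models.classification.streaming_rfc import StreamingRFC

# Even an explicit warm_start=False is overridden by set_params().
srfc = StreamingRFC().set_params(n_estimators=5, warm_start=False)
print(srfc.warm_start)  # -> True
````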

    def fit(self, *args,
            pf_call: bool = False,
            classes_: np.ndarray = None):
        """
        Handle calling either super().fit() or _sampled_partial_fit(), depending on the caller.

        :param pf_call: True if called from partial_fit; in this case super().fit() is called
                        directly, instead of getting stuck in a recursive loop.
        :param classes_: On partial_fit calls, classes_ is passed from self.classes_, which
                         will already have been set. These are re-set after the call to super's
                         fit, which would otherwise change them based on the observed data.
        """
        if not self.dask_feeding and not pf_call:
            if self.verbose > 0:
                print('Feeding with spf')
            self._sampled_partial_fit(*args)

        else:
            if self.verbose > 0:
                print('Fitting from a partial_fit call')
            super().fit(*args)
            if classes_ is not None:
                self.classes_ = classes_
                self.n_classes_ = len(classes_)

        return self
9 changes: 9 additions & 0 deletions incremental_trees/add_ins/regressor_additions.py
@@ -0,0 +1,9 @@
from incremental_trees.add_ins.forest_additions import ForestAdditions


class RegressorAdditions(ForestAdditions):
    def _check_classes(self, **kwargs) -> None:
        """
        Don't need to check classes with the regressor.
        """
        pass
8 changes: 8 additions & 0 deletions incremental_trees/add_ins/regressor_overloads.py
@@ -0,0 +1,8 @@
from incremental_trees.add_ins.forest_overloads import ForestOverloads


class RegressorOverloads(ForestOverloads):
    """
    Nothing specific to overload for the regressors. Predict doesn't need to deal with classes.
    """
    pass
46 changes: 46 additions & 0 deletions incremental_trees/add_ins/sklearn_overloads.py
@@ -0,0 +1,46 @@
import warnings

import numpy as np
from sklearn.utils.multiclass import unique_labels


def _check_partial_fit_first_call(clf,
                                  classes=None):
    """
    Modified sklearn function: if classes are inconsistent on a later call, warn and reuse the
    previous classes, on the assumption that the first call's specification was correct.
    Don't raise an error.

    Private helper function for factorizing common classes param logic.

    Estimators that implement the ``partial_fit`` API need to be provided with the list of
    possible classes at the first call to partial_fit.

    Modification: subsequent calls to partial_fit warn, rather than raise, if ``classes`` is
    inconsistent with a previous value of ``clf.classes_``.

    This function returns True if it detects that this was the first call to ``partial_fit``
    on ``clf``. In that case the ``classes_`` attribute is also set on ``clf``.
    """

    if getattr(clf, 'classes_', None) is None and classes is None:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")

    elif classes is not None:
        if getattr(clf, 'classes_', None) is not None:
            if not np.array_equal(clf.classes_, unique_labels(classes)):
                # Don't error here: instead, use the previous classes setting, which must have
                # been correct when first set.
                warnings.warn("Classes differ on this call; ignoring them on the assumption "
                              "that the first call was correct.")
                return False

        else:
            # This is the first call to partial_fit
            clf.classes_ = unique_labels(classes)
            return True

    # classes is None and clf.classes_ has already been set previously: nothing to do
    return False
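A behaviour sketch for the modified helper, using a bare stand-in classifier (illustrative only):

````python
from incremental_trees.add_ins.sklearn_overloads import _check_partial_fit_first_call

class _Clf:
    classes_ = None  # Stand-in classifier with no classes set yet

clf = _Clf()
print(_check_partial_fit_first_call(clf, classes=[0, 1]))     # True: first call sets classes_
print(clf.classes_)                                           # -> [0 1]
print(_check_partial_fit_first_call(clf, classes=[0, 1, 2]))  # False, warns: mismatch ignored
print(_check_partial_fit_first_call(clf))                     # False: nothing to do
````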
Empty file.
Empty file.
