V0.4 (#7)
* Typo

* General tidy

* Address sonar issues

* Address sonar issues

* Update requirements

* General tidy
garethjns authored May 22, 2020
1 parent d27a70d commit 064ffa4
Showing 29 changed files with 895 additions and 835 deletions.
13 changes: 6 additions & 7 deletions README.md
@@ -1,4 +1,4 @@
-# Incremental trees v0.3.3
+# Incremental trees v0.4.0
![The overcomplicated tests are...](https://github.com/garethjns/IncrementalTrees/workflows/The%20overcomplicated%20tests%20are.../badge.svg)

Adds partial fit method to sklearn's forest estimators (currently RandomForestClassifier/Regressor and ExtraTreesClassifier/Regressor) to allow [incremental training](https://scikit-learn.org/0.15/modules/scaling_strategies.html) without being limited to a linear model. Works with or without [Dask-ml's Incremental](http://ml.dask.org/incremental.html).
@@ -13,10 +13,7 @@ Quick start:

1) Clone repo and build pip installable package.
````bash
-git clone https://github.com/garethjns/IncrementalTrees.git
-python -m pip install --upgrade pip setuptools wheel
-cd IncrementalTrees
-pip install .
+pip install incremental_trees
````


@@ -39,7 +36,7 @@ Feeds .partial_fit() with randomly sampled rows.
````python
import numpy as np
from sklearn.datasets import make_blobs
-from incremental_trees.trees import StreamingRFC
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC

# Generate some data in memory
x, y = make_blobs(n_samples=int(2e5), random_state=0, n_features=40,
@@ -65,7 +62,7 @@ import dask_ml.datasets
from dask_ml.wrappers import Incremental
from dask.distributed import Client, LocalCluster
from dask import delayed
-from incremental_trees.trees import StreamingRFC
+from incremental_trees.models.classification.streaming_rfc import StreamingRFC

# Generate some data out-of-core
x, y = dask_ml.datasets.make_blobs(n_samples=2e5, chunks=1e4, random_state=0,
@@ -138,6 +135,8 @@ srfc = StreamingRFC(n_estimators_per_chunk=1,
````

# Version history
+## v0.4
+- Refactor and tidy, try with new versions of Dask/sklearn
## v0.3.1-3
- Update Dask versions
## v0.3
Expand Down
2 changes: 1 addition & 1 deletion incremental_trees/__init__.py
@@ -1 +1 @@
-__version__ = '0.3.3'
+__version__ = '0.4.0'
Empty file.
27 changes: 27 additions & 0 deletions incremental_trees/add_ins/classifier_additions.py
@@ -0,0 +1,27 @@
from typing import List

import numpy as np

from incremental_trees.add_ins.forest_additions import ForestAdditions
from incremental_trees.add_ins.sklearn_overloads import _check_partial_fit_first_call


class ClassifierAdditions(ForestAdditions):
    """
    Additional functions specific to classifiers.
    """

    def _check_classes(self, classes: List[int]) -> None:
        """Set classes if they haven't been set yet, otherwise do nothing."""

        # Set classes for the forest (this only needs to be done once, not for each
        # individual tree - those are set by .fit() using the classes available in each subset).
        # Returns False if there's nothing to do; raises if classes_ is unset and classes=None.
        classes_need_setting = _check_partial_fit_first_call(self, classes)

        # If classes aren't set yet, set them now.
        if classes_need_setting:
            self.classes_ = np.array(classes)
            self.n_classes_ = len(classes)
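For illustration, a minimal sketch of the first-call behaviour, driving the private helper directly. The StreamingRFC import path is the one used in the README changes above; the constructor argument shown is illustrative.

````python
# First call sets classes_ / n_classes_ once; later calls are no-ops.
from incremental_trees.models.classification.streaming_rfc import StreamingRFC

srfc = StreamingRFC(n_estimators_per_chunk=1)

srfc._check_classes(classes=[0, 1, 2])  # First call: sets the attributes
print(srfc.classes_, srfc.n_classes_)   # -> [0 1 2] 3
srfc._check_classes(classes=[0, 1, 2])  # Subsequent call: nothing to do
````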
65 changes: 65 additions & 0 deletions incremental_trees/add_ins/classifier_overloads.py
@@ -0,0 +1,65 @@
import warnings
from typing import Union

import numpy as np
import pandas as pd

from incremental_trees.add_ins.forest_overloads import ForestOverloads


class ClassifierOverloads(ForestOverloads):
    """
    Overloaded methods specific to classifiers.
    """

    def predict_proba(self, x: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Call predict_proba on each tree and accumulate the results. This handles possibly
        inconsistent shapes between trees, but isn't parallelised.

        Cases where not all classes are present in the first or subsequent subsets need to be
        handled. For the RandomForestClassifier, tree predictions are averaged in the
        sklearn.ensemble.forest.accumulate_prediction function. This sums the output matrix
        with dimensions n rows x n classes, and fails if the class dimension differs.

        The class dimension is defined at the individual estimator level during the .fit()
        call, which sets the following attributes:
          - self.n_outputs_ = y.shape[1], which is then used by _validate_y_class_weight()
            (always called in .fit()) to set:
          - self.classes_
          - self.n_classes_

        The .predict() method (sklearn.tree.tree.BaseDecisionTree.predict()) sets the output
        shape using:
            # Classification
            if is_classifier(self):
                if self.n_outputs_ == 1:
                    return self.classes_.take(np.argmax(proba, axis=1), axis=0)
                else:
                    [Not considering this yet]

        :param x: Features to predict for.
        :return: Normalised class probabilities with shape (n rows, n classes).
        """
        # Prepare the expected output shape
        preds = np.zeros(shape=(x.shape[0], self.n_classes_),
                         dtype=np.float32)
        counts = np.zeros(shape=(x.shape[0], self.n_classes_),
                          dtype=np.int16)

        for e in self.estimators_:
            # Get the prediction from the tree
            est_preds = e.predict_proba(x)
            # Get the indexes of the classes present in this tree
            present_classes = e.classes_.astype(int)
            # Sum these into the correct output columns
            preds[:, present_classes] += est_preds
            counts[:, present_classes] += 1

        # Normalise predictions against counts
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            norm_prob = preds / counts

        # And remove NaNs (0/0) and infs (n/0)
        norm_prob[np.isnan(norm_prob) | np.isinf(norm_prob)] = 0

        return norm_prob
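A standalone sketch of the accumulate-and-normalise idea above, with two hypothetical trees that saw different class subsets (all values illustrative):

````python
import numpy as np

n_rows, n_classes = 2, 3
preds = np.zeros((n_rows, n_classes), dtype=np.float32)
counts = np.zeros((n_rows, n_classes), dtype=np.int16)

# Tree 1 saw classes [0, 1]; tree 2 saw classes [1, 2].
per_tree = [(np.array([0, 1]), np.array([[0.9, 0.1], [0.2, 0.8]])),
            (np.array([1, 2]), np.array([[0.3, 0.7], [0.6, 0.4]]))]

for present_classes, est_preds in per_tree:
    preds[:, present_classes] += est_preds  # Sum into the full class space
    counts[:, present_classes] += 1

norm_prob = preds / np.maximum(counts, 1)  # Guard against 0/0 for never-seen classes
print(norm_prob)  # Class 1's column is averaged over both trees
````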
83 changes: 83 additions & 0 deletions incremental_trees/add_ins/forest_additions.py
@@ -0,0 +1,83 @@
import time
from typing import Union

import numpy as np
import pandas as pd


class ForestAdditions:
    def partial_fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series],
                    classes: Union[list, np.ndarray] = None):
        """
        Fit the next chunk of trees using the given subset of X and y.

        This calls .fit(), which is overloaded; it flags pf_call=True so .fit() handles calling
        super().fit() instead of recursing.

        For classifiers:
          - The first call needs to be supplied with the expected classes (similar to existing
            models with .partial_fit()), in case not all classes are present in the first
            subset. This object sets classes_ and n_classes_ from the supplied classes; the
            individual trees set theirs from the data available in each subset. The
            predict_proba method is modified to standardise the output shape to the dimensions
            defined on this object.

        For regressors:
          - self._check_classes is overloaded with a dummy method.

        :param X: Data.
        :param y: Labels.
        :param classes: Expected classes; required on the first call for classifiers.
        """
        if self.verbose > 1:
            print(f"PF call with set classes: "
                  f"{getattr(self, 'classes_', '[no classes attr]')} and input classes {classes}")

        self._check_classes(classes=classes)

        # Fit the next estimator(s), if not done
        if self._fit_estimators < self.max_n_estimators:
            t0 = time.time()
            self.fit(X, y,
                     pf_call=True,
                     classes_=getattr(self, 'classes_', None))  # Pass classes for enforcement, if classifier.
            t1 = time.time()

            if self.verbose > 1:
                print(f"Fit estimators {self._fit_estimators} - "
                      f"{self._fit_estimators + self.n_estimators_per_chunk} / {self.max_n_estimators}")
                print(f"Model reports {len(self.estimators_)}")
                print(f"Fit time: {round(t1 - t0, 2)}")
            self._fit_estimators += self.n_estimators_per_chunk

            # If still not done, prep to fit the next chunk
            if self._fit_estimators < self.max_n_estimators:
                self.n_estimators += self.n_estimators_per_chunk

        else:
            if self.verbose > 0:
                print('Done')

        return self
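As a usage sketch, a chunked loop that drives .partial_fit() directly; the data and parameter values are illustrative, and the StreamingRFC import path is the one used in the README above.

````python
import numpy as np
from sklearn.datasets import make_blobs
from incremental_trees.models.classification.streaming_rfc import StreamingRFC

x, y = make_blobs(n_samples=10000, random_state=0, centers=2)
srfc = StreamingRFC(n_estimators_per_chunk=5)

# Feed each chunk to .partial_fit(), always passing the full expected classes.
for idx in np.array_split(np.arange(x.shape[0]), 10):
    srfc.partial_fit(x[idx, :], y[idx], classes=np.unique(y))
````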

    def _sampled_partial_fit(self,
                             x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]):
        """
        Feed partial_fit with random samples, based on the spf_ parameters. Used by .fit()
        when not using Dask.

        :param x: Data.
        :param y: Labels.
        """
        n_samples = int(self.spf_sample_prop * x.shape[0])

        for _ in range(self.spf_n_fits):
            idx = np.random.randint(0, x.shape[0], n_samples)

            if self.verbose > 0:
                print(f"_sampled_partial_fit size: {idx.shape}")

            self.partial_fit(x[idx, :], y[idx],
                             classes=np.unique(y))

        return self
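And a sketch of the sampled feeding route via .fit(), assuming spf_n_fits, spf_sample_prop and dask_feeding are exposed as constructor arguments (the attribute names above suggest so, but this is an assumption):

````python
from sklearn.datasets import make_blobs
from incremental_trees.models.classification.streaming_rfc import StreamingRFC

x, y = make_blobs(n_samples=10000, random_state=0, centers=2)

# With dask_feeding=False, .fit() delegates to _sampled_partial_fit(), which
# draws spf_n_fits random subsets of spf_sample_prop * n_rows rows each.
srfc = StreamingRFC(dask_feeding=False, spf_n_fits=10, spf_sample_prop=0.1)
srfc.fit(x, y)
````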
46 changes: 46 additions & 0 deletions incremental_trees/add_ins/forest_overloads.py
@@ -0,0 +1,46 @@
import numpy as np


class ForestOverloads:
    def set_params(self,
                   **kwargs):
        """
        Ensure warm_start is set to True; otherwise set params as usual.

        :param kwargs: Params to set.
        """
        # warm_start needs to be True for .fit() to keep existing estimators.
        kwargs['warm_start'] = True

        for key, value in kwargs.items():
            setattr(self, key, value)

        return self
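A quick illustration of the warm_start enforcement, reusing the StreamingRFC import path from elsewhere in this diff:

````python
from incremental_trees.models.classification.streaming_rfc import StreamingRFC

# Even an explicit warm_start=False is overridden by set_params().
srfc = StreamingRFC().set_params(n_estimators=5, warm_start=False)
print(srfc.warm_start)  # -> True
````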

    def fit(self, *args,
            pf_call: bool = False,
            classes_: np.ndarray = None):
        """
        Handle calling either super().fit() or _sampled_partial_fit(), depending on the caller.

        :param pf_call: True if called from partial_fit; in this case super().fit() is called
                        directly, instead of getting stuck in a recursive loop.
        :param classes_: On partial_fit calls, classes_ is passed from self.classes_, which
                         will already have been set. These are re-set after the call to super's
                         fit, which would otherwise change them based on the observed data.
        """
        if not self.dask_feeding and not pf_call:
            if self.verbose > 0:
                print('Feeding with spf')
            self._sampled_partial_fit(*args)

        else:
            if self.verbose > 0:
                print('Fitting from a partial_fit call')
            super().fit(*args)
            if classes_ is not None:
                self.classes_ = classes_
                self.n_classes_ = len(classes_)

        return self
9 changes: 9 additions & 0 deletions incremental_trees/add_ins/regressor_additions.py
@@ -0,0 +1,9 @@
from incremental_trees.add_ins.forest_additions import ForestAdditions


class RegressorAdditions(ForestAdditions):
    def _check_classes(self, **kwargs) -> None:
        """
        Don't need to check classes with the regressor.
        """
        pass
8 changes: 8 additions & 0 deletions incremental_trees/add_ins/regressor_overloads.py
@@ -0,0 +1,8 @@
from incremental_trees.add_ins.forest_overloads import ForestOverloads


class RegressorOverloads(ForestOverloads):
    """
    Nothing specific to overload for the regressors. Predict doesn't need to deal with classes.
    """
    pass
46 changes: 46 additions & 0 deletions incremental_trees/add_ins/sklearn_overloads.py
@@ -0,0 +1,46 @@
import warnings

import numpy as np
from sklearn.utils.multiclass import unique_labels


def _check_partial_fit_first_call(clf,
                                  classes=None):
    """
    Modified sklearn function: if classes are inconsistent on a later call, warn and reuse the
    previous classes, on the assumption that the first call's specification was correct.
    Don't raise an error.

    Private helper function for factorizing common classes param logic.

    Estimators that implement the ``partial_fit`` API need to be provided with the list of
    possible classes at the first call to partial_fit.

    Modification: subsequent calls to partial_fit warn, rather than raise, if ``classes`` is
    inconsistent with a previous value of ``clf.classes_``.

    This function returns True if it detects that this was the first call to ``partial_fit``
    on ``clf``. In that case the ``classes_`` attribute is also set on ``clf``.
    """

    if getattr(clf, 'classes_', None) is None and classes is None:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")

    elif classes is not None:
        if getattr(clf, 'classes_', None) is not None:
            if not np.array_equal(clf.classes_, unique_labels(classes)):
                # Don't error here: instead, use the previous classes setting, which must have
                # been correct when first set.
                warnings.warn("Classes differ on this call; ignoring them on the assumption "
                              "that the first call was correct.")
                return False

        else:
            # This is the first call to partial_fit
            clf.classes_ = unique_labels(classes)
            return True

    # classes is None and clf.classes_ has already been set previously: nothing to do
    return False
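A behaviour sketch for the modified helper, using a bare stand-in classifier (illustrative only):

````python
from incremental_trees.add_ins.sklearn_overloads import _check_partial_fit_first_call

class _Clf:
    classes_ = None  # Stand-in classifier with no classes set yet

clf = _Clf()
print(_check_partial_fit_first_call(clf, classes=[0, 1]))     # True: first call sets classes_
print(clf.classes_)                                           # -> [0 1]
print(_check_partial_fit_first_call(clf, classes=[0, 1, 2]))  # False, warns: mismatch ignored
print(_check_partial_fit_first_call(clf))                     # False: nothing to do
````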
Empty file.
Empty file.
