-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Typo * General tidy * Address sonar issues * Address sonar issues * Update requirements * General tidy
- Loading branch information
Showing
29 changed files
with
895 additions
and
835 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
__version__ = '0.3.3' | ||
__version__ = '0.4.0' |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from typing import List | ||
|
||
import numpy as np | ||
|
||
from incremental_trees.add_ins.forest_additions import ForestAdditions | ||
from incremental_trees.add_ins.sklearn_overloads import _check_partial_fit_first_call | ||
|
||
|
||
class ClassifierAdditions(ForestAdditions):
    """
    Additional functions specific to classifiers.
    """

    def _check_classes(self, classes: List[int]):
        """
        Set the forest-level classes on the first call, otherwise do nothing.

        Classes are set once for the forest as a whole, not for each individual tree.

        :param classes: Every class label the forest should expect. May be None once classes have been set by an
                        earlier call.
        :raises ValueError: (from _check_partial_fit_first_call) if classes have never been set and classes is None.
        """
        # True only when this is the first call and classes were supplied. Errors if classes are neither already
        # set nor supplied; returns False when there is nothing to do.
        classes_need_setting = _check_partial_fit_first_call(self, classes)

        if classes_need_setting:
            # Store sorted, de-duplicated labels. Keeping the raw input here would make n_classes_ count duplicates
            # and would make the helper's later comparison against the unique labels report a spurious mismatch
            # whenever the caller passed unsorted or repeated classes.
            self.classes_ = np.unique(classes)
            self.n_classes_ = len(self.classes_)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import warnings | ||
from typing import Union | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from incremental_trees.add_ins.forest_overloads import ForestOverloads | ||
|
||
|
||
class ClassifierOverloads(ForestOverloads):
    """
    Classifier-specific replacements for forest methods.
    """

    def predict_proba(self, x: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """
        Average the per-tree class probabilities into an array with one column per forest-level class.

        Each tree may have been fit on a subset that did not contain every class, so its predict_proba output can
        have fewer columns than the forest expects. Rather than summing the per-tree outputs directly (which
        requires a consistent class dimension), each tree's output is added into the columns given by that tree's
        own classes_, and a per-cell count of contributing trees is kept so the sum can be turned into a mean.
        This runs as a plain sequential loop over the estimators.

        Cells that no tree contributed to end up as 0.

        :param x: Rows to predict for; only x.shape[0] is read here, x itself is passed to each tree.
        :return: Array of shape (x.shape[0], self.n_classes_) holding the averaged probabilities.
        """
        # Both accumulators share the forest-level output shape.
        out_shape = (x.shape[0], self.n_classes_)
        summed = np.zeros(shape=out_shape, dtype=np.float32)
        n_votes = np.zeros(shape=out_shape, dtype=np.int16)

        for tree in self.estimators_:
            tree_proba = tree.predict_proba(x)
            # The tree's class labels, cast to int, are used directly as output column indexes.
            cols = tree.classes_.astype(int)
            summed[:, cols] += tree_proba
            n_votes[:, cols] += 1

        # Dividing by a zero count is expected for columns no tree covered; silence the resulting warnings.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            averaged = summed / n_votes

        # Replace nan (0/0) and inf (n/0) results with 0.
        averaged[~np.isfinite(averaged)] = 0

        return averaged
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
import time | ||
from typing import Union | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
class ForestAdditions:
    """
    Incremental-fitting helpers shared by the classifier and regressor forests.

    The methods here read attributes that are not defined in this class (verbose, max_n_estimators,
    n_estimators_per_chunk, n_estimators, _fit_estimators, spf_sample_prop, spf_n_fits, estimators_) and call
    self.fit and self._check_classes, so they must be supplied by the class this is combined with.
    """

    def partial_fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series],
                    classes: Union[list, np.ndarray, None] = None):
        """
        Fit the next chunk of estimators using the given subset of X and y.

        This calls .fit, which is overloaded. However flags pf_call=True, so .fit() will handle calling super .fit().

        For classifiers:
         - First call needs to be supplied with the expected classes (similar to existing models with
           .partial_fit()) in case not all classes are present in the first subset.
           This object sets classes_ and n_classes_ depending on the supplied classes. The individual trees set
           theirs depending on the data available in the subset. The predict_proba method is modified to
           standardise shape to the dimensions defined in this object.
        For regressors:
         - self._check_classes is overloaded with dummy method.

        :param X: Features for this subset.
        :param y: Targets for this subset.
        :param classes: Expected classes, handed to self._check_classes.
        :return: self.
        """
        if self.verbose > 1:
            print(f"PF Call with set classes: "
                  f"{getattr(self, 'classes_', '[no classes attr]')} and input classes {classes}")

        self._check_classes(classes=classes)

        # Fit the next estimator, if not done
        if self._fit_estimators < self.max_n_estimators:
            t0 = time.time()
            self.fit(X, y,
                     pf_call=True,
                     classes_=getattr(self, 'classes_', None))  # Pass classes for enforcement, if classifier.
            t1 = time.time()

            if self.verbose > 1:
                print(f"Fit estimators {self._fit_estimators} - {self._fit_estimators + self.n_estimators_per_chunk} "
                      f"/ {self.max_n_estimators}")
                print(f"Model reports {len(self.estimators_)}")
                print(f"Fit time: {round(t1 - t0, 2)}")
                print(len(self.estimators_))
            self._fit_estimators += self.n_estimators_per_chunk

            # If still not done, prep to fit next
            if self._fit_estimators < self.max_n_estimators:
                self.n_estimators += self.n_estimators_per_chunk

        else:
            if self.verbose > 0:
                print('Done')

        return self

    def _sampled_partial_fit(self,
                             x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]):
        """
        This feeds partial_fit with random samples based on the spf_ parameters. Used by .fit() when not using dask.

        Rows are drawn with replacement: each of the spf_n_fits rounds samples
        int(spf_sample_prop * x.shape[0]) row positions uniformly at random.

        :param x: Data.
        :param y: Labels.
        :return: self.
        """
        n_rows = x.shape[0]
        n_samples = int(self.spf_sample_prop * n_rows)

        # The full label set does not change between rounds, so compute it once.
        all_classes = np.unique(y)

        for _ in range(self.spf_n_fits):
            idx = np.random.randint(0, n_rows, n_samples)

            if self.verbose > 0:
                print(f"_sampled_partial_fit size: {idx.shape}")

            # idx holds row positions. pandas objects need .iloc for positional selection; plain [] indexing on
            # them is label-based (Series) or unsupported for a (rows, cols) tuple (DataFrame).
            x_sample = x.iloc[idx] if isinstance(x, (pd.DataFrame, pd.Series)) else x[idx, :]
            y_sample = y.iloc[idx] if isinstance(y, (pd.DataFrame, pd.Series)) else y[idx]

            self.partial_fit(x_sample, y_sample,
                             classes=all_classes)

        return self
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import numpy as np | ||
|
||
|
||
class ForestOverloads:
    """
    Replacements for set_params and fit shared by the incremental forests.
    """

    def set_params(self,
                   **kwargs):
        """
        Assign the given parameters as attributes, always forcing warm_start to True.

        :param kwargs: Params to set.
        :return: self.
        """
        # warm_start has to stay on so that .fit() keeps the estimators already built.
        forced = {**kwargs, 'warm_start': True}

        for name, val in forced.items():
            setattr(self, name, val)

        return self

    def fit(self, *args,
            pf_call: bool = False,
            classes_: np.ndarray = None):
        """
        Route a fit request either to the parent class's fit or to sampled partial fitting.

        :param pf_call: True when invoked from partial_fit; the parent's fit is then called directly, which avoids
                        looping back into partial_fit.
        :param classes_: On pf calls, the classes already stored on this object. They are written back (together
                         with n_classes_) after the parent's fit has run.
        :return: self.
        """
        if self.dask_feeding or pf_call:
            if self.verbose > 0:
                print('Fitting from a partial_fit call')
            super().fit(*args)
            # Restore the forest-level classes if the caller supplied them.
            if classes_ is not None:
                self.classes_ = classes_
                self.n_classes_ = len(classes_)
        else:
            if self.verbose > 0:
                print('Feeding with spf')
            self._sampled_partial_fit(*args)

        return self
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
from incremental_trees.add_ins.forest_additions import ForestAdditions | ||
|
||
|
||
class RegressorAdditions(ForestAdditions):
    """
    Additional functions specific to regressors.
    """

    def _check_classes(self, **kwargs) -> None:
        """
        Deliberate no-op: a regressor has no classes to check, so any keyword arguments are ignored.
        """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
from incremental_trees.add_ins.forest_overloads import ForestOverloads | ||
|
||
|
||
class RegressorOverloads(ForestOverloads):
    """
    Regressors need no overloads of their own: prediction does not involve classes, so everything is inherited
    unchanged from ForestOverloads.
    """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import warnings | ||
|
||
import numpy as np | ||
from sklearn.utils.multiclass import unique_labels | ||
|
||
|
||
def _check_partial_fit_first_call(clf,
                                  classes=None):
    """
    Modified sklearn function. If classes are inconsistent on a later call, warn and reuse the previous classes
    on the assumption that the first call's specification was correct. Don't raise an error.

    Private helper function for factorizing common classes param logic.

    Estimators that implement the ``partial_fit`` API need to be provided with
    the list of possible classes at the first call to partial_fit.

    Modification:
    On subsequent calls, a ``classes`` value that is inconsistent with the existing
    ``clf.classes_`` only produces a warning; ``clf.classes_`` is left unchanged.

    :param clf: Estimator whose ``classes_`` attribute is read and, on the first call, set.
    :param classes: Possible class labels, or None.
    :return: True if this was detected as the first call to ``partial_fit`` on ``clf`` (in which case
             ``clf.classes_`` is set to the unique labels of ``classes``), otherwise False.
    :raises ValueError: If ``clf.classes_`` is unset (or None) and ``classes`` is None.
    """
    previous_classes = getattr(clf, 'classes_', None)

    if classes is None:
        if previous_classes is None:
            raise ValueError("classes must be passed on the first call "
                             "to partial_fit.")
        # classes is None and clf.classes_ has already previously been set:
        # nothing to do
        return False

    if previous_classes is None:
        # This is the first call to partial_fit
        clf.classes_ = unique_labels(classes)
        return True

    if not np.array_equal(previous_classes, unique_labels(classes)):
        # Don't error here:
        # Instead, use the previous classes setting, which must be correct on first setting
        warnings.warn("Classes differ on this call, ignoring on the assumption first call was correct.",
                      stacklevel=2)
    return False
Empty file.
Empty file.
Oops, something went wrong.