Failed attempt at ordered gradients/hessians #87

Closed
benchmarks/bench_higgs_boson.py (2 changes: 1 addition & 1 deletion)
@@ -85,7 +85,7 @@ def load_data():
     max_leaf_nodes=n_leaf_nodes,
     n_iter_no_change=None,
     random_state=0,
-    verbose=1)
+    verbose=1, parallel_splitting=False)
 pygbm_model.fit(data_train, target_train)
 toc = time()
 predicted_test = pygbm_model.predict(data_test)
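To see what the new flag changes in practice, a minimal A/B timing sketch (the toy data and iteration counts are made up; only the `parallel_splitting` parameter comes from this PR):

```python
from time import time

import numpy as np
from pygbm import GradientBoostingRegressor

# Toy data standing in for the Higgs boson set used by the benchmark.
rng = np.random.RandomState(0)
X = rng.uniform(size=(10_000, 8)).astype(np.float32)
y = rng.uniform(size=10_000).astype(np.float32)

for parallel_splitting in (True, False):
    model = GradientBoostingRegressor(max_iter=10, random_state=0,
                                      parallel_splitting=parallel_splitting)
    tic = time()
    model.fit(X, y)
    print(f'parallel_splitting={parallel_splitting}: {time() - tic:.3f}s')
```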
pygbm/gradient_boosting.py (30 changes: 18 additions & 12 deletions)
@@ -26,7 +26,7 @@ class BaseGradientBoostingMachine(BaseEstimator, ABC):
     def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes,
                  max_depth, min_samples_leaf, l2_regularization, max_bins,
                  scoring, validation_split, n_iter_no_change, tol, verbose,
-                 random_state):
+                 random_state, parallel_splitting):
         self.loss = loss
         self.learning_rate = learning_rate
         self.max_iter = max_iter
@@ -41,6 +41,7 @@ def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes,
         self.tol = tol
         self.verbose = verbose
         self.random_state = random_state
+        self.parallel_splitting = parallel_splitting

     def _validate_parameters(self):
         """Validate parameters passed to __init__.
@@ -148,11 +149,14 @@ def fit(self, X, y):
         # Subsample the training set for score-based monitoring.
         if do_early_stopping:
             subsample_size = 10000
-            indices = np.arange(X_binned_train.shape[0])
-            if X_binned_train.shape[0] > subsample_size:
-                indices = rng.choice(indices, subsample_size)
-            X_binned_small_train = X_binned_train[indices]
-            y_small_train = y_train[indices]
+            n_samples_train = X_binned_train.shape[0]
+            if n_samples_train > subsample_size:
+                indices = rng.choice(X_binned_train.shape[0], subsample_size)
+                X_binned_small_train = X_binned_train[indices]
+                y_small_train = y_train[indices]
+            else:
+                X_binned_small_train = X_binned_train
+                y_small_train = y_train
             # Predicting is faster on C-contiguous arrays.
             X_binned_small_train = np.ascontiguousarray(X_binned_small_train)

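The refactor keeps the fancy indexing inside the `if` branch, so when the training set is already small no indexing copy is made at all. One detail worth flagging: `rng.choice` draws with replacement by default, before and after this change. A standalone sketch of the new logic, with a hypothetical helper name:

```python
import numpy as np

def subsample_for_scoring(X_binned_train, y_train, rng, subsample_size=10000):
    # Hypothetical helper mirroring the fit() logic above.
    if X_binned_train.shape[0] > subsample_size:
        # Note: rng.choice samples with replacement unless replace=False is given.
        indices = rng.choice(X_binned_train.shape[0], subsample_size)
        X_small, y_small = X_binned_train[indices], y_train[indices]
    else:
        X_small, y_small = X_binned_train, y_train
    # Predicting is faster on C-contiguous arrays, hence the cast.
    return np.ascontiguousarray(X_small), y_small
```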
@@ -218,14 +222,15 @@ def fit(self, X, y):
             # whole array.

             grower = TreeGrower(
-                X_binned_train, gradients_at_k, hessians_at_k,
+                X_binned_train, gradients_at_k.copy(), hessians_at_k,
                 max_bins=self.max_bins,
                 n_bins_per_feature=self.bin_mapper_.n_bins_per_feature_,
                 max_leaf_nodes=self.max_leaf_nodes,
                 max_depth=self.max_depth,
                 min_samples_leaf=self.min_samples_leaf,
                 l2_regularization=self.l2_regularization,
-                shrinkage=self.learning_rate)
+                shrinkage=self.learning_rate,
+                parallel_splitting=self.parallel_splitting)
             grower.grow()

             acc_apply_split_time += grower.total_apply_split_time
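The switch to `gradients_at_k.copy()` is presumably required because the grower now reorders its gradient buffer in place at each split: handing it a view into the loss's per-class gradient array would silently reorder that array too. A toy demonstration of the write-through hazard a view would cause:

```python
import numpy as np

gradients = np.arange(12, dtype=np.float32).reshape(3, 4)  # (n_classes, n_samples)

row_view = gradients[1]         # a view: reordering it mutates `gradients`
row_copy = gradients[1].copy()  # a private buffer the grower may permute freely

row_copy[::-1].sort()           # stand-in for an in-place partition at a split
assert gradients[1, 0] == 4.0   # original row untouched

row_view[::-1].sort()
assert gradients[1, 0] == 7.0   # the view wrote through and reordered the row
```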
@@ -492,15 +497,16 @@ def __init__(self, loss='least_squares', learning_rate=0.1,
                  max_iter=100, max_leaf_nodes=31, max_depth=None,
                  min_samples_leaf=20, l2_regularization=0., max_bins=256,
                  scoring=None, validation_split=0.1, n_iter_no_change=5,
-                 tol=1e-7, verbose=0, random_state=None):
+                 tol=1e-7, verbose=0, random_state=None,
+                 parallel_splitting=True):
         super(GradientBoostingRegressor, self).__init__(
             loss=loss, learning_rate=learning_rate, max_iter=max_iter,
             max_leaf_nodes=max_leaf_nodes, max_depth=max_depth,
             min_samples_leaf=min_samples_leaf,
             l2_regularization=l2_regularization, max_bins=max_bins,
             scoring=scoring, validation_split=validation_split,
             n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose,
-            random_state=random_state)
+            random_state=random_state, parallel_splitting=parallel_splitting)

     def predict(self, X):
         """Predict values for X.
@@ -611,15 +617,15 @@ def __init__(self, loss='auto', learning_rate=0.1, max_iter=100,
                  max_leaf_nodes=31, max_depth=None, min_samples_leaf=20,
                  l2_regularization=0., max_bins=256, scoring=None,
                  validation_split=0.1, n_iter_no_change=5, tol=1e-7,
-                 verbose=0, random_state=None):
+                 verbose=0, random_state=None, parallel_splitting=True):
         super(GradientBoostingClassifier, self).__init__(
             loss=loss, learning_rate=learning_rate, max_iter=max_iter,
             max_leaf_nodes=max_leaf_nodes, max_depth=max_depth,
             min_samples_leaf=min_samples_leaf,
             l2_regularization=l2_regularization, max_bins=max_bins,
             scoring=scoring, validation_split=validation_split,
             n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose,
-            random_state=random_state)
+            random_state=random_state, parallel_splitting=parallel_splitting)

     def predict(self, X):
         """Predict classes for X.
pygbm/grower.py (38 changes: 26 additions & 12 deletions)
@@ -8,7 +8,8 @@
 import numpy as np
 from time import time

-from .splitting import (SplittingContext, split_indices, find_node_split,
+from .splitting import (SplittingContext, split_indices_parallel,
+                        split_indices_single_thread, find_node_split,
                         find_node_split_subtraction)
 from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE

Expand Down Expand Up @@ -77,10 +78,12 @@ class TreeNode:
apply_split_time = 0.
hist_subtraction = False

def __init__(self, depth, sample_indices, sum_gradients,
def __init__(self, depth, sample_indices, gradients, hessians, sum_gradients,
sum_hessians, parent=None):
self.depth = depth
self.sample_indices = sample_indices
self.gradients = gradients
self.hessians = hessians
self.n_samples = sample_indices.shape[0]
self.sum_gradients = sum_gradients
self.sum_hessians = sum_hessians
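TreeNode now carries node-local gradient and hessian buffers alongside `sample_indices`: the intent, as far as this diff shows, is that entry `i` of `gradients` belongs to sample `sample_indices[i]`, so per-node kernels can read gradients with unit stride instead of gathering through indices. A toy construction under that assumption:

```python
import numpy as np

from pygbm.grower import TreeNode  # the class defined in this file

# Entry i of gradients/hessians belongs to sample_indices[i].
sample_indices = np.array([7, 2, 9], dtype=np.uint32)
gradients = np.array([0.5, -1.2, 0.3], dtype=np.float32)
hessians = np.ones(3, dtype=np.float32)

node = TreeNode(depth=0, sample_indices=sample_indices,
                gradients=gradients, hessians=hessians,
                sum_gradients=float(gradients.sum()),
                sum_hessians=float(hessians.sum()))
```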
@@ -163,7 +166,8 @@ class TreeGrower:
     def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None,
                  max_depth=None, min_samples_leaf=20, min_gain_to_split=0.,
                  max_bins=256, n_bins_per_feature=None, l2_regularization=0.,
-                 min_hessian_to_split=1e-3, shrinkage=1.):
+                 min_hessian_to_split=1e-3, shrinkage=1.,
+                 parallel_splitting=True):

         self._validate_parameters(X_binned, max_leaf_nodes, max_depth,
                                   min_samples_leaf, min_gain_to_split,
@@ -180,13 +184,14 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None,
         self.splitting_context = SplittingContext(
             X_binned, max_bins, n_bins_per_feature, gradients,
             hessians, l2_regularization, min_hessian_to_split,
-            min_samples_leaf, min_gain_to_split)
+            min_samples_leaf, min_gain_to_split, parallel_splitting)
         self.max_leaf_nodes = max_leaf_nodes
         self.max_depth = max_depth
         self.min_samples_leaf = min_samples_leaf
         self.X_binned = X_binned
         self.min_gain_to_split = min_gain_to_split
         self.shrinkage = shrinkage
+        self.parallel_splitting = parallel_splitting
         self.splittable_nodes = []
         self.finalized_leaves = []
         self.total_find_split_time = 0.  # time spent finding the best splits
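splitting.py is not part of this diff, but the `ordered_gradients` / `ordered_hessians` attributes used below suggest SplittingContext pre-allocates node-ordered scratch buffers at construction. A speculative sketch of that layout, not the actual class:

```python
import numpy as np

class OrderedBuffers:
    """Speculative stand-in for buffers SplittingContext likely holds."""

    def __init__(self, gradients, hessians):
        n_samples = gradients.shape[0]
        # partition lists all samples; splits reorder slices of it in place so
        # each node's samples end up occupying one contiguous chunk.
        self.partition = np.arange(n_samples, dtype=np.uint32)
        # ordered_* start out in sample order and are rearranged alongside
        # partition, keeping gradients/hessians aligned with it.
        self.ordered_gradients = gradients.copy()
        self.ordered_hessians = hessians.copy()
```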
@@ -238,14 +243,16 @@ def _intilialize_root(self):
         n_samples = self.X_binned.shape[0]
         depth = 0
         if self.splitting_context.constant_hessian:
-            hessian = self.splitting_context.hessians[0] * n_samples
+            sum_hessian = self.splitting_context.ordered_hessians[0] * n_samples
         else:
-            hessian = self.splitting_context.hessians.sum()
+            sum_hessian = self.splitting_context.ordered_hessians.sum()
         self.root = TreeNode(
             depth=depth,
-            sample_indices=self.splitting_context.partition.view(),
-            sum_gradients=self.splitting_context.gradients.sum(),
-            sum_hessians=hessian
+            gradients=self.splitting_context.ordered_gradients,
+            hessians=self.splitting_context.ordered_hessians,
+            sample_indices=self.splitting_context.partition,  # .view()
+            sum_gradients=self.splitting_context.ordered_gradients.sum(),
+            sum_hessians=sum_hessian
         )
         if (self.max_leaf_nodes is not None and self.max_leaf_nodes == 1):
             self._finalize_leaf(self.root)
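For losses with a constant second derivative (least squares has a hessian identically equal to 1), the root's hessian sum reduces to a single multiply instead of an O(n) reduction, which is what the `constant_hessian` branch exploits. A small check of the equivalence, assuming the constant-hessian case stores a single value:

```python
import numpy as np

n_samples = 1_000_000
ordered_hessians = np.ones(1, dtype=np.float32)  # assumed one-element storage

# Equivalent results; the first is O(1), the second O(n_samples).
sum_fast = ordered_hessians[0] * n_samples
sum_slow = np.full(n_samples, ordered_hessians[0], dtype=np.float32).sum()
assert np.isclose(sum_fast, sum_slow)
```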
@@ -296,7 +303,8 @@ def _compute_spittability(self, node, only_hist=False):
                 node.parent.histograms, node.sibling.histograms)
         else:
             split_info, histograms = find_node_split(
-                self.splitting_context, node.sample_indices)
+                self.splitting_context, node.sample_indices, node.gradients,
+                node.hessians)
         toc = time()
         node.find_split_time = toc - tic
         self.total_find_split_time += node.find_split_time
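The payoff of passing node-ordered gradients to `find_node_split` is in the histogram kernel: it can walk the gradients and hessians sequentially and only gather through `sample_indices` for the binned feature values. A simplified single-feature sketch of such a kernel, not the actual numba implementation in splitting.py:

```python
import numpy as np

def build_histogram(n_bins, sample_indices, binned_feature,
                    ordered_gradients, ordered_hessians):
    # ordered_gradients[i] / ordered_hessians[i] belong to sample_indices[i],
    # so both are read with unit stride; only binned_feature is gathered.
    hist_grad = np.zeros(n_bins, dtype=np.float32)
    hist_hess = np.zeros(n_bins, dtype=np.float32)
    for i, sample in enumerate(sample_indices):
        bin_ = binned_feature[sample]
        hist_grad[bin_] += ordered_gradients[i]
        hist_hess[bin_] += ordered_hessians[i]
    return hist_grad, hist_hess
```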
@@ -336,8 +344,10 @@ def split_next(self):
         node = heappop(self.splittable_nodes)

         tic = time()
-        (sample_indices_left, sample_indices_right) = split_indices(
-            self.splitting_context, node.split_info, node.sample_indices)
+        split_indices = split_indices_parallel if self.parallel_splitting else split_indices_single_thread
+        (sample_indices_left, gradients_left, hessians_left), \
+            (sample_indices_right, gradients_right, hessians_right) = split_indices(
+                self.splitting_context, node.split_info, node.sample_indices, node.gradients, node.hessians)
         toc = time()
         node.apply_split_time = toc - tic
         self.total_apply_split_time += node.apply_split_time
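`split_indices_single_thread` is not shown in this diff, but the unpacking above implies it returns, for each side, the child's sample indices together with matching reordered gradient and hessian slices. A plausible pure-numpy rendition (the real code is a numba kernel working in place on the partition, and the `feature_idx` / `bin_idx` attribute names are assumptions):

```python
import numpy as np

def split_indices_single_thread_sketch(context, split_info, sample_indices,
                                       gradients, hessians):
    # Boolean mask: True where the sample goes to the left child.
    binned_feature = context.X_binned[:, split_info.feature_idx]
    goes_left = binned_feature[sample_indices] <= split_info.bin_idx

    left = (sample_indices[goes_left], gradients[goes_left],
            hessians[goes_left])
    right = (sample_indices[~goes_left], gradients[~goes_left],
             hessians[~goes_left])
    return left, right
```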
Expand All @@ -348,11 +358,15 @@ def split_next(self):

left_child_node = TreeNode(depth,
sample_indices_left,
gradients_left,
hessians_left,
node.split_info.gradient_left,
node.split_info.hessian_left,
parent=node)
right_child_node = TreeNode(depth,
sample_indices_right,
gradients_right,
hessians_right,
node.split_info.gradient_right,
node.split_info.hessian_right,
parent=node)