Support parallel and add random search (#167)
* non hashable value out of signature

* parallel trials

* add random in _search_parallel

* fix bug in retraining

* check memory constraint before training

* retrain_full

* log custom metric

* retraining budget check

* sample size check before retrain

* remove 'time2eval' from result

* report 'total_search_time' in result

* rename total_search_time to wall_clock_time

* rename train_loss boolean to log_training_metric

* set default train_loss to None

* exclude oom result

* log retrained model

* no subsample

* doc str

* notebook

* predicted value is NaN for sarimax

* version

Co-authored-by: Chi Wang <wang.chi@microsoft.com>
Co-authored-by: Qingyun Wu <qxw5138@psu.edu>
3 people authored Aug 23, 2021
1 parent 3d0a3d2 commit a229a61
Showing 21 changed files with 5,012 additions and 4,547 deletions.
3,466 changes: 1,809 additions & 1,657 deletions flaml/automl.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions flaml/data.py
@@ -141,14 +141,14 @@ def get_output_from_log(filename, time_budget):
     best_config_list = []
     with training_log_reader(filename) as reader:
         for record in reader.records():
-            time_used = record.total_search_time
+            time_used = record.wall_clock_time
             val_loss = record.validation_loss
             config = record.config
             learner = record.learner.split('_')[0]
             sample_size = record.sample_size
             train_loss = record.logged_metric

-            if time_used < time_budget:
+            if time_used < time_budget and np.isfinite(val_loss):
                 if val_loss < best_val_loss:
                     best_val_loss = val_loss
                     best_config = config
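Note: with the added np.isfinite guard, records logged with an infinite validation loss (e.g., trials excluded after hitting the memory constraint) no longer shift the best-so-far curve. A minimal sketch of that filtering logic, using a hypothetical list of (wall_clock_time, validation_loss) pairs in place of a real training log:

import numpy as np

# hypothetical records: (wall_clock_time, validation_loss)
records = [(1.2, 0.31), (2.5, np.inf), (4.0, 0.28)]

time_budget, best_val_loss = 10.0, np.inf
for time_used, val_loss in records:
    # same guard as in get_output_from_log: skip over-budget and non-finite losses
    if time_used < time_budget and np.isfinite(val_loss):
        best_val_loss = min(best_val_loss, val_loss)
print(best_val_loss)  # 0.28; the inf record is ignored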
48 changes: 27 additions & 21 deletions flaml/ml.py
@@ -102,8 +102,11 @@ def sklearn_metric_loss_score(
         score = log_loss(
             y_true, y_predict, labels=labels, sample_weight=sample_weight)
     elif 'mape' in metric_name:
-        score = mean_absolute_percentage_error(
-            y_true, y_predict)
+        try:
+            score = mean_absolute_percentage_error(
+                y_true, y_predict)
+        except ValueError:
+            return np.inf
     elif 'micro_f1' in metric_name:
         score = 1 - f1_score(
             y_true, y_predict, sample_weight=sample_weight, average='micro')
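Note: scikit-learn's mean_absolute_percentage_error raises ValueError when the predictions contain NaN, which the commit message says can happen with sarimax forecasts. Returning np.inf marks such a trial with a worst-possible score instead of crashing the search. A standalone sketch of the same guard (the helper name is illustrative):

import numpy as np
from sklearn.metrics import mean_absolute_percentage_error

def mape_or_inf(y_true, y_predict):
    # a NaN prediction makes sklearn raise ValueError; report an infinite loss instead
    try:
        return mean_absolute_percentage_error(y_true, y_predict)
    except ValueError:
        return np.inf

print(mape_or_inf([1.0, 2.0], [1.1, float('nan')]))  # inf, not a traceback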
@@ -141,21 +144,23 @@ def get_y_pred(estimator, X, eval_metric, obj, freq=None):

 def get_test_loss(
     estimator, X_train, y_train, X_test, y_test, weight_test,
-    eval_metric, obj, labels=None, budget=None, train_loss=False, fit_kwargs={}
+    eval_metric, obj, labels=None, budget=None, log_training_metric=False, fit_kwargs={}
 ):
     start = time.time()
-    train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
+    estimator.fit(X_train, y_train, budget, **fit_kwargs)
     if isinstance(eval_metric, str):
         pred_start = time.time()
         test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
         pred_time = (time.time() - pred_start) / X_test.shape[0]
         test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
                                               labels, weight_test)
-        if train_loss is not False:
+        if log_training_metric:
             test_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
             train_loss = sklearn_metric_loss_score(
                 eval_metric, test_pred_y,
                 y_train, labels, fit_kwargs.get('sample_weight'))
+        else:
+            train_loss = None
     else:  # customized metric function
         test_loss, metrics = eval_metric(
             X_test, y_test, estimator, labels, X_train, y_train,
@@ -174,40 +179,41 @@ def train_model(estimator, X_train, y_train, budget, fit_kwargs={}):

 def evaluate_model(
     estimator, X_train, y_train, X_val, y_val, weight_val,
-    budget, kf, task, eval_method, eval_metric, best_val_loss, train_loss=False,
+    budget, kf, task, eval_method, eval_metric, best_val_loss, log_training_metric=False,
     fit_kwargs={}
 ):
     if 'holdout' in eval_method:
         val_loss, train_loss, train_time, pred_time = evaluate_model_holdout(
             estimator, X_train, y_train, X_val, y_val, weight_val, budget,
-            task, eval_metric, train_loss=train_loss,
+            task, eval_metric, log_training_metric=log_training_metric,
             fit_kwargs=fit_kwargs)
     else:
         val_loss, train_loss, train_time, pred_time = evaluate_model_CV(
             estimator, X_train, y_train, budget, kf, task,
-            eval_metric, best_val_loss, train_loss=train_loss,
+            eval_metric, best_val_loss, log_training_metric=log_training_metric,
             fit_kwargs=fit_kwargs)
     return val_loss, train_loss, train_time, pred_time


 def evaluate_model_holdout(
     estimator, X_train, y_train, X_val, y_val,
-    weight_val, budget, task, eval_metric, train_loss=False,
+    weight_val, budget, task, eval_metric, log_training_metric=False,
     fit_kwargs={}
 ):
     val_loss, train_time, train_loss, pred_time = get_test_loss(
         estimator, X_train, y_train, X_val, y_val, weight_val, eval_metric,
-        task, budget=budget, train_loss=train_loss, fit_kwargs=fit_kwargs)
+        task, budget=budget, log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
     return val_loss, train_loss, train_time, pred_time


 def evaluate_model_CV(
     estimator, X_train_all, y_train_all, budget, kf,
-    task, eval_metric, best_val_loss, train_loss=False, fit_kwargs={}
+    task, eval_metric, best_val_loss, log_training_metric=False, fit_kwargs={}
 ):
     start_time = time.time()
     total_val_loss = 0
     total_train_loss = None
+    train_loss = None
     train_time = pred_time = 0
     valid_fold_num = total_fold_num = 0
     n = kf.get_n_splits()
@@ -231,7 +237,7 @@ def evaluate_model_CV(
         kf = kf.split(X_train_split)
     rng = np.random.RandomState(2020)
     val_loss_list = []
-    budget_per_train = budget / (n + 1)
+    budget_per_train = budget / n
     if 'sample_weight' in fit_kwargs:
         weight = fit_kwargs['sample_weight']
         weight_val = None
@@ -259,13 +265,13 @@ def evaluate_model_CV(
         val_loss_i, train_time_i, train_loss_i, pred_time_i = get_test_loss(
             estimator, X_train, y_train, X_val, y_val, weight_val,
             eval_metric, task, labels, budget_per_train,
-            train_loss=train_loss, fit_kwargs=fit_kwargs)
+            log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
         if weight is not None:
             fit_kwargs['sample_weight'] = weight
         valid_fold_num += 1
         total_fold_num += 1
         total_val_loss += val_loss_i
-        if train_loss is not False:
+        if log_training_metric or not isinstance(eval_metric, str):
             if isinstance(total_train_loss, list):
                 total_train_loss = [
                     total_train_loss[i] + v for i, v in enumerate(train_loss_i)]
@@ -286,25 +292,25 @@ def evaluate_model_CV(
             break
     val_loss = np.max(val_loss_list)
     n = total_fold_num
-    if train_loss is not False:
+    if log_training_metric or not isinstance(eval_metric, str):
         if isinstance(total_train_loss, list):
             train_loss = [v / n for v in total_train_loss]
         elif isinstance(total_train_loss, dict):
             train_loss = {k: v / n for k, v in total_train_loss.items()}
         else:
             train_loss = total_train_loss / n
     pred_time /= n
-    budget -= time.time() - start_time
-    if val_loss < best_val_loss and budget > budget_per_train:
-        estimator.cleanup()
-        estimator.fit(X_train_all, y_train_all, budget, **fit_kwargs)
+    # budget -= time.time() - start_time
+    # if val_loss < best_val_loss and budget > budget_per_train:
+    #     estimator.cleanup()
+    #     estimator.fit(X_train_all, y_train_all, budget, **fit_kwargs)
     return val_loss, train_loss, train_time, pred_time


 def compute_estimator(
     X_train, y_train, X_val, y_val, weight_val, budget, kf,
     config_dic, task, estimator_name, eval_method, eval_metric,
-    best_val_loss=np.Inf, n_jobs=1, estimator_class=None, train_loss=False,
+    best_val_loss=np.Inf, n_jobs=1, estimator_class=None, log_training_metric=False,
     fit_kwargs={}
 ):
     estimator_class = estimator_class or get_estimator_class(
@@ -313,7 +319,7 @@ def compute_estimator(
         **config_dic, task=task, n_jobs=n_jobs)
     val_loss, train_loss, train_time, pred_time = evaluate_model(
         estimator, X_train, y_train, X_val, y_val, weight_val, budget, kf, task,
-        eval_method, eval_metric, best_val_loss, train_loss=train_loss,
+        eval_method, eval_metric, best_val_loss, log_training_metric=log_training_metric,
         fit_kwargs=fit_kwargs)
     return estimator, val_loss, train_loss, train_time, pred_time
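Note: with the refit at the end of evaluate_model_CV commented out (see the previous hunk), the budget no longer reserves a share for retraining, so each of the n folds gets budget / n rather than budget / (n + 1). With example numbers:

budget, n = 60.0, 5  # seconds, CV folds
old_share = budget / (n + 1)  # 10.0 s per fold, 10.0 s reserved for the refit
new_share = budget / n        # 12.0 s per fold, nothing reserved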
34 changes: 18 additions & 16 deletions flaml/model.py
@@ -222,10 +222,10 @@ def search_space(cls, data_size, **params):
             'domain': tune.loguniform(lower=1 / 1024, upper=1.0),
             'init_value': 0.1,
         },
-        'subsample': {
-            'domain': tune.uniform(lower=0.1, upper=1.0),
-            'init_value': 1.0,
-        },
+        # 'subsample': {
+        #     'domain': tune.uniform(lower=0.1, upper=1.0),
+        #     'init_value': 1.0,
+        # },
         'log_max_bin': {
             'domain': tune.lograndint(lower=3, upper=11),
             'init_value': 8,
@@ -252,28 +252,30 @@ def size(cls, config):

     def __init__(self, task='binary:logistic', log_max_bin=8, **params):
         super().__init__(task, **params)
-        # Default: ‘regression’ for LGBMRegressor,
-        # ‘binary’ or ‘multiclass’ for LGBMClassifier
-        if 'regression' in task:
-            objective = 'regression'
-        elif 'binary' in task:
-            objective = 'binary'
-        elif 'multi' in task:
-            objective = 'multiclass'
-        else:
-            objective = 'regression'
+        if "objective" not in self.params:
+            # Default: ‘regression’ for LGBMRegressor,
+            # ‘binary’ or ‘multiclass’ for LGBMClassifier
+            if 'regression' in task:
+                objective = 'regression'
+            elif 'binary' in task:
+                objective = 'binary'
+            elif 'multi' in task:
+                objective = 'multiclass'
+            else:
+                objective = 'regression'
+            self.params["objective"] = objective
         if "n_estimators" in self.params:
             self.params["n_estimators"] = int(round(self.params["n_estimators"]))
         if "num_leaves" in self.params:
             self.params["num_leaves"] = int(round(self.params["num_leaves"]))
         if "min_child_samples" in self.params:
             self.params["min_child_samples"] = int(round(self.params["min_child_samples"]))
-        if "objective" not in self.params:
-            self.params["objective"] = objective
         if "max_bin" not in self.params:
             self.params['max_bin'] = 1 << int(round(log_max_bin)) - 1
         if "verbose" not in self.params:
             self.params['verbose'] = -1
+        # if "subsample_freq" not in self.params:
+        #     self.params['subsample_freq'] = 1
         if 'regression' in task:
             self.estimator_class = LGBMRegressor
         else:
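Note: the reworked __init__ infers a default objective only when the caller did not pass one, so a user-supplied objective now survives. A standalone sketch of the inference rule, assuming task strings like the ones tested above (the helper name is illustrative):

def infer_lgbm_objective(task: str) -> str:
    # same matching order as the hunk above, defaulting to regression
    if 'regression' in task:
        return 'regression'
    if 'binary' in task:
        return 'binary'
    if 'multi' in task:
        return 'multiclass'
    return 'regression'

params = {'objective': 'mape'}  # user-supplied objective
# only fill in a default when the user didn't choose one
params.setdefault('objective', infer_lgbm_objective('binary:logistic'))
print(params['objective'])  # 'mape' is preserved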
1 change: 1 addition & 0 deletions flaml/nlp/autotransformers.py
@@ -748,6 +748,7 @@ def fit(self,
         self._set_metric(custom_metric_name, custom_metric_mode_name)
         self._set_task()
         self._fp16 = fp16
+        ray.shutdown()
         ray.init(local_mode=ray_local_mode)
         self._set_search_space(**custom_hpo_args)
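Note: ray.init() raises RuntimeError if Ray is already initialized, so shutting down first makes repeated fit() calls in one process safe (ray.shutdown() is a no-op when Ray is not running). The same pattern outside FLAML:

import ray

# assumption: this code may run several times in one process;
# a bare ray.init() would raise RuntimeError on the second call
if ray.is_initialized():
    ray.shutdown()
ray.init(local_mode=True)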
25 changes: 18 additions & 7 deletions flaml/searcher/flow2.py
@@ -3,6 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the
 * project root for license information.
'''
+from flaml.tune.sample import Domain
from typing import Dict, Optional, Tuple
import numpy as np
try:
@@ -140,7 +141,7 @@ def _init_search(self):
                 if str(sampler) != 'Normal':
                     self._bounded_keys.append(key)
         if not hier:
-            self._space_keys = sorted(self._space.keys())
+            self._space_keys = sorted(self._tunable_keys)
         self._hierarchical = hier
         if (self.prune_attr and self.prune_attr not in self._space
                 and self.max_resource):
@@ -499,18 +500,28 @@ def config_signature(self, config, space: Dict = None) -> tuple:
         else:
             space = self._space
         value_list = []
+        # self._space_keys doesn't contain keys with const values,
+        # e.g., "eval_metric": ["logloss", "error"].
         keys = sorted(config.keys()) if self._hierarchical else self._space_keys
         for key in keys:
             value = config[key]
             if key == self.prune_attr:
                 value_list.append(value)
-            # else key must be in self.space
-            # get rid of list type or constant,
-            # e.g., "eval_metric": ["logloss", "error"]
-            elif isinstance(space[key], sample.Integer):
-                value_list.append(int(round(value)))
-            else:
-                value_list.append(value)
+            else:
+                # key must be in space
+                domain = space[key]
+                if self._hierarchical:
+                    # can't remove constant for hierarchical search space,
+                    # e.g., learner
+                    if not (domain is None or type(domain) in (str, int, float)
+                            or isinstance(domain, sample.Domain)):
+                        # not domain or hashable
+                        # get rid of list type for hierarchical search space.
+                        continue
+                if isinstance(domain, sample.Integer):
+                    value_list.append(int(round(value)))
+                else:
+                    value_list.append(value)
         return tuple(value_list)

     @property
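Note: config_signature flattens a config into a hashable tuple so that duplicate trials can be detected; values from integer domains are rounded first so 7.0 and 7 yield the same signature. A toy illustration of the idea, not FLAML's actual classes:

def toy_signature(config, int_keys):
    # round integer-domain values so 7.0 and 7 hash identically
    return tuple(
        int(round(v)) if k in int_keys else v
        for k, v in sorted(config.items())
    )

a = toy_signature({'num_leaves': 7.0, 'lr': 0.1}, int_keys={'num_leaves'})
b = toy_signature({'num_leaves': 7, 'lr': 0.1}, int_keys={'num_leaves'})
assert a == b  # recognized as the same trial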
9 changes: 5 additions & 4 deletions flaml/training_log.py
@@ -16,7 +16,7 @@ def __init__(self,
                  iter_per_learner: int,
                  logged_metric: float,
                  trial_time: float,
-                 total_search_time: float,
+                 wall_clock_time: float,
                  validation_loss,
                  config,
                  best_validation_loss,
@@ -27,7 +27,7 @@ def __init__(self,
         self.iter_per_learner = iter_per_learner
         self.logged_metric = logged_metric
         self.trial_time = trial_time
-        self.total_search_time = total_search_time
+        self.wall_clock_time = wall_clock_time
         self.validation_loss = validation_loss
         self.config = config
         self.best_validation_loss = best_validation_loss
@@ -71,7 +71,7 @@ def append(self,
                it_counter: int,
                train_loss: float,
                trial_time: float,
-               total_search_time: float,
+               wall_clock_time: float,
                validation_loss,
                config,
                best_validation_loss,
@@ -86,7 +86,7 @@ def append(self,
             it_counter,
             train_loss,
             trial_time,
-            total_search_time,
+            wall_clock_time,
             validation_loss,
             config,
             best_validation_loss,
@@ -95,6 +95,7 @@ def append(self,
             sample_size)
         if validation_loss < self.current_best_loss or \
                 validation_loss == self.current_best_loss and \
+                self.current_sample_size is not None and \
                 sample_size > self.current_sample_size:
            self.current_best_loss = validation_loss
            self.current_sample_size = sample_size
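Note: since `and` binds tighter than `or`, the updated condition reads "strictly better loss, or equal loss on a larger sample (when a previous sample size exists)". A quick check of the precedence with illustrative values:

val_loss = best_loss = 0.20
sample_size, best_size = 10000, 1000  # best_size may also be None initially

is_new_best = val_loss < best_loss or \
    val_loss == best_loss and \
    best_size is not None and \
    sample_size > best_size
# parses as: (val_loss < best_loss) or
#            (val_loss == best_loss and best_size is not None and sample_size > best_size)
print(is_new_best)  # True: same loss measured on a larger sample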
1 change: 1 addition & 0 deletions flaml/tune/space.py
@@ -363,6 +363,7 @@ def indexof(domain: Dict, config: Dict) -> int:
             continue
         # print(domain.const[i])
         if all(config[key] == value for key, value in domain.const[i].items()):
+            # assumption: the concatenation of constants is a unique identifier
             return i
     return None
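Note: indexof identifies which branch of a composite domain a config came from by matching all of that branch's constants; the new comment records the assumption that those constant sets are unique per branch. A toy version with the domain flattened to a list of constant dicts (hypothetical shape, not FLAML's actual Domain object):

def toy_indexof(const_list, config):
    # first branch whose constants all match the config;
    # assumes each branch's constants uniquely identify it
    for i, const in enumerate(const_list):
        if all(config.get(k) == v for k, v in const.items()):
            return i
    return None

branches = [{'learner': 'lgbm'}, {'learner': 'xgboost'}]
print(toy_indexof(branches, {'learner': 'xgboost', 'n_estimators': 100}))  # 1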
2 changes: 1 addition & 1 deletion flaml/version.py
@@ -1 +1 @@
-__version__ = "0.5.13"
+__version__ = "0.6.0"