experiment support psi and feature_usage
oaksharks committed May 5, 2023
1 parent 1c3e870 commit f73040d
Showing 18 changed files with 350 additions and 181 deletions.
48 changes: 38 additions & 10 deletions hypernets/core/objective.py
@@ -3,26 +3,54 @@


class Objective(metaclass=abc.ABCMeta):
""" Objective = Indicator metric + Direction
"""
""" Objective = Indicator metric + Direction"""

def __init__(self, name, direction):
def __init__(self, name, direction, need_train_data=False, need_val_data=True, need_test_data=False):
self.name = name
self.direction = direction
self.need_train_data = need_train_data
self.need_val_data = need_val_data
self.need_test_data = need_test_data

def evaluate(self, trial, estimator, X_train, y_train, X_val, y_val, X_test=None, **kwargs) -> float:
if self.need_test_data:
assert X_test is not None, "need test data"

if self.need_train_data:
assert X_train is not None and y_train is not None, "need train data"

if self.need_val_data:
assert X_val is not None and y_val is not None, "need validation data"

return self._evaluate(trial, estimator, X_train, y_train, X_val, y_val, X_test=X_test, **kwargs)

@abc.abstractmethod
def call(self, trial, estimator, X_eval, y_val, X_train, y_train, X_test, **kwargs) -> float:
def _evaluate(self, trial, estimator, X_train, y_train, X_val, y_val, X_test=None, **kwargs) -> float:
raise NotImplementedError

def call_cross_validation(self, trial, estimators, X_tests, y_tests, **kwargs) -> float:
assert len(estimators) == len(X_tests) == len(y_tests)
def evaluate_cv(self, trial, estimator, X_trains, y_trains,
X_vals, y_vals, X_test=None, **kwargs) -> float:

if self.need_test_data:
assert X_test is not None, "need test data"

if self.need_train_data:
assert X_trains is not None and y_trains is not None, "need train data"
assert len(X_trains) == len(y_trains)

if self.need_val_data:
assert X_vals is not None and y_vals is not None, "need validation data"
assert len(X_vals) == len(y_vals)

return self._call_cross_validation(trial=trial, estimators=estimators, X_tests=X_tests,
y_tests=y_tests, **kwargs)
return self._evaluate_cv(trial=trial, estimator=estimator, X_trains=X_trains, y_trains=y_trains,
X_vals=X_vals, y_vals=y_vals, X_test=X_test, **kwargs)

@abc.abstractmethod
def _call_cross_validation(self, trial, estimators, X_tests, y_tests, **kwargs) -> float:
def _evaluate_cv(self, trial, estimator, X_trains, y_trains, X_vals, y_vals, X_test=None, **kwargs) -> float:
raise NotImplementedError

def __repr__(self):
return f"{self.__class__.__name__}(name={self.name}, direction={self.direction})"
return f"{self.__class__.__name__}(name={self.name}, direction={self.direction}," \
f" need_train_data={self.need_train_data}," \
f" need_val_data={self.need_val_data}," \
f" need_test_data={self.need_test_data})"
15 changes: 10 additions & 5 deletions hypernets/core/searcher.py
@@ -29,18 +29,23 @@ def set_meta_learner(self, meta_learner):
def parallelizable(self):
return False

def sample(self):
def sample(self, space_options=None):
raise NotImplementedError

def _random_sample(self):
space_sample = self.space_fn()
def _random_sample(self, **space_kwargs):
if space_kwargs is None:
space_kwargs = {}
space_sample = self.space_fn(**space_kwargs)
space_sample.random_sample()
return space_sample

def _sample_and_check(self, sample_fn):
def _sample_and_check(self, sample_fn, space_options=None):
if space_options is None:
space_options = {}

counter = 0
while True:
space_sample = sample_fn()
space_sample = sample_fn(**space_options)
counter += 1
if counter >= 1000:
raise ValueError('Unable to take valid sample and exceed the retry limit 1000.')
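
A hedged sketch of how a concrete searcher can thread the new space_options argument through to the space function. The class name is hypothetical; construction follows the existing Searcher base class and is not shown.

from hypernets.core.searcher import Searcher


class RandomSearcherSketch(Searcher):
    def sample(self, space_options=None):
        # space_options, e.g. dict(importances=[('f0', 120.0), ...]), is expanded
        # into the space_fn call by _random_sample via _sample_and_check
        return self._sample_and_check(self._random_sample, space_options=space_options)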
15 changes: 13 additions & 2 deletions hypernets/dispatchers/in_process_dispatcher.py
@@ -5,7 +5,8 @@
from ..core.callbacks import EarlyStoppingError
from ..core.dispatcher import Dispatcher
from ..core.trial import Trial
from ..utils import logging, fs
from ..tabular import get_tool_box
from ..utils import logging, fs, const

logger = logging.get_logger(__name__)

@@ -24,10 +25,20 @@ def dispatch(self, hyper_model, X, y, X_eval, y_eval, X_test, cv, num_folds, max
trial_no = 1
retry_counter = 0

importances = None
if hyper_model.searcher.kind() == const.SEARCHER_MOO:
if 'feature_usage' in [_.name for _ in hyper_model.searcher.objectives]:
tb = get_tool_box(X, y)
preprocessor = tb.general_preprocessor(X)
estimator = tb.general_estimator(X, y, task=hyper_model.task)
estimator.fit(preprocessor.fit_transform(X, y), y)
importances = list(zip(estimator.feature_name_, estimator.feature_importances_))

while trial_no <= max_trials:
gc.collect()
try:
space_sample = hyper_model.searcher.sample()
space_options = dict(importances=importances)
space_sample = hyper_model.searcher.sample(space_options=space_options)
if hyper_model.history.is_existed(space_sample):
if retry_counter >= retry_limit:
logger.info(f'Unable to take valid sample and exceed the retry limit {retry_limit}.')
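
The hand-off this hunk sets up, as a hypothetical helper: when a 'feature_usage' objective is present, the dispatcher precomputes (feature_name, importance) pairs once and passes them into every sampled space. The helper name and the example values are illustrative.

def sample_with_importances(searcher, importances):
    # importances: list of (feature_name, importance) pairs,
    # e.g. [('f0', 120.0), ('f1', 35.0), ('f2', 0.0)];
    # None leaves the search space without the feature-selection step
    return searcher.sample(space_options=dict(importances=importances))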
46 changes: 41 additions & 5 deletions hypernets/examples/plain_model.py
@@ -13,8 +13,19 @@
from hypernets.core.ops import ModuleChoice, HyperInput, ModuleSpace
from hypernets.core.search_space import HyperSpace, Choice, Int, Real, Cascade, Constant, HyperNode
from hypernets.model import Estimator, HyperModel
from hypernets.tabular import get_tool_box
from hypernets.utils import fs, logging, const
from hypernets.tabular import get_tool_box, column_selector
from hypernets.utils import fs, const


from hypernets.core import randint
from hypernets.core.ops import ModuleChoice, HyperInput
from hypernets.core.search_space import HyperSpace, Choice, Int, Real
from hypernets.pipeline.base import DataFrameMapper
from hypernets.pipeline.transformers import FeatureImportanceSelection

from hypernets.utils import logging



logger = logging.get_logger(__name__)

@@ -114,6 +125,21 @@ def _cascade(fn, key, args, space):
kvalue = kvalue.value
return fn(kvalue)

def create_feature_selection(self, hyper_input, importances, seq_no=0):
from hypernets.pipeline.base import Pipeline

selection = FeatureImportanceSelection(name=f'feature_importance_selection_{seq_no}',
importances=importances,
quantile=Real(0, 1, step=0.1))
pipeline = Pipeline([selection],
name=f'feature_selection_{seq_no}',
columns=column_selector.column_all)(hyper_input)

preprocessor = DataFrameMapper(default=False, input_df=True, df_out=True,
df_out_dtype_transforms=None)([pipeline])

return preprocessor

# HyperSpace
def __call__(self, *args, **kwargs):
space = HyperSpace()
@@ -130,9 +156,14 @@ def __call__(self, *args, **kwargs):
estimators.append(self.lr)
if self.enable_nn:
estimators.append(self.nn)

modules = [ModuleSpace(name=f'{e["cls"].__name__}', **e) for e in estimators]
outputs = ModuleChoice(modules)(hyper_input)

if "importances" in kwargs and kwargs["importances"] is not None:
importances = kwargs.pop("importances")
ss = self.create_feature_selection(hyper_input, importances)
outputs = ModuleChoice(modules)(ss)
else:
outputs = ModuleChoice(modules)(hyper_input)
space.set_inputs(hyper_input)

return space
@@ -210,6 +241,8 @@ def fit_cross_validation(self, X, y, stratified=True, num_folds=3, shuffle=False
cv_models = []
x_vals = []
y_vals = []
X_trains = []
y_trains = []
logger.info('start training')
for n_fold, (train_idx, valid_idx) in enumerate(iterators.split(X, y)):
x_train_fold, y_train_fold = X.iloc[train_idx], y[train_idx]
@@ -247,8 +280,11 @@ def fit_cross_validation(self, X, y, stratified=True, num_folds=3, shuffle=False
oof_[valid_idx] = proba
oof_scores.append(fold_scores)
cv_models.append(fold_model)

x_vals.append(x_val_fold)
y_vals.append(y_val_fold)
X_trains.append(x_train_fold)
y_trains.append(y_train_fold)

self.classes_ = getattr(cv_models[0], 'classes_', None)
self.cv_ = True
@@ -260,7 +296,7 @@ def fit_cross_validation(self, X, y, stratified=True, num_folds=3, shuffle=False

# return
oof_, = tb_original.from_local(oof_)
return scores, oof_, oof_scores, x_vals, y_vals
return scores, oof_, oof_scores, X_trains, y_trains, x_vals, y_vals

def predict(self, X, **kwargs):
eval_set = kwargs.pop('eval_set', None) # ignore
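
A usage sketch of the importances-aware search space above. The PlainSearchSpace name and its constructor flags are assumed from the enable_lr/enable_nn attributes referenced in this file, and the importance values are illustrative.

from hypernets.examples.plain_model import PlainSearchSpace

importances = [('f0', 0.7), ('f1', 0.2), ('f2', 0.1)]  # illustrative scores

space_fn = PlainSearchSpace(enable_lr=True, enable_nn=False)
space_sample = space_fn(importances=importances)  # inserts the FeatureImportanceSelection step
space_sample.random_sample()                      # draws the quantile and estimator hyperparameters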
3 changes: 2 additions & 1 deletion hypernets/experiment/_experiment.py
@@ -8,7 +8,7 @@
from IPython.display import display

from hypernets.dispatchers.cfg import DispatchCfg
from hypernets.utils import logging, df_utils
from hypernets.utils import logging

logger = logging.get_logger(__name__)

Expand Down Expand Up @@ -68,6 +68,7 @@ def __init__(self, hyper_model, X_train, y_train, X_eval=None, y_eval=None, X_te
self.model_ = None

def get_data_character(self):
from hypernets.utils import df_utils
data_character = df_utils.get_data_character(self.hyper_model, self.X_train, self.y_train, self.X_eval,
self.y_eval, self.X_test, self.task)
return data_character
16 changes: 8 additions & 8 deletions hypernets/model/hyper_model.py
@@ -87,14 +87,13 @@ def _run_trial(self, space_sample, trial_no, X, y, X_eval, y_eval, X_test=None,
oof_scores = None
x_vals = None
y_vals = None
X_trains = None
y_trains = None
try:
if cv:
scores, oof, oof_scores, x_vals, y_vals = estimator.fit_cross_validation(X, y, stratified=True,
num_folds=num_folds,
shuffle=False,
random_state=9527,
metrics=metrics,
**fit_kwargs)
ret_data = estimator.fit_cross_validation(X, y, stratified=True, num_folds=num_folds, shuffle=False,
random_state=9527, metrics=metrics, **fit_kwargs)
scores, oof, oof_scores, X_trains, y_trains, x_vals, y_vals = ret_data
else:
estimator.fit(X, y, **fit_kwargs)
succeeded = True
@@ -123,10 +122,11 @@ def _run_trial(self, space_sample, trial_no, X, y, X_eval, y_eval, X_test=None,
else:
if cv:
assert x_vals is not None and y_vals is not None
reward = [fn.call_cross_validation(trial, estimator.cv_models_, x_vals, y_vals)
reward = [fn.evaluate_cv(trial, estimator, X_trains, y_trains,
x_vals, y_vals, X_test)
for fn in self.searcher.objectives]
else:
reward = [fn.call(trial, estimator, X_eval, y_eval, X, y, X_test) for fn in self.searcher.objectives]
reward = [fn.evaluate(trial, estimator, X_train=X, y_train=y,
                      X_val=X_eval, y_val=y_eval, X_test=X_test)
          for fn in self.searcher.objectives]

trial.reward = reward
trial.iteration_scores = estimator.get_iteration_scores()
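
Spelled out with keyword arguments, the non-CV branch above reduces to the following hypothetical helper (not part of the commit); it makes the mapping explicit: X/y are the search's training data and X_eval/y_eval its evaluation data.

def compute_reward(trial, estimator, objectives, X, y, X_eval, y_eval, X_test=None):
    # one scalar per objective, in the searcher's objective order
    return [obj.evaluate(trial, estimator, X_train=X, y_train=y,
                         X_val=X_eval, y_val=y_eval, X_test=X_test)
            for obj in objectives]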