diff --git a/hypernets/core/objective.py b/hypernets/core/objective.py
index 455ecfe..f605db9 100644
--- a/hypernets/core/objective.py
+++ b/hypernets/core/objective.py
@@ -3,26 +3,54 @@
 class Objective(metaclass=abc.ABCMeta):
-    """ Objective = Indicator metric + Direction
-    """
+    """ Objective = Indicator metric + Direction"""

-    def __init__(self, name, direction):
+    def __init__(self, name, direction, need_train_data=False, need_val_data=True, need_test_data=False):
         self.name = name
         self.direction = direction
+        self.need_train_data = need_train_data
+        self.need_val_data = need_val_data
+        self.need_test_data = need_test_data
+
+    def evaluate(self, trial, estimator, X_train, y_train, X_val, y_val, X_test=None, **kwargs) -> float:
+        if self.need_test_data:
+            assert X_test is not None, "need test data"
+
+        if self.need_train_data:
+            assert X_train is not None and y_train is not None, "need train data"
+
+        if self.need_val_data:
+            assert X_val is not None and y_val is not None, "need validation data"
+
+        return self._evaluate(trial, estimator, X_train, y_train, X_val, y_val, X_test=X_test, **kwargs)

     @abc.abstractmethod
-    def call(self, trial, estimator, X_eval, y_val, X_train, y_train, X_test, **kwargs) -> float:
+    def _evaluate(self, trial, estimator, X_train, y_train, X_val, y_val, X_test=None, **kwargs) -> float:
         raise NotImplementedError

-    def call_cross_validation(self, trial, estimators, X_tests, y_tests, **kwargs) -> float:
-        assert len(estimators) == len(X_tests) == len(y_tests)
+    def evaluate_cv(self, trial, estimator, X_trains, y_trains,
+                    X_vals, y_vals, X_test=None, **kwargs) -> float:
+
+        if self.need_test_data:
+            assert X_test is not None, "need test data"
+
+        if self.need_train_data:
+            assert X_trains is not None and y_trains is not None, "need train data"
+            assert len(X_trains) == len(y_trains)
+
+        if self.need_val_data:
+            assert X_vals is not None and y_vals is not None, "need validation data"
+            assert len(X_vals) == len(y_vals)

-        return self._call_cross_validation(trial=trial, estimators=estimators, X_tests=X_tests,
-                                           y_tests=y_tests, **kwargs)
+        return self._evaluate_cv(trial=trial, estimator=estimator, X_trains=X_trains, y_trains=y_trains,
+                                 X_vals=X_vals, y_vals=y_vals, X_test=X_test, **kwargs)

     @abc.abstractmethod
-    def _call_cross_validation(self, trial, estimators, X_tests, y_tests, **kwargs) -> float:
+    def _evaluate_cv(self, trial, estimator, X_trains, y_trains, X_vals, y_vals, X_test=None, **kwargs) -> float:
         raise NotImplementedError

     def __repr__(self):
-        return f"{self.__class__.__name__}(name={self.name}, direction={self.direction})"
+        return f"{self.__class__.__name__}(name={self.name}, direction={self.direction}," \
+               f" need_train_data={self.need_train_data}," \
+               f" need_val_data={self.need_val_data}," \
+               f" need_test_data={self.need_test_data})"
diff --git a/hypernets/core/searcher.py b/hypernets/core/searcher.py
index 9683628..a385158 100644
--- a/hypernets/core/searcher.py
+++ b/hypernets/core/searcher.py
@@ -29,18 +29,23 @@ def set_meta_learner(self, meta_learner):
     def parallelizable(self):
         return False

-    def sample(self):
+    def sample(self, space_options=None):
         raise NotImplementedError

-    def _random_sample(self):
-        space_sample = self.space_fn()
+    def _random_sample(self, **space_kwargs):
+        if space_kwargs is None:
+            space_kwargs = {}
+        space_sample = self.space_fn(**space_kwargs)
         space_sample.random_sample()
         return space_sample

-    def _sample_and_check(self, sample_fn):
+    def _sample_and_check(self, sample_fn, space_options=None):
+        if space_options is None:
+            space_options = {}
+
         counter = 0
         while True:
-            space_sample = sample_fn()
+            space_sample = sample_fn(**space_options)
             counter += 1
             if counter >= 1000:
                 raise ValueError('Unable to take valid sample and exceed the retry limit 1000.')
diff --git a/hypernets/dispatchers/in_process_dispatcher.py b/hypernets/dispatchers/in_process_dispatcher.py
index 268e995..bf7dfe1 100644
--- a/hypernets/dispatchers/in_process_dispatcher.py
+++ b/hypernets/dispatchers/in_process_dispatcher.py
@@ -5,7 +5,8 @@
 from ..core.callbacks import EarlyStoppingError
 from ..core.dispatcher import Dispatcher
 from ..core.trial import Trial
-from ..utils import logging, fs
+from ..tabular import get_tool_box
+from ..utils import logging, fs, const

 logger = logging.get_logger(__name__)
@@ -24,10 +25,20 @@ def dispatch(self, hyper_model, X, y, X_eval, y_eval, X_test, cv, num_folds, max
         trial_no = 1
         retry_counter = 0

+        importances = None
+        if hyper_model.searcher.kind() == const.SEARCHER_MOO:
+            if 'feature_usage' in [_.name for _ in hyper_model.searcher.objectives]:
+                tb = get_tool_box(X, y)
+                preprocessor = tb.general_preprocessor(X)
+                estimator = tb.general_estimator(X, y, task=hyper_model.task)
+                estimator.fit(preprocessor.fit_transform(X, y), y)
+                importances = list(zip(estimator.feature_name_, estimator.feature_importances_))
+
         while trial_no <= max_trials:
             gc.collect()
             try:
-                space_sample = hyper_model.searcher.sample()
+                space_options = dict(importances=importances)
+                space_sample = hyper_model.searcher.sample(space_options=space_options)
                 if hyper_model.history.is_existed(space_sample):
                     if retry_counter >= retry_limit:
                         logger.info(f'Unable to take valid sample and exceed the retry limit {retry_limit}.')
diff --git a/hypernets/examples/plain_model.py b/hypernets/examples/plain_model.py
index fc87821..3e41613 100644
--- a/hypernets/examples/plain_model.py
+++ b/hypernets/examples/plain_model.py
@@ -13,8 +13,19 @@
 from hypernets.core.ops import ModuleChoice, HyperInput, ModuleSpace
 from hypernets.core.search_space import HyperSpace, Choice, Int, Real, Cascade, Constant, HyperNode
 from hypernets.model import Estimator, HyperModel
-from hypernets.tabular import get_tool_box
-from hypernets.utils import fs, logging, const
+from hypernets.tabular import get_tool_box, column_selector
+from hypernets.utils import fs, const
+
+
+from hypernets.core import randint
+from hypernets.core.ops import ModuleChoice, HyperInput
+from hypernets.core.search_space import HyperSpace, Choice, Int, Real
+from hypernets.pipeline.base import DataFrameMapper
+from hypernets.pipeline.transformers import FeatureImportanceSelection
+
+from hypernets.utils import logging
+
+
 logger = logging.get_logger(__name__)
@@ -114,6 +125,21 @@ def _cascade(fn, key, args, space):
             kvalue = kvalue.value
         return fn(kvalue)

+    def create_feature_selection(self, hyper_input, importances, seq_no=0):
+        from hypernets.pipeline.base import Pipeline
+
+        selection = FeatureImportanceSelection(name=f'feature_importance_selection_{seq_no}',
+                                               importances=importances,
+                                               quantile=Real(0, 1, step=0.1))
+        pipeline = Pipeline([selection],
+                            name=f'feature_selection_{seq_no}',
+                            columns=column_selector.column_all)(hyper_input)
+
+        preprocessor = DataFrameMapper(default=False, input_df=True, df_out=True,
+                                       df_out_dtype_transforms=None)([pipeline])
+
+        return preprocessor
+
     # HyperSpace
     def __call__(self, *args, **kwargs):
         space = HyperSpace()
@@ -130,9 +156,14 @@ def __call__(self, *args, **kwargs):
                 estimators.append(self.lr)
             if self.enable_nn:
                 estimators.append(self.nn)
-
             modules = [ModuleSpace(name=f'{e["cls"].__name__}', **e) for e in estimators]
-            outputs = ModuleChoice(modules)(hyper_input)
+
+            if "importances" in kwargs and kwargs["importances"] is not None:
+                importances = kwargs.pop("importances")
+                ss = self.create_feature_selection(hyper_input, importances)
+                outputs = ModuleChoice(modules)(ss)
+            else:
+                outputs = ModuleChoice(modules)(hyper_input)

             space.set_inputs(hyper_input)
         return space
@@ -210,6 +241,8 @@ def fit_cross_validation(self, X, y, stratified=True, num_folds=3, shuffle=False
         cv_models = []
         x_vals = []
         y_vals = []
+        X_trains = []
+        y_trains = []
         logger.info('start training')
         for n_fold, (train_idx, valid_idx) in enumerate(iterators.split(X, y)):
             x_train_fold, y_train_fold = X.iloc[train_idx], y[train_idx]
@@ -247,8 +280,11 @@ def fit_cross_validation(self, X, y, stratified=True, num_folds=3, shuffle=False
             oof_[valid_idx] = proba
             oof_scores.append(fold_scores)
             cv_models.append(fold_model)
+
             x_vals.append(x_val_fold)
             y_vals.append(y_val_fold)
+            X_trains.append(x_train_fold)
+            y_trains.append(y_train_fold)

         self.classes_ = getattr(cv_models[0], 'classes_', None)
         self.cv_ = True
@@ -260,7 +296,7 @@ def fit_cross_validation(self, X, y, stratified=True, num_folds=3, shuffle=False
         # return
         oof_, = tb_original.from_local(oof_)
-        return scores, oof_, oof_scores, x_vals, y_vals
+        return scores, oof_, oof_scores, X_trains, y_trains, x_vals, y_vals

     def predict(self, X, **kwargs):
         eval_set = kwargs.pop('eval_set', None)  # ignore
diff --git a/hypernets/experiment/_experiment.py b/hypernets/experiment/_experiment.py
index a490478..598208f 100644
--- a/hypernets/experiment/_experiment.py
+++ b/hypernets/experiment/_experiment.py
@@ -8,7 +8,7 @@
 from IPython.display import display

 from hypernets.dispatchers.cfg import DispatchCfg
-from hypernets.utils import logging, df_utils
+from hypernets.utils import logging

 logger = logging.get_logger(__name__)
@@ -68,6 +68,7 @@ def __init__(self, hyper_model, X_train, y_train, X_eval=None, y_eval=None, X_te
         self.model_ = None

     def get_data_character(self):
+        from hypernets.utils import df_utils
         data_character = df_utils.get_data_character(self.hyper_model, self.X_train, self.y_train, self.X_eval,
                                                      self.y_eval, self.X_test, self.task)
         return data_character
diff --git a/hypernets/model/hyper_model.py b/hypernets/model/hyper_model.py
index c855d62..60b9e1e 100644
--- a/hypernets/model/hyper_model.py
+++ b/hypernets/model/hyper_model.py
@@ -87,14 +87,13 @@ def _run_trial(self, space_sample, trial_no, X, y, X_eval, y_eval, X_test=None,
         oof_scores = None
         x_vals = None
         y_vals = None
+        X_trains = None
+        y_trains = None
         try:
             if cv:
-                scores, oof, oof_scores, x_vals, y_vals = estimator.fit_cross_validation(X, y, stratified=True,
-                                                                                         num_folds=num_folds,
-                                                                                         shuffle=False,
-                                                                                         random_state=9527,
-                                                                                         metrics=metrics,
-                                                                                         **fit_kwargs)
+                ret_data = estimator.fit_cross_validation(X, y, stratified=True, num_folds=num_folds, shuffle=False,
+                                                          random_state=9527, metrics=metrics, **fit_kwargs)
+                scores, oof, oof_scores, X_trains, y_trains, x_vals, y_vals = ret_data
             else:
                 estimator.fit(X, y, **fit_kwargs)
             succeeded = True
@@ -123,10 +122,11 @@ def _run_trial(self, space_sample, trial_no, X, y, X_eval, y_eval, X_test=None,
             else:
                 if cv:
                     assert x_vals is not None and y_vals is not None
-                    reward = [fn.call_cross_validation(trial, estimator.cv_models_, x_vals, y_vals)
+                    reward = [fn.evaluate_cv(trial, estimator, X_trains, y_trains,
+                                             x_vals, y_vals, X_test)
                               for fn in self.searcher.objectives]
                 else:
-                    reward = [fn.call(trial, estimator, X_eval, y_eval, X, y, X_test) for fn in self.searcher.objectives]
+                    reward = [fn.evaluate(trial, estimator, X_eval, y_eval, X, y, X_test) for fn in self.searcher.objectives]
                 trial.reward = reward

             trial.iteration_scores = estimator.get_iteration_scores()
diff --git a/hypernets/model/objectives.py b/hypernets/model/objectives.py
index 3550a72..2f9a1d9 100644
--- a/hypernets/model/objectives.py
+++ b/hypernets/model/objectives.py
@@ -13,15 +13,6 @@
 random_state = get_random_state()

-
-class ComplexityObjective(Objective, metaclass=abc.ABCMeta):
-    pass
-
-
-class PerformanceObjective(Objective, metaclass=abc.ABCMeta):
-    pass
-
-
 def calc_psi(x_array, y_array, n_bins=10, eps=1e-6):
     def calc_ratio(y_proba):
         y_proba_1d = y_proba.reshape(1, -1)
@@ -53,10 +44,53 @@ def calc_ratio(y_proba):

     return np.sum((train_ratio - test_ratio) * np.log(train_ratio / test_ratio))

+
+def detect_used_features(estimator, X_data, sample_size=1000):
+
+    if sample_size >= X_data.shape[0]:
+        sample_size = X_data.shape[0]
+    else:
+        sample_size = sample_size
+
+    D: pd.DataFrame = X_data.sample(sample_size, random_state=random_state)
+    # D.reset_index(inplace=True, drop=True)
+
+    y_pred = estimator.predict(D.copy())  # predict can modify D
+    NF = []
+    for feature in X_data.columns:
+        unique = X_data[feature].unique()
+        n_unique = len(unique)
+        if n_unique < 2:  # skip constant feature
+            continue
+        samples_inx = random_state.randint(low=0, high=n_unique - 1, size=D.shape[0])
+        # transform inx that does not contain self
+        mapped_inx = []
+
+        for i, value in zip(samples_inx, D[feature].values):
+            j = int(np.where(unique == value)[0][0])
+            if i >= j:
+                mapped_inx.append(i + 1)
+            else:
+                mapped_inx.append(i)
+
+        D_ = D.copy()
+        D_[feature] = unique[mapped_inx]
+
+        if (D_[feature] == D[feature]).values.any():
+            raise RuntimeError("some samples have not been replaced by different value")
+
+        y_pred_modified = estimator.predict(D_)
+        if (y_pred != y_pred_modified).any():
+            NF.append(feature)
+        del D_
+
+    return NF
+
+
 class PSIObjective(Objective):

     def __init__(self, n_bins=10, task=const.TASK_BINARY, average='macro', eps=1e-6):
-        super(PSIObjective, self).__init__('psi', 'min')
+        super(PSIObjective, self).__init__('psi', 'min', need_train_data=True, need_val_data=False, need_test_data=True)
         if task == const.TASK_MULTICLASS and average != 'macro':
             raise RuntimeError("only 'macro' average is supported currently")
         if task not in [const.TASK_BINARY, const.TASK_MULTICLASS, const.TASK_REGRESSION]:
@@ -66,13 +100,15 @@ def __init__(self, n_bins=10, task=const.TASK_BINARY, average='macro', eps=1e-6)
         self.average = average
         self.eps = eps

-    def call(self, trial, estimator, X_eval, y_val, X_train, y_train, X_test, **kwargs) -> float:
+    def _evaluate(self, trial, estimator, X_train, y_train, X_val, y_val, X_test=None, **kwargs) -> float:
+        return self._get_psi_score(estimator, X_train, X_test)
+
+    def _get_psi_score(self, estimator, X_train, X_test):
         def to_2d(array_data):
             if array_data.ndim == 1:
                 return array_data.reshape((-1, 1))
             else:
                 return array_data
-
         if self.task == const.TASK_BINARY:
             train_proba = estimator.predict_proba(X_train)
             test_proba = estimator.predict_proba(X_test)
@@ -84,53 +120,57 @@ def to_2d(array_data):
         elif self.task == const.TASK_MULTICLASS:
             train_proba = estimator.predict_proba(X_train)
             test_proba = estimator.predict_proba(X_test)
-            psis = [float(calc_psi(to_2d(train_proba[:, i]), to_2d(test_proba[:, 1]))) for i in range(train_proba.shape[1])]
+            psis = [float(calc_psi(to_2d(train_proba[:, i]), to_2d(test_proba[:, 1]))) for i in
+                    range(train_proba.shape[1])]
             return float(np.mean(psis))
         else:
             raise RuntimeError(f"unseen task type {self.task}")

-    def _call_cross_validation(self, trial, estimators, X_tests, y_tests, **kwargs) -> float:
-        raise NotImplementedError
+    def _evaluate_cv(self, trial, estimator, X_trains, y_trains, X_vals, y_vals, X_test=None, **kwargs) -> float:
+        X_train = pd.concat(X_trains, axis=0)
+        return self._get_psi_score(estimator, X_train=X_train, X_test=X_test)


 class FeatureUsageObjective(Objective):
     def __init__(self):
-        super(FeatureUsageObjective, self).__init__('feature_usage', 'min')
+        super(FeatureUsageObjective, self).__init__('feature_usage', 'min', need_train_data=False,
+                                                    need_val_data=True, need_test_data=False)

-    def call(self, trial, estimator, X_eval, y_val, X_train, y_train, X_test, **kwargs) -> float:
+    def _evaluate(self, trial, estimator, X_train, y_train, X_val, y_val, X_test=None, **kwargs) -> float:
         return estimator.data_pipeline[0].features[0][1].steps[0][1].feature_usage()

-    def _call_cross_validation(self, trial, estimators, X_tests, y_tests, **kwargs) -> float:
-        return estimators[0].data_pipeline[0].features[0][1].steps[0][1].feature_usage()
+    def _evaluate_cv(self, trial, estimator, X_trains, y_trains, X_vals, y_vals, X_test=None, **kwargs) -> float:
+        return estimator.cv_models_[0].data_pipeline[0].features[0][1].steps[0][1].feature_usage()


-class ElapsedObjective(PerformanceObjective):
+class ElapsedObjective(Objective):

     def __init__(self):
-        super(ElapsedObjective, self).__init__(name='elapsed', direction='min')
+        super(ElapsedObjective, self).__init__(name='elapsed', direction='min', need_train_data=False,
+                                               need_val_data=False, need_test_data=False)

-    def call(self, trial, estimator, y_test, **kwargs):
+    def _evaluate(self, trial, estimator, X_train, y_train, X_val, y_val, X_test=None, **kwargs) -> float:
         return trial.elapsed

-    def _call_cross_validation(self, trial, estimators, X_tests, y_tests, **kwargs) -> float:
+    def _evaluate_cv(self, trial, estimators, X_trains, y_trains, X_vals, y_vals, X_test=None, **kwargs) -> float:
         return trial.elapsed


 class PredictionPerformanceObjective(Objective):

     def __init__(self):
-        super(PredictionPerformanceObjective, self).__init__('pred_perf', 'min')
+        super(PredictionPerformanceObjective, self).__init__('pred_perf', 'min', need_train_data=False,
+                                                             need_val_data=True,
+                                                             need_test_data=False)

-    def call(self, trial, estimator, X_eval, y_val, X_train, y_train, X_test, **kwargs) -> float:
+    def _evaluate(self, trial, estimator, X_train, y_train, X_val, y_val, X_test=None, **kwargs) -> float:
         t1 = time.time()
-        estimator.predict(X_test)
+        estimator.predict(X_val)
         return time.time() - t1

-    def _call_cross_validation(self, trial, estimators, X_tests, y_tests, **kwargs) -> float:
+    def _evaluate_cv(self, trial, estimator, X_trains, y_trains, X_vals, y_vals, X_test=None, **kwargs) -> float:
         t1 = time.time()
-        for estimator, X_test in zip(estimators, X_tests):
-            estimator.predict(X_test)
-
+        estimator.predict(pd.concat(X_vals, axis=0))
         return time.time() - t1

@@ -172,13 +212,14 @@ def predict_proba(self, X, **kwargs):
         return np.asarray(proba)


-class PredictionObjective(PerformanceObjective):
+class PredictionObjective(Objective):

     def __init__(self, name, scorer, direction=None):
         if direction is None:
             direction = 'max' if scorer._sign > 0 else 'min'

-        super(PredictionObjective, self).__init__(name, direction=direction)
+        super(PredictionObjective, self).__init__(name, direction=direction, need_train_data=False,
+                                                  need_val_data=True, need_test_data=False)
         self._scorer = scorer

     @staticmethod
@@ -261,29 +302,23 @@ def create(name, task=const.TASK_BINARY, pos_label=1, force_minimize=False):
     def get_score(self):
         return self._scorer

-    def call(self, trial, estimator, X_eval, y_val, X_train, y_train, X_test, **kwargs):
-        value = self._scorer(estimator, X_eval, y_val)
+    def _evaluate(self, trial, estimator, X_train, y_train, X_val, y_val, X_test=None, **kwargs) -> float:
+        value = self._scorer(estimator, X_val, y_val)
         return value

-    def _call_cross_validation(self, trial, estimators, X_tests, y_tests, **kwargs) -> float:
+    def _evaluate_cv(self, trial, estimator, X_trains, y_trains, X_vals, y_vals, X_test=None, **kwargs) -> float:
-        estimator = CVWrapperEstimator(estimators, X_tests, y_tests)
-        X_test = pd.concat(X_tests, axis=0)
+        estimator = CVWrapperEstimator(estimator.cv_models_, X_vals, y_vals)
+        X_test = pd.concat(X_vals, axis=0)
-        y_test = np.vstack(y_test.values.reshape((-1, 1)) if isinstance(y_test, pd.Series) else y_test.reshape((-1, 1)) for y_test in y_tests).reshape(-1, )
+        y_test = np.vstack(y_test.values.reshape((-1, 1)) if isinstance(y_test, pd.Series) else y_test.reshape((-1, 1)) for y_test in y_vals).reshape(-1, )
         return self._scorer(estimator, X_test, y_test)

     def __repr__(self):
         return f"{self.__class__.__name__}(name={self.name}, scorer={self._scorer}, direction={self.direction})"


-class FeatureComplexityObjective(ComplexityObjective):
-
-    def call(self, trial, estimator, y_test, **kwargs):
-        pass
-
-
-class NumOfFeatures(ComplexityObjective):
+class NumOfFeatures(Objective):
     """Detect the number of features used (NF)

     References:
@@ -294,60 +329,23 @@ def __init__(self, sample_size=1000):
         super(NumOfFeatures, self).__init__('nf', 'min')
         self.sample_size = sample_size

-    def call(self, trial, estimator, X_eval, y_val, X_train, y_train, X_test, **kwargs) -> float:
-        features = self.get_used_features(estimator=estimator, X_test=X_test)
-        return len(features) / len(X_test.columns)
+    def _evaluate(self, trial, estimator, X_train, y_train, X_val, y_val, X_test=None, **kwargs) -> float:
+        features = self.get_used_features(estimator=estimator, X_data=X_val)
+        return len(features) / len(X_val.columns)

-    def _call_cross_validation(self, trial, estimators, X_tests, y_tests, **kwargs) -> float:
-        used_features = self.get_cv_used_features(estimators, X_tests)
-        return len(used_features) / len(X_tests[0].columns)
+    def _evaluate_cv(self, trial, estimator, X_trains, y_trains, X_vals, y_vals, X_test=None, **kwargs) -> float:
+        used_features = self.get_cv_used_features(estimator, X_vals)
+        return len(used_features) / len(X_vals[0].columns)

-    def get_cv_used_features(self, estimators, X_tests):
+    def get_cv_used_features(self, estimator, X_datas):
         used_features = []
-        for estimator, X_test in zip(estimators, X_tests):
-            features = self.get_used_features(estimator, X_test=X_test)
+        for X_data in X_datas:
+            features = self.get_used_features(estimator, X_data)
             used_features.extend(features)
         return list(set(used_features))

-    def get_used_features(self, estimator, X_test):
-        if self.sample_size >= X_test.shape[0]:
-            sample_size = X_test.shape[0]
-        else:
-            sample_size = self.sample_size
-
-        D: pd.DataFrame = X_test.sample(sample_size, random_state=random_state)
-        # D.reset_index(inplace=True, drop=True)
-
-        y_pred = estimator.predict(D.copy())  # predict can modify D
-        NF = []
-        for feature in X_test.columns:
-            unique = X_test[feature].unique()
-            n_unique = len(unique)
-            if n_unique < 2:  # skip constant feature
-                continue
-            samples_inx = random_state.randint(low=0, high=n_unique - 1, size=D.shape[0])
-            # transform inx that does not contain self
-            mapped_inx = []
-
-            for i, value in zip(samples_inx, D[feature].values):
-                j = int(np.where(unique == value)[0][0])
-                if i >= j:
-                    mapped_inx.append(i + 1)
-                else:
-                    mapped_inx.append(i)
-
-            D_ = D.copy()
-            D_[feature] = unique[mapped_inx]
-
-            if (D_[feature] == D[feature]).values.any():
-                raise RuntimeError("some samples have not been replaced by different value")
-
-            y_pred_modified = estimator.predict(D_)
-            if (y_pred != y_pred_modified).any():
-                NF.append(feature)
-            del D_
-
-        return NF
+    def get_used_features(self, estimator, X_data):
+        return detect_used_features(estimator, X_data, self.sample_size)

     def __repr__(self):
         return f"{self.__class__.__name__}(name={self.name}, sample_size={self.sample_size}, direction={self.direction})"
diff --git a/hypernets/searchers/evolution_searcher.py b/hypernets/searchers/evolution_searcher.py
index 42041c7..117afa0 100644
--- a/hypernets/searchers/evolution_searcher.py
+++ b/hypernets/searchers/evolution_searcher.py
@@ -140,7 +140,7 @@ def population_size(self):
     def parallelizable(self):
         return True

-    def sample(self):
+    def sample(self, space_options=None):
         if self.population.initializing:
             space_sample = self._sample_and_check(self._random_sample)
             return space_sample
diff --git a/hypernets/searchers/grid_searcher.py b/hypernets/searchers/grid_searcher.py
index 8a35ceb..3df0ffa 100644
--- a/hypernets/searchers/grid_searcher.py
+++ b/hypernets/searchers/grid_searcher.py
@@ -25,7 +25,7 @@ def __init__(self, space_fn, optimize_direction=OptimizeDirection.Minimize, spac
     def parallelizable(self):
         return True

-    def sample(self):
+    def sample(self, space_options=None):
         sample = self._sample_and_check(self._get_sample)
         return sample
diff --git a/hypernets/searchers/mcts_searcher.py b/hypernets/searchers/mcts_searcher.py
index bb84d08..6d479dd 100644
--- a/hypernets/searchers/mcts_searcher.py
+++ b/hypernets/searchers/mcts_searcher.py
@@ -52,7 +52,7 @@ def max_node_space(self):
     def parallelizable(self):
         return self.use_meta_learner and self.meta_learner is not None

-    def sample(self):
+    def sample(self, space_options=None):
         # print('Sample')
         _, best_node = self.tree.selection_and_expansion()
         # print(f'Sample: {best_node.info()}')
diff --git a/hypernets/searchers/moead_searcher.py b/hypernets/searchers/moead_searcher.py
index 995e42b..847e03a 100644
--- a/hypernets/searchers/moead_searcher.py
+++ b/hypernets/searchers/moead_searcher.py
@@ -264,7 +264,7 @@ def init_population(self, weight_vectors):

         return directions

-    def sample(self):
+    def sample(self, space_options=None):
         for direction in self.directions:
             if direction.individual is None:
                 sample = self._sample_and_check(self._random_sample)
diff --git a/hypernets/searchers/nsga_searcher.py b/hypernets/searchers/nsga_searcher.py
index 9b07889..dcf7eb6 100644
--- a/hypernets/searchers/nsga_searcher.py
+++ b/hypernets/searchers/nsga_searcher.py
@@ -226,18 +226,21 @@ def binary_tournament_select(self, population):
     def directions(self):
         return [o.direction for o in self.objectives]

-    def sample(self):
+    def sample(self, space_options=None):
+        if space_options is None:
+            space_options = {}
+
         if len(self.population) < self.survival.population_size:
-            return self._sample_and_check(self._random_sample)
+            return self._sample_and_check(self._random_sample, space_options=space_options)

         # binary tournament selection operation
         p1, p2 = self.binary_tournament_select(self.population)

         if self.recombination.check_parents(p1, p2):
-            offspring = self.recombination.do(p1, p2, self.space_fn())
-            final_offspring = self.mutation.do(offspring, self.space_fn())
+            offspring = self.recombination.do(p1, p2, self.space_fn(**space_options))
+            final_offspring = self.mutation.do(offspring, self.space_fn(**space_options))
         else:
-            final_offspring = self.mutation.do(p1.dna, self.space_fn(), proba=1)
+            final_offspring = self.mutation.do(p1.dna, self.space_fn(**space_options), proba=1)

         return final_offspring
diff --git a/hypernets/searchers/playback_searcher.py b/hypernets/searchers/playback_searcher.py
index ceb237d..3a16141 100644
--- a/hypernets/searchers/playback_searcher.py
+++ b/hypernets/searchers/playback_searcher.py
@@ -28,7 +28,7 @@ def __init__(self, history: TrialHistory, top_n=None, reverse=False,
     def parallelizable(self):
         return True

-    def sample(self):
+    def sample(self, space_options=None):
         if self.index >= len(self.samples):
             raise EarlyStoppingError('no more samples.')
         sample = self.samples[self.index]
diff --git a/hypernets/searchers/random_searcher.py b/hypernets/searchers/random_searcher.py
index 5ec4d9c..aa909b1 100644
--- a/hypernets/searchers/random_searcher.py
+++ b/hypernets/searchers/random_searcher.py
@@ -13,7 +13,7 @@ def __init__(self, space_fn, optimize_direction=OptimizeDirection.Minimize, spac
     def parallelizable(self):
         return True

-    def sample(self):
+    def sample(self, space_options=None):
         sample = self._sample_and_check(self._random_sample)
         return sample
diff --git a/hypernets/tabular/sklearn_ex.py b/hypernets/tabular/sklearn_ex.py
index 21e7ad8..5e5a841 100644
--- a/hypernets/tabular/sklearn_ex.py
+++ b/hypernets/tabular/sklearn_ex.py
@@ -1597,7 +1597,7 @@ def feature_usage(self):
         return len(self.important_features) / len(self.importances)

     def fit(self, X, y=None, **kwargs):
-        pass
+        return self

     def fit_transform(self, X, y=None, **kwargs):
         self.fit(X, y, **kwargs)
diff --git a/hypernets/tests/experiment/compete_experiment_test.py b/hypernets/tests/experiment/compete_experiment_test.py
index 0583d63..03c8728 100644
--- a/hypernets/tests/experiment/compete_experiment_test.py
+++ b/hypernets/tests/experiment/compete_experiment_test.py
@@ -8,13 +8,14 @@
 from hypernets.core.objective import Objective
 from hypernets.examples.plain_model import PlainModel, PlainSearchSpace
 from hypernets.experiment import CompeteExperiment
-from hypernets.model.objectives import ElapsedObjective
+from hypernets.model.objectives import ElapsedObjective, PredictionObjective
 from hypernets.searchers.nsga_searcher import NSGAIISearcher
 from hypernets.tabular import get_tool_box
 from hypernets.tabular.datasets import dsutils
 from hypernets.tabular.sklearn_ex import MultiLabelEncoder
 from hypernets.tests.model.plain_model_test import create_plain_model
 from hypernets.tests.tabular.tb_dask import if_dask_ready, is_dask_installed, setup_dask
+from hypernets.utils import const

 if is_dask_installed:
     import dask.dataframe as dd
@@ -232,12 +233,12 @@ class PlainContextObjective(Objective):
     def __init__(self):
         super(PlainContextObjective, self).__init__('plain_context', 'min')

-    def call(self, trial, estimator, X_test, y_test, **kwargs) -> float:
+    def _evaluate(self, trial, estimator, X_train, y_train, X_val, y_val, X_test=None, **kwargs) -> float:
         exp = trial.context.get('exp')
         assert exp is not None and isinstance(exp, CompeteExperiment)  # get experiment in Objective
         return np.random.random()

-    def _call_cross_validation(self, trial, estimators, X_tests, y_tests, **kwargs) -> float:
+    def _evaluate_cv(self, trial, estimator, X_trains, y_trains, X_vals, y_vals, X_test=None, **kwargs) -> float:
         exp = trial.context.get('exp')
         assert exp is not None and isinstance(exp, CompeteExperiment)  # get experiment in Objective
         return np.random.random()
@@ -245,7 +246,9 @@ def _call_cross_validation(self, trial, estimators, X_tests, y_tests, **kwargs)

 def test_moo_context():
     search_space = PlainSearchSpace(enable_dt=True, enable_lr=False, enable_nn=True)
-    rs = NSGAIISearcher(search_space, objectives=[ElapsedObjective(), PlainContextObjective()], population_size=10)
+    rs = NSGAIISearcher(search_space, objectives=[PredictionObjective.create("auc", task=const.TASK_BINARY),
+                                                  PlainContextObjective()],
+                        population_size=10)

     hyper_model = PlainModel(rs, task='binary', callbacks=[SummaryCallback()], transformer=MultiLabelEncoder)
@@ -269,7 +272,7 @@ def test_moo_context():
         'max_trials': 3,
     }
     from hypernets.tabular.metrics import metric_to_scoring
-    experiment = CompeteExperiment(hyper_model, X_train, y_train, scorer=metric_to_scoring("logloss"), **init_kwargs)
+    experiment = CompeteExperiment(hyper_model, X_train, y_train, scorer=metric_to_scoring("auc"), **init_kwargs)

     estimators = experiment.run(**run_kwargs)
diff --git a/hypernets/tests/model/test_objectives.py b/hypernets/tests/model/test_objectives.py
index b91e7c8..0114b48 100644
--- a/hypernets/tests/model/test_objectives.py
+++ b/hypernets/tests/model/test_objectives.py
@@ -6,7 +6,16 @@
 import pytest

-from hypernets.model.objectives import NumOfFeatures, PredictionPerformanceObjective, PredictionObjective, calc_psi
+from hypernets.core import set_random_state, get_random_state
+from hypernets.examples.plain_model import PlainSearchSpace, PlainModel
+from hypernets.model.objectives import NumOfFeatures, PredictionPerformanceObjective, PredictionObjective, calc_psi, \
+    PSIObjective, create_objective
+from hypernets.tabular.datasets import dsutils
+from hypernets.tabular.sklearn_ex import MultiLabelEncoder
+from hypernets.searchers import NSGAIISearcher
+from hypernets.searchers.genetic import create_recombination
+from hypernets.tests.searchers.test_nsga2_searcher import get_bankdata
+from hypernets.utils import const


 class BaseTestWithinModel:
@@ -25,7 +34,7 @@ def create_model(self):
         lr = DecisionTreeRegressor(max_depth=2)
         lr.fit(X_train, y_train)

-        return lr, X_test, y_test
+        return lr, X_train, X_test, y_train, y_test

     def create_cv_models(self):
         X_train, X_test, y_train, y_test = self.create_mock_dataset()
@@ -34,40 +43,55 @@ def create_cv_models(self):
         lr2 = DecisionTreeRegressor(max_depth=2).fit(X_train, y_train)
         lr3 = DecisionTreeRegressor(max_depth=3).fit(X_train, y_train)

-        return [lr1, lr2, lr3], [X_test] * 3, [y_test] * 3
+        return [lr1, lr2, lr3], [X_train] * 3, [y_train] * 3, [X_test] * 3, [y_test] * 3


 class TestNumOfFeatures(BaseTestWithinModel):

     def test_call(self):
-        lr, X_test, y_test = self.create_model()
+        lr, X_train, X_test, y_train, y_test = self.create_model()
         nof = NumOfFeatures()
-        score = nof.call(trial=None, estimator=lr, X_test=X_test, y_test=y_test)
+        score = nof.evaluate(trial=None, estimator=lr, X_val=X_test, y_val=y_test, X_train=None, y_train=None, X_test=None)
         assert score < 1  # only 2 features used

-        features = nof.get_used_features(estimator=lr, X_test=X_test)
+        features = nof.get_used_features(estimator=lr, X_data=X_test)
         assert 'log' in set(features) or 'exp' in set(features)

     def test_call_cross_validation(self):
-        estimators, X_tests, y_tests = self.create_cv_models()
+        estimators, X_trains, y_trains, X_tests, y_tests = self.create_cv_models()
         nof = NumOfFeatures()
-        score = nof.call_cross_validation(trial=None, estimators=estimators, X_tests=X_tests, y_tests=y_tests)
+        score = nof.evaluate_cv(trial=None, estimator=estimators[0], X_trains=X_trains, y_trains=y_trains,
+                                X_vals=X_tests, y_vals=y_tests, X_test=None)
         assert 0 < score < 1  # only 2 features used

-        features = nof.get_cv_used_features(estimators=estimators, X_tests=X_tests)
+        features = nof.get_cv_used_features(estimator=estimators[0], X_datas=X_tests)
         assert 'log' in set(features) or 'exp' in set(features)


+class FakeCVEstimator:
+
+    def __init__(self, estimators):
+        self.cv_models_ = estimators
+
+    def predict(self, *args, **kwargs):
+        return self.cv_models_[0].predict(*args, **kwargs)
+
+    def predict_proba(self, *args, **kwargs):
+        return self.cv_models_[0].predict_proba(*args, **kwargs)
+
+
 class TestPredictionPerformanceObjective(BaseTestWithinModel):

     def test_call(self):
-        lr, X_test, y_test = self.create_model()
+        lr, X_train, X_test, y_train, y_test = self.create_model()
         ppo = PredictionPerformanceObjective()
-        score = ppo.call(trial=None, estimator=lr, X_test=X_test, y_test=y_test)
+        score = ppo.evaluate(trial=None, estimator=lr, X_val=X_test, y_val=y_test, X_train=None, y_train=None, X_test=None)
         assert score is not None

     def test_call_cross_validation(self):
-        estimators, X_tests, y_tests = self.create_cv_models()
+        estimators, X_trains, y_trains, X_tests, y_tests = self.create_cv_models()
         ppo = PredictionPerformanceObjective()
-        score = ppo.call_cross_validation(trial=None, estimators=estimators, X_tests=X_tests, y_tests=y_tests)
+        score = ppo.evaluate_cv(trial=None, estimator=FakeCVEstimator(estimators),
+                                X_trains=None, y_trains=None, X_vals=X_tests, y_vals=y_tests, X_test=None)
         assert score is not None
@@ -108,10 +132,14 @@ def create_cv_objective(self, metric_name, force_minimize):
         estimator3 = FakeEstimator(class_=np.array([0, 1]), proba=y_proba3)
         estimators = [estimator1, estimator2, estimator3]
         X_tests = [pd.DataFrame(data=np.random.random((6, 2)), columns=['c1', 'c2'])] * 3
+        y_tests_array = np.random.binomial(n=1, p=0.5, size=(3, n_rows))
+        y_tests = []
+        for _ in y_tests_array:
+            y_tests.append(_)

         objective = PredictionObjective.create(name=metric_name, force_minimize=force_minimize)
-        score = objective.call_cross_validation(trial=None, estimators=estimators,
-                                                X_tests=X_tests, y_tests=y_trues)
+        score = objective.evaluate_cv(trial=None, estimator=FakeCVEstimator(estimators),
+                                      X_trains=None, y_trains=None, X_vals=X_tests, y_vals=y_tests, X_test=None)
         return objective, score

     @pytest.mark.parametrize('metric_name', ['logloss', 'auc', 'f1', 'precision', 'recall', 'accuracy'])
@@ -141,12 +169,11 @@ def test_create(self, metric_name: str, force_minimize: bool, cv: bool):
         assert score > 0


-class TestPSIObjective:
+class TestPSIObjective(BaseTestWithinModel):

     def test_calc_psi(self):
         x_array = np.random.random((100, 1))
         y_array = np.random.random((100, 1))
-
         psi1 = calc_psi(x_array, y_array, n_bins=10, eps=1e-6)
         psi2 = calc_psi(x_array * 10, y_array * 5, n_bins=10, eps=1e-6)
         assert psi1 > 0
@@ -154,3 +181,57 @@ def test_calc_psi(self):
         assert psi2 > psi1
         print(psi1)

+    def test_call(self):
+        lr, X_train, X_test, y_train, y_test = self.create_model()
+        po = PSIObjective(n_bins=10, task=const.TASK_REGRESSION, average='macro', eps=1e-6)
+        score = po.evaluate(trial=None, estimator=lr, X_val=None, y_val=None, X_train=X_train,
+                            y_train=y_train, X_test=X_test)
+        assert score is not None
+
+    def test_call_cross_validation(self):
+        estimators, X_trains, y_trains, X_tests, y_tests = self.create_cv_models()
+        ppo = PSIObjective(n_bins=10, task=const.TASK_REGRESSION, average='macro', eps=1e-6)
+        score = ppo.evaluate_cv(trial=None, estimator=estimators[0], X_trains=X_trains,
+                                y_trains=y_trains, X_vals=None, y_vals=None, X_test=X_tests[0])
+        assert score is not None
+
+    def test_search(self):
+        set_random_state(1234)
+        X_train, y_train, X_test, y_test = get_bankdata()
+        recombination_ins = create_recombination('shuffle', random_state=get_random_state())
+        search_space = PlainSearchSpace(enable_dt=True, enable_lr=False, enable_nn=True)
+        rs = NSGAIISearcher(search_space, objectives=[PredictionObjective.create('accuracy'),
+                                                      create_objective('psi')],
+                            recombination=recombination_ins, population_size=3)
+
+        # the given reward_metric is in order to ensure SOO working, make it's the same as metrics in MOO searcher
+        hk = PlainModel(rs, task='binary', transformer=MultiLabelEncoder, reward_metric='logloss')
+
+        hk.search(X_train, y_train, X_test, y_test, X_test=X_test.copy(), max_trials=5, cv=True)
+
+        len(hk.history.trials)
+        assert hk.get_best_trial()
+
+    def test_search_multi_classification(self):
+        set_random_state(1234)
+
+        df = dsutils.load_glass_uci()
+        df.columns = [f'x_{c}' for c in df.columns]
+        y = df.pop('x_10')
+
+        X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)
+
+        recombination_ins = create_recombination('shuffle', random_state=get_random_state())
+        search_space = PlainSearchSpace(enable_dt=True, enable_lr=False, enable_nn=True)
+        rs = NSGAIISearcher(search_space, objectives=[PredictionObjective.create('accuracy'),
+                                                      create_objective('psi')],
+                            recombination=recombination_ins, population_size=3)
+
+        # the given reward_metric is in order to ensure SOO working, make it's the same as metrics in MOO searcher
+        hk = PlainModel(rs, task='binary', transformer=MultiLabelEncoder, reward_metric='logloss')
+
+        hk.search(X_train, y_train, X_test, y_test, X_test=X_test.copy(), max_trials=5, cv=True)
+
+        len(hk.history.trials)
+        assert hk.get_best_trial()
+
diff --git a/hypernets/tests/searchers/test_nsga2_searcher.py b/hypernets/tests/searchers/test_nsga2_searcher.py
index 7a7e930..111531d 100644
--- a/hypernets/tests/searchers/test_nsga2_searcher.py
+++ b/hypernets/tests/searchers/test_nsga2_searcher.py
@@ -1,8 +1,8 @@
 import numpy as np
 import pytest

-from hypernets.model.objectives import ElapsedObjective,\
-    PredictionObjective, NumOfFeatures, PredictionPerformanceObjective
+from hypernets.model.objectives import ElapsedObjective, \
+    PredictionObjective, NumOfFeatures, PredictionPerformanceObjective, create_objective
 from hypernets.searchers.nsga_searcher import NSGAIISearcher, _NSGAIndividual, _RankAndCrowdSortSurvival, \
     _RDominanceSurvival, RNSGAIISearcher

@@ -94,13 +94,15 @@ class TestNSGA2:

     @pytest.mark.parametrize('recombination', ["shuffle", "uniform", "single_point"])
     @pytest.mark.parametrize('cv', [True, False])
+    # @pytest.mark.parametrize('objective', ['feature_usage', 'nf'])
     def test_nsga2_training(self, recombination: str, cv: bool):
+        objective = 'nf'
         set_random_state(1234)
         X_train, y_train, X_test, y_test = get_bankdata()
         recombination_ins = create_recombination(recombination, random_state=get_random_state())
         search_space = PlainSearchSpace(enable_dt=True, enable_lr=False, enable_nn=True)
         rs = NSGAIISearcher(search_space, objectives=[PredictionObjective.create('accuracy'),
-                                                      PredictionPerformanceObjective()],
+                                                      create_objective(objective)],
                             recombination=recombination_ins, population_size=3)

         # the given reward_metric is in order to ensure SOO working, make it's the same as metrics in MOO searcher
@@ -133,16 +135,15 @@ def test_non_consistent_direction(self):

 class TestRNSGA2:

-    # def setup_method(self):
-    #     set_random_state(1234)
-
     @pytest.mark.parametrize('recombination', ["shuffle", "uniform", "single_point"])
     @pytest.mark.parametrize('cv', [True, False])
-    def test_nsga2_training(self, recombination: str, cv: bool):
+    @pytest.mark.parametrize('objective', ['feature_usage', 'nf'])
+    def test_nsga2_training(self, recombination: str, cv: bool, objective: str):
         set_random_state(1234)
-        hk1 = self.run_nsga2_training(recombination=const.COMBINATION_SHUFFLE, cv=cv)
+        hk1 = self.run_nsga2_training(recombination=const.COMBINATION_SHUFFLE, cv=cv, objective=objective)
         pop1 = hk1.searcher.get_historical_population()
         scores1 = np.asarray([indi.scores for indi in pop1])
+        assert scores1.ndim == 2

         # test search process reproduce by setting random_state
         # set_random_state(1234)  # reset random state
@@ -152,25 +153,27 @@ def test_nsga2_training(self, recombination: str, cv: bool):
         #
         # assert (scores1 == scores2).all()

-    def reproce_nsga2_training(self):
-        set_random_state(1234)
-        hk1 = self.run_nsga2_training(recombination=const.COMBINATION_UNIFORM)
-        pop1 = hk1.searcher.get_historical_population()
-        scores1 = np.asarray([indi.scores for indi in pop1])
-
-        # test search process reproduce by setting random_state
-        set_random_state(1234)  # reset random state
-        hk2 = self.run_nsga2_training(recombination=const.COMBINATION_UNIFORM)
-        pop2 = hk2.searcher.get_historical_population()
-        scores2 = np.asarray([indi.scores for indi in pop2])
-
-        assert (scores1 == scores2).all()
+    # def reproce_nsga2_training(self):
+    #     set_random_state(1234)
+    #     hk1 = self.run_nsga2_training(recombination=const.COMBINATION_UNIFORM)
+    #     pop1 = hk1.searcher.get_historical_population()
+    #     scores1 = np.asarray([indi.scores for indi in pop1])
+    #
+    #     # test search process reproduce by setting random_state
+    #     set_random_state(1234)  # reset random state
+    #     hk2 = self.run_nsga2_training(recombination=const.COMBINATION_UNIFORM)
+    #     pop2 = hk2.searcher.get_historical_population()
+    #     scores2 = np.asarray([indi.scores for indi in pop2])
+    #
+    #     assert (scores1 == scores2).all()

-    def run_nsga2_training(self, recombination: str, cv: bool):
+    def run_nsga2_training(self, recombination: str, cv: bool, objective: str):
         random_state = get_random_state()
         X_train, y_train, X_test, y_test = get_bankdata()
         search_space = PlainSearchSpace(enable_dt=True, enable_lr=False, enable_nn=True)
-        rs = RNSGAIISearcher(search_space, objectives=[PredictionObjective.create('logloss'), NumOfFeatures()],
+
+        rs = RNSGAIISearcher(search_space, objectives=[PredictionObjective.create('logloss'),
+                                                       create_objective(objective)],
                              ref_point=[0.5, 0.5],
                              weights=[0.4, 0.6],
                              random_state=random_state,