experiment support psi and feature_usage
oaksharks committed May 5, 2023
1 parent 1c3e870 commit f73040d
Showing 18 changed files with 350 additions and 181 deletions.
48 changes: 38 additions & 10 deletions hypernets/core/objective.py
@@ -3,26 +3,54 @@


class Objective(metaclass=abc.ABCMeta):
""" Objective = Indicator metric + Direction
"""
""" Objective = Indicator metric + Direction"""

def __init__(self, name, direction):
def __init__(self, name, direction, need_train_data=False, need_val_data=True, need_test_data=False):
self.name = name
self.direction = direction
self.need_train_data = need_train_data
self.need_val_data = need_val_data
self.need_test_data = need_test_data

def evaluate(self, trial, estimator, X_train, y_train, X_val, y_val, X_test=None, **kwargs) -> float:
if self.need_test_data:
assert X_test is not None, "need test data"

if self.need_train_data:
assert X_train is not None and y_train is not None, "need train data"

if self.need_val_data:
assert X_val is not None and y_val is not None, "need validation data"

return self._evaluate(trial, estimator, X_train, y_train, X_val, y_val, X_test=X_test, **kwargs)

@abc.abstractmethod
def call(self, trial, estimator, X_eval, y_val, X_train, y_train, X_test, **kwargs) -> float:
def _evaluate(self, trial, estimator, X_train, y_train, X_val, y_val, X_test=None, **kwargs) -> float:
raise NotImplementedError

def call_cross_validation(self, trial, estimators, X_tests, y_tests, **kwargs) -> float:
assert len(estimators) == len(X_tests) == len(y_tests)
def evaluate_cv(self, trial, estimator, X_trains, y_trains,
X_vals, y_vals, X_test=None, **kwargs) -> float:

if self.need_test_data:
assert X_test is not None, "need test data"

if self.need_train_data:
assert X_trains is not None and y_trains is not None, "need train data"
assert len(X_trains) == len(y_trains)

if self.need_val_data:
assert X_vals is not None and y_vals is not None, "need validation data"
assert len(X_vals) == len(y_vals)

return self._call_cross_validation(trial=trial, estimators=estimators, X_tests=X_tests,
y_tests=y_tests, **kwargs)
return self._evaluate_cv(trial=trial, estimator=estimator, X_trains=X_trains, y_trains=y_trains,
X_vals=X_vals, y_vals=y_vals, X_test=X_test, **kwargs)

@abc.abstractmethod
def _call_cross_validation(self, trial, estimators, X_tests, y_tests, **kwargs) -> float:
def _evaluate_cv(self, trial, estimator, X_trains, y_trains, X_vals, y_vals, X_test=None, **kwargs) -> float:
raise NotImplementedError

def __repr__(self):
return f"{self.__class__.__name__}(name={self.name}, direction={self.direction})"
return f"{self.__class__.__name__}(name={self.name}, direction={self.direction}," \
f" need_train_data={self.need_train_data}," \
f" need_val_data={self.need_val_data}," \
f" need_test_data={self.need_test_data})"
15 changes: 10 additions & 5 deletions hypernets/core/searcher.py
@@ -29,18 +29,23 @@ def set_meta_learner(self, meta_learner):
def parallelizable(self):
return False

def sample(self):
def sample(self, space_options=None):
raise NotImplementedError

def _random_sample(self):
space_sample = self.space_fn()
def _random_sample(self, **space_kwargs):
if space_kwargs is None:
space_kwargs = {}
space_sample = self.space_fn(**space_kwargs)
space_sample.random_sample()
return space_sample

def _sample_and_check(self, sample_fn):
def _sample_and_check(self, sample_fn, space_options=None):
if space_options is None:
space_options = {}

counter = 0
while True:
space_sample = sample_fn()
space_sample = sample_fn(**space_options)
counter += 1
if counter >= 1000:
raise ValueError('Unable to take valid sample and exceed the retry limit 1000.')
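
A hedged sketch of how a concrete searcher can thread the new space_options argument through to the space function. The class name is hypothetical; construction follows the existing Searcher base class and is not shown.

from hypernets.core.searcher import Searcher


class RandomSearcherSketch(Searcher):
    def sample(self, space_options=None):
        # space_options, e.g. dict(importances=[('f0', 120.0), ...]), is expanded
        # into the space_fn call by _random_sample via _sample_and_check
        return self._sample_and_check(self._random_sample, space_options=space_options)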
15 changes: 13 additions & 2 deletions hypernets/dispatchers/in_process_dispatcher.py
@@ -5,7 +5,8 @@
from ..core.callbacks import EarlyStoppingError
from ..core.dispatcher import Dispatcher
from ..core.trial import Trial
from ..utils import logging, fs
from ..tabular import get_tool_box
from ..utils import logging, fs, const

logger = logging.get_logger(__name__)

@@ -24,10 +25,20 @@ def dispatch(self, hyper_model, X, y, X_eval, y_eval, X_test, cv, num_folds, max
trial_no = 1
retry_counter = 0

importances = None
if hyper_model.searcher.kind() == const.SEARCHER_MOO:
if 'feature_usage' in [_.name for _ in hyper_model.searcher.objectives]:
tb = get_tool_box(X, y)
preprocessor = tb.general_preprocessor(X)
estimator = tb.general_estimator(X, y, task=hyper_model.task)
estimator.fit(preprocessor.fit_transform(X, y), y)
importances = list(zip(estimator.feature_name_, estimator.feature_importances_))

while trial_no <= max_trials:
gc.collect()
try:
space_sample = hyper_model.searcher.sample()
space_options = dict(importances=importances)
space_sample = hyper_model.searcher.sample(space_options=space_options)
if hyper_model.history.is_existed(space_sample):
if retry_counter >= retry_limit:
logger.info(f'Unable to take valid sample and exceed the retry limit {retry_limit}.')
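
The hand-off this hunk sets up, as a hypothetical helper: when a 'feature_usage' objective is present, the dispatcher precomputes (feature_name, importance) pairs once and passes them into every sampled space. The helper name and the example values are illustrative.

def sample_with_importances(searcher, importances):
    # importances: list of (feature_name, importance) pairs,
    # e.g. [('f0', 120.0), ('f1', 35.0), ('f2', 0.0)];
    # None leaves the search space without the feature-selection step
    return searcher.sample(space_options=dict(importances=importances))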
46 changes: 41 additions & 5 deletions hypernets/examples/plain_model.py
@@ -13,8 +13,19 @@
from hypernets.core.ops import ModuleChoice, HyperInput, ModuleSpace
from hypernets.core.search_space import HyperSpace, Choice, Int, Real, Cascade, Constant, HyperNode
from hypernets.model import Estimator, HyperModel
from hypernets.tabular import get_tool_box
from hypernets.utils import fs, logging, const
from hypernets.tabular import get_tool_box, column_selector
from hypernets.utils import fs, const


from hypernets.core import randint
from hypernets.core.ops import ModuleChoice, HyperInput
from hypernets.core.search_space import HyperSpace, Choice, Int, Real
from hypernets.pipeline.base import DataFrameMapper
from hypernets.pipeline.transformers import FeatureImportanceSelection

from hypernets.utils import logging



logger = logging.get_logger(__name__)

@@ -114,6 +125,21 @@ def _cascade(fn, key, args, space):
kvalue = kvalue.value
return fn(kvalue)

def create_feature_selection(self, hyper_input, importances, seq_no=0):
from hypernets.pipeline.base import Pipeline

selection = FeatureImportanceSelection(name=f'feature_importance_selection_{seq_no}',
importances=importances,
quantile=Real(0, 1, step=0.1))
pipeline = Pipeline([selection],
name=f'feature_selection_{seq_no}',
columns=column_selector.column_all)(hyper_input)

preprocessor = DataFrameMapper(default=False, input_df=True, df_out=True,
df_out_dtype_transforms=None)([pipeline])

return preprocessor

# HyperSpace
def __call__(self, *args, **kwargs):
space = HyperSpace()
@@ -130,9 +156,14 @@ def __call__(self, *args, **kwargs):
estimators.append(self.lr)
if self.enable_nn:
estimators.append(self.nn)

modules = [ModuleSpace(name=f'{e["cls"].__name__}', **e) for e in estimators]
outputs = ModuleChoice(modules)(hyper_input)

if "importances" in kwargs and kwargs["importances"] is not None:
importances = kwargs.pop("importances")
ss = self.create_feature_selection(hyper_input, importances)
outputs = ModuleChoice(modules)(ss)
else:
outputs = ModuleChoice(modules)(hyper_input)
space.set_inputs(hyper_input)

return space
@@ -210,6 +241,8 @@ def fit_cross_validation(self, X, y, stratified=True, num_folds=3, shuffle=False
cv_models = []
x_vals = []
y_vals = []
X_trains = []
y_trains = []
logger.info('start training')
for n_fold, (train_idx, valid_idx) in enumerate(iterators.split(X, y)):
x_train_fold, y_train_fold = X.iloc[train_idx], y[train_idx]
@@ -247,8 +280,11 @@ def fit_cross_validation(self, X, y, stratified=True, num_folds=3, shuffle=False
oof_[valid_idx] = proba
oof_scores.append(fold_scores)
cv_models.append(fold_model)

x_vals.append(x_val_fold)
y_vals.append(y_val_fold)
X_trains.append(x_train_fold)
y_trains.append(y_train_fold)

self.classes_ = getattr(cv_models[0], 'classes_', None)
self.cv_ = True
@@ -260,7 +296,7 @@ def fit_cross_validation(self, X, y, stratified=True, num_folds=3, shuffle=False

# return
oof_, = tb_original.from_local(oof_)
return scores, oof_, oof_scores, x_vals, y_vals
return scores, oof_, oof_scores, X_trains, y_trains, x_vals, y_vals

def predict(self, X, **kwargs):
eval_set = kwargs.pop('eval_set', None) # ignore
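
A usage sketch of the importances-aware search space above. The PlainSearchSpace name and its constructor flags are assumed from the enable_lr/enable_nn attributes referenced in this file, and the importance values are illustrative.

from hypernets.examples.plain_model import PlainSearchSpace

importances = [('f0', 0.7), ('f1', 0.2), ('f2', 0.1)]  # illustrative scores

space_fn = PlainSearchSpace(enable_lr=True, enable_nn=False)
space_sample = space_fn(importances=importances)  # inserts the FeatureImportanceSelection step
space_sample.random_sample()                      # draws the quantile and estimator hyperparameters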
3 changes: 2 additions & 1 deletion hypernets/experiment/_experiment.py
@@ -8,7 +8,7 @@
from IPython.display import display

from hypernets.dispatchers.cfg import DispatchCfg
from hypernets.utils import logging, df_utils
from hypernets.utils import logging

logger = logging.get_logger(__name__)

Expand Down Expand Up @@ -68,6 +68,7 @@ def __init__(self, hyper_model, X_train, y_train, X_eval=None, y_eval=None, X_te
self.model_ = None

def get_data_character(self):
from hypernets.utils import df_utils
data_character = df_utils.get_data_character(self.hyper_model, self.X_train, self.y_train, self.X_eval,
self.y_eval, self.X_test, self.task)
return data_character
16 changes: 8 additions & 8 deletions hypernets/model/hyper_model.py
@@ -87,14 +87,13 @@ def _run_trial(self, space_sample, trial_no, X, y, X_eval, y_eval, X_test=None,
oof_scores = None
x_vals = None
y_vals = None
X_trains = None
y_trains = None
try:
if cv:
scores, oof, oof_scores, x_vals, y_vals = estimator.fit_cross_validation(X, y, stratified=True,
num_folds=num_folds,
shuffle=False,
random_state=9527,
metrics=metrics,
**fit_kwargs)
ret_data = estimator.fit_cross_validation(X, y, stratified=True, num_folds=num_folds, shuffle=False,
random_state=9527, metrics=metrics, **fit_kwargs)
scores, oof, oof_scores, X_trains, y_trains, x_vals, y_vals = ret_data
else:
estimator.fit(X, y, **fit_kwargs)
succeeded = True
@@ -123,10 +122,11 @@ def _run_trial(self, space_sample, trial_no, X, y, X_eval, y_eval, X_test=None,
else:
if cv:
assert x_vals is not None and y_vals is not None
reward = [fn.call_cross_validation(trial, estimator.cv_models_, x_vals, y_vals)
reward = [fn.evaluate_cv(trial, estimator, X_trains, y_trains,
x_vals, y_vals, X_test)
for fn in self.searcher.objectives]
else:
reward = [fn.call(trial, estimator, X_eval, y_eval, X, y, X_test) for fn in self.searcher.objectives]
reward = [fn.evaluate(trial, estimator, X_train=X, y_train=y,
                      X_val=X_eval, y_val=y_eval, X_test=X_test)
          for fn in self.searcher.objectives]

trial.reward = reward
trial.iteration_scores = estimator.get_iteration_scores()
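
Spelled out with keyword arguments, the non-CV branch above reduces to the following hypothetical helper (not part of the commit); it makes the mapping explicit: X/y are the search's training data and X_eval/y_eval its evaluation data.

def compute_reward(trial, estimator, objectives, X, y, X_eval, y_eval, X_test=None):
    # one scalar per objective, in the searcher's objective order
    return [obj.evaluate(trial, estimator, X_train=X, y_train=y,
                         X_val=X_eval, y_val=y_eval, X_test=X_test)
            for obj in objectives]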