Fast topological features (#1252)

* add fast topo * fix fast topo * Automated autopep8 fixes * pep8 * add to initial assumption * make code more clear in `fit` method * add `fast_topological_features` to docs * fix alignment for pipeline builders in `TSForecastingAssumptions` * add topo to `FedotBuilder` docs * fix table in docs * add params to fast_topo * change params and add it to tuner search space * fix integration tests * add new param stride * fix param * dirty speedup * delete ica from initial assumption due to instability * fix documentation * fix test * delete fast_topo from assumption and delete fast_train tag --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
aimclub · Jan 27, 2024 · 5efd0fb · 5efd0fb
1 parent 5e726e9
commit 5efd0fb
Show file tree

Hide file tree

Showing 9 changed files with 118 additions and 9 deletions.
diff --git a/docs/source/introduction/fedot_features/automation_features.rst b/docs/source/introduction/fedot_features/automation_features.rst
@@ -69,7 +69,8 @@ FEDOT supports bunch of dimensionality preprocessing operations that can be be a
  `one_hot_encoding`,One-Hot Encoder, Feature encoding
  `label_encoding`,Label Encoder, Feature encoding
  `resample`,Imbalanced binary class transformation in classification, Data transformation
- `topological_features`,Calculation of topological features, only for time series,Data transformation
+ `topological_features`,Calculation of topological features,Time series transformation
+ `fast_topological_features`,Fast calculation of part of topological features,Time series transformation
 
 
 .. csv-table:: Feature transformation operations implementations
@@ -105,7 +106,8 @@ FEDOT supports bunch of dimensionality preprocessing operations that can be be a
  `one_hot_encoding`,`sklearn.preprocessing.OneHotEncoder`,
  `label_encoding`,`sklearn.preprocessing.LabelEncoder`,`fast_train` `*tree`
  `resample`,`FEDOT model using sklearn.utils.resample`,
- `topological_features`,FEDOT model,`ts`
+ `topological_features`,FEDOT model,`ts`
+ `fast_topological_features`,FEDOT model,`ts`
 
 
 Models used

diff --git a/fedot/api/api_utils/assumptions/task_assumptions.py b/fedot/api/api_utils/assumptions/task_assumptions.py
@@ -52,7 +52,8 @@ class TSForecastingAssumptions(TaskAssumptions):
  def builders(self):
  return {
  'lagged_ridge':
- PipelineBuilder().add_sequence('lagged', 'ridge'),
+ PipelineBuilder()
+ .add_sequence('lagged', 'ridge'),
  'topological':
  PipelineBuilder()
  .add_node('lagged')

diff --git a/fedot/api/builder.py b/fedot/api/builder.py
@@ -329,6 +329,8 @@ def setup_pipeline_structure(
  - ``diff_filter`` -> Derivative Filter Transformation
  - ``cut`` -> Cut Transformation
  - ``exog_ts`` -> Exogenous Transformation
+ - ``topological_features`` -> Topological features
+ - ``fast_topological_features`` -> Fast implementation of topological features
 
  max_depth: max depth of a pipeline. Defaults to ``6``.
 

diff --git a/fedot/core/operations/evaluation/common_preprocessing.py b/fedot/core/operations/evaluation/common_preprocessing.py
@@ -8,6 +8,9 @@
 from fedot.core.operations.evaluation.operation_implementations.data_operations.sklearn_transformations import \
  ImputationImplementation, KernelPCAImplementation, NormalizationImplementation, PCAImplementation, \
  PolyFeaturesImplementation, ScalingImplementation, FastICAImplementation
+from fedot.core.operations.evaluation.operation_implementations.\
+ data_operations.topological.fast_topological_extractor import \
+ FastTopologicalFeaturesImplementation
 from fedot.core.operations.evaluation.operation_implementations.data_operations.topological. \
  topological_extractor import TopologicalFeaturesImplementation
 from fedot.core.operations.operation_parameters import OperationParameters
@@ -47,7 +50,8 @@ class FedotPreprocessingStrategy(EvaluationStrategy):
  'one_hot_encoding': OneHotEncodingImplementation,
  'label_encoding': LabelEncodingImplementation,
  'fast_ica': FastICAImplementation,
- 'topological_features': TopologicalFeaturesImplementation
+ 'topological_features': TopologicalFeaturesImplementation,
+ 'fast_topological_features': FastTopologicalFeaturesImplementation,
  }
 
  def __init__(self, operation_type: str, params: Optional[OperationParameters] = None):

diff --git a/...ation/operation_implementations/data_operations/topological/fast_topological_extractor.py b/...ation/operation_implementations/data_operations/topological/fast_topological_extractor.py
@@ -0,0 +1,59 @@
+from itertools import chain
+from typing import Optional
+
+import numpy as np
+from gph import ripser_parallel as ripser
+from joblib import Parallel, delayed
+
+from fedot.core.data.data import InputData, OutputData
+from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import \
+ DataOperationImplementation
+from fedot.core.operations.operation_parameters import OperationParameters
+
+
class FastTopologicalFeaturesImplementation(DataOperationImplementation):
    """Compute a compact set of persistent-homology features for each row.

    Neighbouring rows of the feature matrix are averaged in pairs, every
    averaged series is embedded with a sliding window, persistence diagrams
    are computed with ``ripser`` and each homology dimension is summarised by
    fixed quantiles of the bar lifetimes (death minus birth). The features of
    a pair are then assigned to both rows of that pair.

    Values read from ``params``:
        window_size_as_share: window length as a share of the row length.
        max_homology_dimension: value passed to ``ripser`` as ``maxdim``.
        metric: value passed to ``ripser`` as ``metric``.
        stride: step used to subsample every row before the embedding.
        n_jobs: number of ``joblib`` workers used in ``transform``.
    """

    def __init__(self, params: Optional[OperationParameters] = None):
        super().__init__(params)
        # NOTE(review): ``params`` defaults to None but is dereferenced right
        # away, so a parameters object is effectively required -- confirm
        # whether the base class is expected to supply a default instead.
        self.window_size_as_share = params.get('window_size_as_share')
        self.max_homology_dimension = params.get('max_homology_dimension')
        self.metric = params.get('metric')
        self.stride = params.get('stride')
        self.n_jobs = params.get('n_jobs')
        self.quantiles = (0.1, 0.25, 0.5, 0.75, 0.9)
        # Number of features produced per homology dimension.
        self._shape = len(self.quantiles)
        self._window_size = None

    def fit(self, input_data: InputData):
        """Derive the sliding-window length from the row length of the data.

        The window is ``window_size_as_share`` of the row length, kept within
        ``[2, row_length - 2]`` and never allowed to drop below one point.

        :param input_data: data whose ``features`` is a 2D array
        :return: the fitted operation itself
        """
        row_length = input_data.features.shape[1]
        window = int(row_length * self.window_size_as_share)
        window = min(max(window, 2), row_length - 2)
        # Rows with fewer than three points would otherwise yield a zero or
        # negative window and therefore an empty embedding.
        self._window_size = max(window, 1)
        return self

    def transform(self, input_data: InputData) -> OutputData:
        """Build the topological feature matrix for ``input_data.features``.

        NOTE(review): the annotation says ``OutputData`` but a plain array is
        returned; presumably the caller wraps it -- confirm.

        :param input_data: data whose ``features`` is a 2D array
        :return: array with one row of features per input row; NaN and
            infinite values are replaced by zero
        """
        features = input_data.features
        n_rows = features.shape[0]
        with Parallel(n_jobs=self.n_jobs, prefer='processes') as parallel:
            # One task per pair of neighbouring rows: the pair is averaged and
            # subsampled with ``stride`` before feature extraction.
            pair_features = parallel(
                delayed(self._extract_features)(np.mean(features[i:i + 2, ::self.stride], axis=0))
                for i in range(0, n_rows, 2))
        # Give both rows of a pair the same features and drop the surplus row
        # that appears when the number of rows is odd.
        result = np.repeat(np.asarray(pair_features), 2, axis=0)[:n_rows]
        np.nan_to_num(result, copy=False, nan=0, posinf=0, neginf=0)
        return result

    def _extract_features(self, x):
        """Return lifetime quantiles per homology dimension for one series.

        :param x: 1D array with the (already subsampled) series
        :return: 1D array of ``len(quantiles) * (max_homology_dimension + 1)``
            values; dimensions with an empty diagram stay zero
        """
        # The window comes from the unstrided row length, so after striding
        # the series may be shorter than the window; cap it to keep at least
        # one window in the embedding.
        window = min(self._window_size, x.shape[0])
        x_sliced = np.array([x[i:window + i] for i in range(x.shape[0] - window + 1)])
        x_processed = ripser(x_sliced,
                             maxdim=self.max_homology_dimension,
                             coeff=2,
                             metric=self.metric,
                             n_threads=1,
                             collapse_edges=False)["dgms"]
        result = np.zeros(self._shape * (self.max_homology_dimension + 1))
        for i, xp in enumerate(x_processed):
            if xp.shape[0] > 0:
                # Lifetime of every bar in the diagram: death - birth.
                lifetimes = xp[:, 1] - xp[:, 0]
                result[i * self._shape:(i + 1) * self._shape] = np.quantile(lifetimes, self.quantiles,
                                                                            overwrite_input=True, method='hazen')
        return result
diff --git a/fedot/core/pipelines/tuning/search_space.py b/fedot/core/pipelines/tuning/search_space.py
@@ -768,6 +768,22 @@ def get_parameters_dict(self):
  'sampling-scope': [0.9, 0.99],
  'type': 'continuous'}
  },
+ 'fast_topological_features': {
+ 'window_size_as_share': {
+ 'hyperopt-dist': hp.uniform,
+ 'sampling-scope': [0.1, 0.9],
+ 'type': 'continuous'
+ },
+ 'max_homology_dimension': {
+ 'hyperopt-dist': hp.uniformint,
+ 'sampling-scope': [1, 5],
+ 'type': 'discrete'
+ },
+ 'metric': {
+ 'hyperopt-dist': hp.choice,
+ 'sampling-scope': [['euclidean', 'manhattan', 'cosine']],
+ 'type': 'categorical'}
+ }
  }
 
  if self.custom_search_space is not None:

diff --git a/fedot/core/repository/data/data_operation_repository.json b/fedot/core/repository/data/data_operation_repository.json
@@ -256,6 +256,20 @@
  "presets": [
  "ts"
  ],
+ "input_type": "[DataTypesEnum.table]",
+ "output_type": "[DataTypesEnum.table]",
+ "tags": [
+ "non_applicable_for_ts",
+ "feature_space_transformation"
+ ]
+ },
+ "fast_topological_features": {
+ "meta": "custom_ts_preprocessing",
+ "presets": [
+ "ts"
+ ],
+ "input_type": "[DataTypesEnum.table]",
+ "output_type": "[DataTypesEnum.table]",
  "tags": [
  "non_applicable_for_ts",
  "feature_space_transformation"

diff --git a/fedot/core/repository/data/default_operation_params.json b/fedot/core/repository/data/default_operation_params.json
@@ -160,5 +160,12 @@
  },
  "topological_features": {
  "n_jobs": -1
+ },
+ "fast_topological_features": {
+ "n_jobs": 1,
+ "window_size_as_share": 0.66,
+ "max_homology_dimension": 1,
+ "metric": "euclidean",
+ "stride": 1
  }
 }
diff --git a/test/integration/models/test_model.py b/test/integration/models/test_model.py
@@ -54,16 +54,16 @@ def get_data_for_testing(task_type, data_type, length=100, features_count=1,
  return None
 
  if task_type is TaskTypesEnum.ts_forecasting:
- task = Task(task_type, TsForecastingParams(max(length // 10, 2)))
+ forecast_length = max(length // 10, 2)
+ task = Task(task_type, TsForecastingParams(forecast_length))
  if data_type is DataTypesEnum.ts:
  features = np.zeros(length) + value
  else:
  features = np.zeros((length, features_count)) + value
  if data_type is DataTypesEnum.table:
- target = np.zeros(length) + value
+ target = np.zeros((length, forecast_length)) + value
  else:
  target = features
-
  else:
  task = Task(task_type)
  data_type = DataTypesEnum.table
@@ -156,11 +156,15 @@ def fit_time_for_operation(operation: OperationMetaInfo,
  return perf_counter() - start_time
 
  for task_type in operation.task_type:
- for data_type in operation.input_types:
+ input_types = operation.input_types
+ if task_type is TaskTypesEnum.ts_forecasting:
+ if operation.input_types == [DataTypesEnum.table]:
+ input_types = [DataTypesEnum.ts]
+ for data_type in input_types:
  perfomance_values = []
  for length in data_lengths:
  data = get_data_for_testing(task_type, data_type,
- length=length, features_count=2,
+ length=length, features_count=10,
  random=True)
  if data is not None:
  min_evaluated_time = min(fit_time_for_operation(operation, data) for _ in range(times))