From 7a1da9df216503b5b0a4b7f1881cb2aea71951d1 Mon Sep 17 00:00:00 2001 From: Maxim Voronin Date: Mon, 10 Jul 2023 21:22:12 +0300 Subject: [PATCH] Refactored the code, added a description --- roerich/algorithms/calc_metrics.py | 74 +++++++++++++++++++++--------- roerich/algorithms/cpdc.py | 2 +- roerich/change_point/__init__.py | 4 +- roerich/scores/__init__.py | 5 +- roerich/scores/energy.py | 10 ++++ 5 files changed, 68 insertions(+), 27 deletions(-) create mode 100644 roerich/scores/energy.py diff --git a/roerich/algorithms/calc_metrics.py b/roerich/algorithms/calc_metrics.py index 7a3883f..797d217 100644 --- a/roerich/algorithms/calc_metrics.py +++ b/roerich/algorithms/calc_metrics.py @@ -1,38 +1,68 @@ -from abc import ABCMeta, abstractmethod -import numpy as np -import pandas as pd -from sklearn.metrics import pairwise_distances, roc_curve, roc_auc_score -from sklearn.model_selection import StratifiedKFold, KFold, train_test_split -from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis -from copy import deepcopy -from joblib import Parallel, delayed -from scipy import interpolate -from scipy.signal import argrelmax - from .cpdc import ChangePointDetectionBase from roerich.scores.fd import frechet_distance from roerich.scores.mmd import maximum_mean_discrepancy +from roerich.scores.energy import energy_distance -class ScoreCalculator(ChangePointDetectionBase): +class SlidingWindows(ChangePointDetectionBase): - def __init__(self, metric=None, func=None, periods=1, window_size=100, step=1, n_runs=1): + def __init__(self, metric=None, periods=1, window_size=100, step=1, n_runs=1): super().__init__(periods=periods, window_size=window_size, step=step, n_runs=n_runs) self.metric = metric - self.func = func + + """ + Change point detection algorithm based on binary classification. + + Parameters: + ----------- + periods: int, default=1 + Number of consecutive observations of a time series, considered as one input vector. 
+ The signal is considered as an autoregression process (AR) for classification. In most cases periods=1 + will be a good choice. + + window_size: int, default=100 + Number of consecutive observations of a time series in test and reference + windows. Recommendation: select the value so that there is only one change point within 2*window_size + observations of the signal. + + step: int, default=1 + Algorithm estimates change point detection score for each observation. step > 1 helps + to speed up the algorithm. + + n_runs: int, default=1 + Number of times the binary classifier runs on each pair of test and reference + windows. Observations in the windows are divided randomly between train and validation sample every time. + n_runs > 1 helps to reduce noise in the change point detection score. + + metric: str/function, default=None + Function that gives the measure of dissimilarity between data points in windows. + Metric should be one of: "energy", "fd", "mmd"; or a function should be passed. + Function must be in the following format: + + Parameters: + ----------- + X_ref: numpy.ndarray + Matrix of reference observations. + X_test: numpy.ndarray + Matrix of test observations. + + Returns: + -------- + score: float + Estimated change point detection score for a pair of windows. 
+ + """ def reference_test_predict(self, X_ref, X_test): - if self.metric == "EnergyDist": - n = X_ref.shape[0] - E = 2*pairwise_distances(X_ref, X_test, metric='euclidean') - pairwise_distances(X_test, metric='euclidean') - pairwise_distances(X_ref, metric='euclidean') - return np.sum(E) / n ** 2 - elif self.metric == "FrechetDist": + if self.metric == "energy": + return energy_distance(X_ref, X_test) + elif self.metric == "fd": return frechet_distance(X_ref, X_test) - elif self.metric == "MaxMeanDisc": + elif self.metric == "mmd": return maximum_mean_discrepancy(X_ref, X_test) - elif self.func is not None: - return self.func(X_ref, X_test) + elif callable(self.metric): + return self.metric(X_ref, X_test) else: raise ValueError("metric should be one of: EnergyDist, FrechetDist, MaxMeanDisc; or a function should be " "passed") diff --git a/roerich/algorithms/cpdc.py b/roerich/algorithms/cpdc.py index 3320c7a..435ac7d 100644 --- a/roerich/algorithms/cpdc.py +++ b/roerich/algorithms/cpdc.py @@ -115,7 +115,7 @@ def reference_test_predict(self, X_ref, X_test): X_test: numpy.ndarray Matrix of test observations. - Retunrs: + Returns: -------- score: float Estimated change point detection score for a pair of window. 
diff --git a/roerich/change_point/__init__.py b/roerich/change_point/__init__.py index b8f5231..b56c3bf 100644 --- a/roerich/change_point/__init__.py +++ b/roerich/change_point/__init__.py @@ -2,7 +2,7 @@ from ..algorithms.cpdc import ChangePointDetectionClassifier, ChangePointDetectionRuLSIF from ..algorithms.cpdc_cv import ChangePointDetectionClassifierCV from ..algorithms.enrg_dist import EnergyDistanceCalculator -from ..algorithms.calc_metrics import ScoreCalculator +from ..algorithms.calc_metrics import SlidingWindows __all__ = [ @@ -12,7 +12,7 @@ 'ChangePointDetectionRuLSIF', 'ChangePointDetectionClassifierCV', 'EnergyDistanceCalculator', - 'ScoreCalculator' + 'SlidingWindows' ] diff --git a/roerich/scores/__init__.py b/roerich/scores/__init__.py index c767f61..300043e 100644 --- a/roerich/scores/__init__.py +++ b/roerich/scores/__init__.py @@ -1,10 +1,11 @@ from .mmd import maximum_mean_discrepancy from .fd import frechet_distance - +from .energy import energy_distance __all__ = [ 'maximum_mean_discrepancy', - 'frechet_distance' + 'frechet_distance', + 'energy_distance' ] diff --git a/roerich/scores/energy.py b/roerich/scores/energy.py new file mode 100644 index 0000000..472a13b --- /dev/null +++ b/roerich/scores/energy.py @@ -0,0 +1,10 @@ +import numpy as np +from sklearn.metrics import pairwise_distances + + +def energy_distance(x, y): + n = x.shape[0] + e = 2*pairwise_distances(x, y, metric='euclidean')\ + - pairwise_distances(y, metric='euclidean') \ + - pairwise_distances(x, metric='euclidean') + return np.sum(e) / n ** 2