Skip to content

Commit

Permalink
Merge pull request #18 from HSE-LAMBDA/cvupd
Browse files Browse the repository at this point in the history
cv update
  • Loading branch information
hushchyn-mikhail committed Jul 3, 2023
2 parents 0118728 + a93c3ec commit cd9282b
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 64 deletions.
168 changes: 105 additions & 63 deletions roerich/algorithms/cpdc_cv.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,108 @@
from abc import ABCMeta, abstractmethod
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from copy import deepcopy
from joblib import Parallel, delayed
from scipy import interpolate
from scipy.signal import argrelmax

from .cpdc import ChangePointDetectionBase, KL, KL_sym, JSD, PE, PE_sym
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

from .cpdc import ChangePointDetectionClassifier

# CV Classification
class ChangePointDetectionClassifierCV(ChangePointDetectionBase):
class ChangePointDetectionClassifierCV(ChangePointDetectionClassifier):

def __init__(self, base_classifier=QuadraticDiscriminantAnalysis(), metric="KL_sym",
def __init__(self, base_classifier='mlp', metric="klsym",
periods=1, window_size=100, step=1, n_runs=1, n_splits=5):

"""
Change point detection algorithm based on binary classififcation with cross validation.
Change point detection algorithm based on binary classification [1] with Stratified KFold cross validation.
It takes to sliding windows (reference and test) in a signal, and separate them using the classifier.
The classification quality is considered as a change point detection score.
[1] Mikhail Hushchyn and Andrey Ustyuzhanin. “Generalization of Change-Point Detection in Time Series Data Based on Direct Density Ratio Estimation.” J. Comput. Sci. 53 (2021): 101385.
Parameters:
-----------
base_classifier: object
Sklearn-like binary classifier.
metric: string
Name of the metric, that is used to measure the classifier quality and
considered as change point detection score. Default: "KL_sym".
periods: int
base_classifier: {'logreg', 'qda', 'dt', 'rf', 'mlp', 'knn', 'nb'} or callable, default='mlp'
Sklearn-like binary classifier to separate reference and test sliding windows in the signal.
- 'logreg', Logistic Regression classifier,
sklearn.linear_model.LogisticRegression()
- 'qda', Quadratic Discriminant Analysis classifier,
sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis(store_covariance=True, reg_param=0.01)
- 'dt', Decision Tree classifier,
sklearn.tree.DecisionTreeClassifier(min_samples_leaf=10, max_depth=6)
- 'rf', Random Forest classifier,
sklearn.ensemble.RandomForestClassifier((n_estimators=100, min_samples_leaf=10))
- 'mlp', Multilayer Perceptron classifier,
sklearn.neural_network.MLPClassifier(hidden_layer_sizes=(100,100), solver="adam", activation="relu", learning_rate_init=0.1, max_iter=50, alpha=1.)
- 'knn', K Neighbors classifier,
sklearn.neighbors.KNeighborsClassifier(n_neighbors=10)
- 'nb', Gaussian Naive Bayes classifier,
sklearn.naive_bayes.GaussianNB(var_smoothing=0.01)
- Callable sklearn-like classifier,
Example: base_classifier = LogisticRegression()
metric: {'klsym', 'pesym', 'jsd', 'mmd', 'fd'} or callable, default='klsym'.
{'KL', 'PE'} will be deprecated in future versions.
Name of a score function, that is used to measure the classifier quality based on predictions
for reference (p_ref) and test (p_test) windows. It is considered as change point detection score.
- 'klsym' or 'KL_sym', symmetric Kullback-Leibler (KL) divergence,
KL(p_test||p_ref) + KL(p_ref||p_test)
- 'pesym' or 'PE_sym', symmetric Pearson (PE) divergence,
PE(p_test||p_ref) + PE(p_ref||p_test)
- 'jsd' or 'JSD', Jensen–Shannon divergence (JSD),
JSD(p_test||p_ref)
- 'mmd', the Maximum Mean Discrepancy (MMD),
MMD(p_test, p_ref)
- 'fd', the Frechet Distance (FD),
FD(p_test, p_ref)
- Callable function,
Example: metric = roerich.scores.frechet_distance
periods: int, default=1
Number of consecutive observations of a time series, considered as one input vector.
window_size: int
Number of consecutive observations of a time series in test and reference windows.
step: int
Algorithm estimates change point detection score for each <step> observation.
n_runs: int
Number of times, the binary classifier runs on each pair of test and reference windows.
n_splits: int
Number of splits for cross validation
The signal is considered as an autoregression process (AR) for classification. In the most cases periods=1
will be a good choice.
window_size: int, default=100
Number of consecutive observations of a time series in test and reference
windows. Recommendation: select the value so that there is only one change point within 2*window_size
observations of the signal.
step: int, default=1
Algorithm estimates change point detection score for each <step> observation. step > 1 helps
to speed up the algorithm.
n_runs: int, default=1
Number of times, the binary classifier runs on each pair of test and reference
windows. Observations in the windows are divided randomly between train and validation sample every time.
n_runs > 1 helps to reduce noise in the change point detection score.
n_splits: int, default=5
Number of splits for KFold cross validation.
"""

super().__init__(periods, window_size, step, n_runs)
self.base_classifier = base_classifier
self.metric = metric
super().__init__(base_classifier=base_classifier, metric=metric, periods=periods,
window_size=window_size, step=step, n_runs=n_runs)
self.n_splits = n_splits


def densratio(self, y, alpha=10**-3):
w = (y + alpha) / (1 - y + alpha)
return w



@ignore_warnings(category=ConvergenceWarning)
def reference_test_predict(self, X_ref, X_test):
"""
Estimate change point detection score for a pair of test and reference windows.
Expand All @@ -73,39 +126,28 @@ def reference_test_predict(self, X_ref, X_test):
y = np.hstack((y_ref, y_test))

kf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=np.random.randint(0, 1000))
scores = []
y_pred_glob = []
y_test_glob = []

for i, (train_index, test_index) in enumerate(kf.split(X, y)):
X_train, X_test, y_train, y_test = (X[train_index],
X[test_index],
y[train_index],
y[test_index])


ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

classifier = deepcopy(self.base_classifier)
classifier.fit(X_train, y_train)
y_pred = classifier.predict_proba(X_test)[:, 1]

ref_preds = y_pred[y_test == 0]
test_preds = y_pred[y_test == 1]
ratios = self.densratio(y_pred)
ref_ratios = ratios[y_test == 0]
test_ratios = ratios[y_test == 1]

if self.metric == "KL":
score = KL(ref_preds, test_preds)
elif self.metric == "KL_sym":
score = KL_sym(ref_preds, test_preds)
elif self.metric == "JSD":
score = JSD(ref_preds, test_preds)
elif self.metric == "PE":
score = PE(ref_preds, test_preds)
elif self.metric == "PE_sym":
score = PE_sym(ref_preds, test_preds)
elif self.metric == "ROCAUC":
score = 2 * (roc_auc_score(y_test, y_pred) - 0.5)
else:
score = 0

scores.append(score)

return np.mean(scores)
y_pred_glob.append(y_pred)
y_test_glob.append(y_test)

y_pred = np.concatenate(tuple(y_pred_glob), axis=0)
y_test = np.concatenate(tuple(y_test_glob), axis=0)
score = self.metric(y_pred[y_test == 0], y_pred[y_test == 1])

return score
3 changes: 2 additions & 1 deletion tests/test_change_point.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ def test_base(model):
cp = model()
score, cps_pred = cp.predict(X)

cpdc = [change_point.ChangePointDetectionClassifier]
cpdc = [change_point.ChangePointDetectionClassifier,
change_point.ChangePointDetectionClassifierCV]
@pytest.mark.parametrize("model", cpdc)
def test_base_classifier(model):
# generate time series
Expand Down

0 comments on commit cd9282b

Please sign in to comment.