# symbolization.py
import random
import numpy as np
import pandas as pd
from joblib import cpu_count
from scipy.spatial.distance import pdist, squareform
from scipy.stats import norm
from sklearn.base import BaseEstimator
from sklearn.cluster import KMeans, MiniBatchKMeans, SpectralClustering
from sklearn.preprocessing import StandardScaler
from sklearn.utils import Bunch
class Symbolization(BaseEstimator):
"""
    Attribute a symbol to each segment.
    Takes as input the computed features per segment (there can be more
    than one feature).
    Outputs the symbolic signals (multivariate or univariate) as well as
    the look-up table(s), i.e. the pairwise distance matrix between all
    the individual symbols (there can be one or several such tables).
    Amounts to discretizing our signals along the y-axis.
Parameters
----------
n_symbols : int, default=5
Number of possible unique symbols for our symbolic representation.
Corresponds to the size of alphabet in the case of classic SAX.
        Must be less than or equal to 26, the size of the English
        alphabet, i.e. the number of possible letters. Indeed, if we
        want to use the weighted Levenshtein distance on our symbolic
        signals, we will convert our integer symbols into letter symbols.
symb_method : {'quantif', 'cluster'}, default='quantif'
        Family of methods for the symbolization. Possible values:
- 'quantif' : symbolization using quantification along
the y-axis.
- 'cluster' : symbolization through clustering of the computed
features per segment.
symb_quantif_method : {'gaussian', 'quantiles', None}, default='gaussian'
Quantification method for the symbolization when `symb_method` is 'quantif'.
Possible values:
- 'gaussian' : quantification using Gaussian breakpoints.
In this case, the symbolization is the same as performed in
vanilla SAX.
- 'quantiles' : quantification using quantiles as breakpoints.
        - None : `symb_method` is not 'quantif'.
symb_cluster_method : {'kmeans', 'kmeans_partial', 'minibatch_kmeans',
'spectral_kmeans', 'spectral_discretize', 'spectral_cluster_qr',
        None}, default=None
Clustering method for the symbolization when `symb_method` is 'cluster'.
Possible values:
- 'kmeans' : regular K-means.
- 'kmeans_partial' : K-means using 20% of the input size.
- 'minibatch_kmeans' : mini-batch K-means using 20% of the input size.
- 'spectral_kmeans' : spectral clustering with `kmeans` to assign labels
- 'spectral_discretize' : spectral clustering with `discretize` to assign labels
- 'spectral_cluster_qr' : spectral clustering with `cluster_qr` to assign labels
- None : `symb_method` is not 'cluster'.
    features_scaling : dict, default=None
        For symbolization using clustering, multiplicative coefficient in
        front of a feature (after z-normalization).
numerosity_reduction : bool, default=False
False when no numerosity reduction is applied (before eventual
reconstruction).
reconstruct_bool : bool, default=False
False when no reconstruction of our symbolic signals is done.
When doing uniform segmentation, no need for reconstruction.
When doing adaptive segmentation, we recommend using reconstruction.
n_regime_lengths : int or list or str or None, default=None
When the type is int, `n_regime_lengths` is the number of possible
unique regime length values when performing reconstruction of
our symbolic signals.
Amounts to discretizing or quantifying the regime lengths (segment
lengths).
The quantified regime lengths are divided by their minimum then
rounded.
When the type is list, it must be of length `n_symbols`, there is a
specific value of `n_regime_lengths` per symbol.
Hence, the quantified regime lengths values are different from a
symbol to another.
Note that we must get our segment symbols first.
The quantified regime lengths (of all symbols) are divided by
their minimum then rounded.
If `n_regime_lengths` is `'ccl'`: when doing symbolization with
clustering on features including the segment length, the quantified
segment lengths are the scaled rounded corresponding coordinates
of the cluster centers.
If `n_regime_lengths` is `'divide_exact'`: when doing adaptive
multivariate segmentation, divide the `n_segments` segment lengths
by their minimum.
If None and we are performing reconstruction of our symbolic
signals, then no quantification of the regime lengths is done.
When doing adaptive segmentation and reconstruction, we recommend
quantifying the regime lengths for memory purposes (and not for
performance purposes).
seglen_bins_method : {'linspace', 'quantiles', None}, default=None
Method to obtain the segment bins `self.seglen_bins_` for the
quantification of the regime lengths.
Possible values:
- 'linspace' : obtaining the `self.seglen_bins_` with evenly spaced
bins over the minimum and maximum of segment lengths.
- 'quantiles' : obtaining the `self.seglen_bins_` using quantiles.
        - None : if `n_regime_lengths` is `None`.
lookup_table_type : {'mindist', 'mof', 'eucl_cc', 'eucl_ccm', None}, default='mindist'
Type of distance between pairwise individual symbols which will be
used to build the look-up table. Possible values:
- 'mindist' : MINDIST as defined in Vanilla SAX (we assume that we only
have one feature per segment).
- 'mof' : mean of feature (we assume that we only have one feature per
segment).
The distance between two symbols is the distance between the mean
of all the values of the feature for a same symbol.
- 'eucl_cc' : only when the symbolization method is clustering.
The distance between two symbols is the euclidean distance between
their corresponding cluster centers.
- 'eucl_ccm' : only when the symbolization method is clustering.
The distance between two symbols is the euclidean distance between
their corresponding cluster centers whose coordinates have
been truncated to the mean feature only.
Attributes
----------
scaler_ : fitted scikit-learn model (sklearn.preprocessing.StandardScaler)
When the symbolization is done through clustering of the features
per segment, we first need to scale our data.
clustering_model_ : fitted scikit-learn model (e.g. sklearn.cluster.KMeans)
When the symbolization is done through clustering.
    scaled_cluster_centers_df_ : pd.DataFrame
        Cluster centers in the scaled feature space (K-means variants only).
    unscaled_cluster_centers_df_ : pd.DataFrame
        Cluster centers mapped back to the original feature space (K-means
        variants only).
lookup_table_ : ndarray of shape (n_symbols, n_symbols)
Pairwise distance matrix between all the individual symbols.
y_quantif_bins_ : list of length (n_symbols-1)
When the symbolization method is quantification: bins for the
quantification of the unique feature.
seglen_bins_ : (list of) list of length (n_regime_lengths-1)
When doing reconstruction and quantifying the segment lengths, bins
for the quantification of the obtained segment lengths.
from_seglen_label_to_value_dict_ : (list of) dict of length (n_regime_lengths)
When doing reconstruction and quantifying the segment lengths, mapping
the quantified segment lengths values to the mean of the real
segment lengths that got the same quantified value.
from_cluster_label_to_seglenquantif_dict_ : dict of length `n_symbols`
When doing symbolization with clustering on features including the
segment length.
The quantified regime lengths are the cluster centers coordinates
corresponding to the regime lengths.
The quantified regime lengths are divided by their minimum then rounded.
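    Examples
    --------
    A minimal sketch of the intended usage (the toy data is illustrative;
    as elsewhere in this module, feature columns must end in ``_feat`` and
    the segment metadata columns are ``signal_index``, ``segment_start``
    and ``segment_length``)::

        rng = np.random.default_rng(0)
        n_segments = 50
        segment_features_df = pd.DataFrame({
            "signal_index": np.zeros(n_segments, dtype=int),
            "segment_start": np.arange(n_segments) * 10,
            "segment_length": np.full(n_segments, 10),
            "mean_feat": rng.normal(size=n_segments),
        })
        symb = Symbolization(n_symbols=4, symb_method="quantif",
                             symb_quantif_method="gaussian")
        bunch = symb.fit(segment_features_df).transform(segment_features_df)
        # bunch.list_of_symbolic_signals: one symbolic sequence per signal
        # bunch.lookup_table: the (n_symbols, n_symbols) distance matrix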
"""
def __init__(
self,
n_symbols: int = 5,
symb_method="quantif",
symb_quantif_method="gaussian",
symb_cluster_method=None,
features_scaling=None,
numerosity_reduction: bool = False,
reconstruct_bool: bool = False,
n_regime_lengths=None,
seglen_bins_method=None,
lookup_table_type="mindist",
) -> None:
# Unit tests on the parameters:
        # err_msg = (
        #     f"`n_symbols` must be an integer lower than or equal to 26 "
        #     f"because the alphabet has 26 letters, not {n_symbols}."
        # )
        # assert type(n_symbols) == int and n_symbols <= 26, err_msg
err_msg = f"`n_symbols` must be an integer, not {n_symbols}."
assert type(n_symbols) == int, err_msg
err_msg = (
"`numerosity_reduction` must be a boolean, and not "
f"{numerosity_reduction}."
)
assert type(numerosity_reduction) == bool, err_msg
err_msg = (
"`reconstruct_bool` must be a boolean, and not "
f"{reconstruct_bool}."
)
assert type(reconstruct_bool) == bool, err_msg
err_msg = (
"Choose between quantification (`quantif`) or clustering "
f"(`cluster`), not {symb_method}."
)
assert symb_method in ["quantif", "cluster"], err_msg
if symb_method == "quantif":
err_msg = (
"If the symbolization is done with quantification, choose "
f"`gaussian` or `quantiles`, not {symb_quantif_method}."
)
assert symb_quantif_method in ["gaussian", "quantiles"], err_msg
err_msg = (
"If the symbolization is done with quantification, "
f"`symb_cluster_method` should be None, not {symb_cluster_method}."
)
assert symb_cluster_method is None, err_msg
err_msg = (
"If the symbolization is done with quantification, "
f"`features_scaling` should be None, not {features_scaling}."
)
assert features_scaling is None, err_msg
err_msg = (
"If the symbolization is done with quantification, choose "
f"`mindist` or `mof`, not {lookup_table_type}."
)
assert lookup_table_type in ["mindist", "mof"], err_msg
if symb_method == "cluster":
            err_msg = (
                "If the symbolization is done with clustering, choose "
                "`kmeans`, `kmeans_partial`, `minibatch_kmeans`, "
                "`spectral_kmeans`, `spectral_discretize` or "
                f"`spectral_cluster_qr`, not {symb_cluster_method}."
            )
assert symb_cluster_method in [
"kmeans",
"kmeans_partial",
"minibatch_kmeans",
"spectral_kmeans",
"spectral_discretize",
"spectral_cluster_qr",
], err_msg
err_msg = (
"If the symbolization is done with clustering, "
f"`symb_quantif_method` should be None, not {symb_quantif_method}."
)
assert symb_quantif_method is None, err_msg
if symb_cluster_method in [
"kmeans",
"kmeans_partial",
"minibatch_kmeans",
]:
err_msg = (
"If the symbolization is done with K-means clustering, choose "
f"`eucl_cc` or `eucl_ccm`, not {lookup_table_type}."
)
assert lookup_table_type in ["eucl_cc", "eucl_ccm"], err_msg
if features_scaling is not None:
err_msg = (
"If the symbolization is done with clustering, "
"and `features_scaling` is not `None`, it must be a "
"dictionary."
)
assert type(features_scaling) == dict, err_msg
if type(features_scaling) == dict:
for key in features_scaling:
err_msg = (
"If the symbolization is done with clustering, "
"the keys of `features_scaling` must be strings."
)
assert type(key) == str, err_msg
                        err_msg = (
                            "If the symbolization is done with clustering, "
                            "the values of `features_scaling` must be "
                            "non-negative floats."
                        )
                        assert features_scaling[key] >= 0, err_msg
if not reconstruct_bool:
err_msg = (
f"If there is no reconstruction, `n_regime_lengths` should be "
f"`None`, not {n_regime_lengths}."
)
assert n_regime_lengths is None, err_msg
err_msg = (
f"If there is no reconstruction, `seglen_bins_method` should be "
f"`None`, not {seglen_bins_method}."
)
assert seglen_bins_method is None, err_msg
else: # there is reconstruction
# err_msg = (
# "If there is reconstruction, `n_regime_lengths` should not be "
# "`None`"
# )
# assert n_regime_lengths is not None, err_msg
if type(n_regime_lengths) == list:
err_msg = (
"`n_regime_lengths` must be a list of length `n_symbols`, "
f"and not {len(n_regime_lengths)}."
)
assert len(n_regime_lengths) == n_symbols, err_msg
err_msg = (
"If there is reconstruction, for `seglen_bins_method` choose "
f"`linspace` or `quantiles` or `None`, {seglen_bins_method}."
)
assert seglen_bins_method in [
"linspace",
"quantiles",
None,
], err_msg
# Initializing the parameters
self.n_symbols = n_symbols
self.symb_method = symb_method
self.symb_quantif_method = symb_quantif_method
self.symb_cluster_method = symb_cluster_method
self.features_scaling = features_scaling
self.numerosity_reduction = numerosity_reduction
self.reconstruct_bool = reconstruct_bool
self.n_regime_lengths = n_regime_lengths
self.seglen_bins_method = seglen_bins_method
self.lookup_table_type = lookup_table_type
self.scaler_ = None
self.clustering_model_ = None
self.scaled_cluster_centers_df_ = None
self.unscaled_cluster_centers_df_ = None
self.lookup_table_ = None
self.y_quantif_bins_ = None
self.seglen_bins_ = None
self.from_seglen_label_to_value_dict_ = None
self.from_cluster_label_to_seglenquantif_dict_ = None
def fit(self, segment_features_df: pd.DataFrame, *args, **kwargs):
# if `symb_method` is quantification, there are two possible ways to compute the
# lookup table: MINDIST type or mean of univariate feature (MoF).
if self.symb_method == "quantif":
err_msg = "Choose a lookup table type."
assert self.lookup_table_type is not None, err_msg
err_msg = f"Choose a valid lookup table type, not {self.lookup_table_type}."
assert self.lookup_table_type in ["mindist", "mof"], err_msg
# After unit testing, the proper fit occurs here:
if self.reconstruct_bool and self.n_regime_lengths is not None:
self.fit_quantif_seglen(segment_features_df=segment_features_df)
if self.symb_method == "quantif":
return self.fit_quantif(segment_features_df=segment_features_df)
elif self.symb_method == "cluster":
return self.fit_clustering(segment_features_df=segment_features_df)
def fit_quantif_seglen(self, segment_features_df: pd.DataFrame):
"""Fit the segment lengths' quantification step (limits of bins and
seglen labels).
Make sure that `segment_features_df` is a pd.DataFrame.
"""
        err_msg = (
            "`segment_features_df` must be a pd.DataFrame, "
            f"not {type(segment_features_df)}."
        )
assert type(segment_features_df) == pd.DataFrame, err_msg
if type(self.n_regime_lengths) == int:
b_get_quantif_seglen = self.get_quantif_seglen(
segment_lengths=segment_features_df.segment_length,
n_regime_lengths=self.n_regime_lengths,
seglen_bins_method=self.seglen_bins_method,
)
self.seglen_bins_ = b_get_quantif_seglen.seglen_bins
self.from_seglen_label_to_value_dict_ = (
b_get_quantif_seglen.from_seglen_label_to_value_dict
)
elif type(self.n_regime_lengths) == list:
# To be filled after transformation, once we have the segment symbols.
self.seglen_bins_ = dict()
self.from_seglen_label_to_value_dict_ = dict()
return self
@staticmethod
def get_quantif_seglen(
segment_lengths: pd.Series,
n_regime_lengths: int,
seglen_bins_method: str,
):
"""Get the segment lengths' quantification step (limits of bins and
seglen labels).
"""
err_msg = f"`n_regime_lengths` must be an integer, not {type(n_regime_lengths)}."
assert type(n_regime_lengths) == int, err_msg
        err_msg = (
            "`segment_lengths` must be a pd.Series or a pd.DataFrame, "
            f"not {type(segment_lengths)}."
        )
        # `type(x) is A or B` is always truthy; use isinstance instead.
        assert isinstance(segment_lengths, (pd.Series, pd.DataFrame)), err_msg
# Get the bins
if seglen_bins_method == "linspace":
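            # Evenly spaced interior bin edges between the min and max
            # segment lengths; e.g. n_regime_lengths=4 with lengths in
            # [10, 50] gives bins [20., 30., 40.].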
seglen_bins = np.linspace(
start=min(segment_lengths),
stop=max(segment_lengths),
num=n_regime_lengths,
endpoint=False,
)[1:]
elif seglen_bins_method == "quantiles":
quantiles = np.linspace(
start=0, stop=1, num=n_regime_lengths + 1, endpoint=True
)
seglen_bins = (
pd.Series(segment_lengths)
.quantile(quantiles)
.round(0)
.astype(int)
.values[1:-1]
.flatten()
)
        else:
            raise ValueError(
                f"`seglen_bins_method` is not well defined: {seglen_bins_method}."
            )
# Get the bin label for the quantified segment lengths
labeled_segment_lengths = np.digitize(
segment_lengths,
bins=seglen_bins,
)
# Associate each segment length bin with the median value found in the
# training set.
from_seglen_label_to_value_dict = (
segment_lengths.groupby(labeled_segment_lengths)
.median()
.round(0)
.astype(int)
.to_dict()
)
b_get_quantif_seglen = Bunch(
seglen_bins=seglen_bins,
from_seglen_label_to_value_dict=from_seglen_label_to_value_dict,
)
return b_get_quantif_seglen
@staticmethod
def get_feat_df(segment_features_df: pd.DataFrame) -> pd.DataFrame:
"""Return the same df with only the feature columns."""
feat_columns = [
col for col in segment_features_df.columns if col.endswith("_feat")
]
return segment_features_df[feat_columns]
def fit_quantif(self, segment_features_df: pd.DataFrame):
"""Find the bins' limits for the quantification and compute the look-up
table.
This function assumes that there is only one feature.
"""
# Retrieve features
only_features_df = self.get_feat_df(
segment_features_df=segment_features_df
)
        err_msg = (
            "There is more than one feature, which is not possible with "
            "symbolization using quantification."
        )
assert only_features_df.shape[1] == 1, err_msg
# Get bins' limits
if self.symb_quantif_method == "gaussian":
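            # Gaussian breakpoints: standard-normal quantiles splitting the
            # distribution into `n_symbols` equiprobable bins; e.g.
            # n_symbols=4 yields [-0.674, 0.0, 0.674].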
self.y_quantif_bins_ = norm.ppf(
[float(i) / self.n_symbols for i in range(1, self.n_symbols)],
scale=1,
)
elif self.symb_quantif_method == "quantiles":
quantiles = np.linspace(
start=0, stop=1, num=self.n_symbols + 1, endpoint=True
)
self.y_quantif_bins_ = (
only_features_df.quantile(quantiles).values[1:-1].flatten()
)
# Compute look-up table
if self.lookup_table_type == "mindist":
self.lookup_table_ = self.compute_lookup_table_mindist(
y_quantif_bins=self.y_quantif_bins_
)
elif self.lookup_table_type == "mof":
segment_symbols = self.transform_quantif(
segment_features_df=only_features_df
)
feature_1D = only_features_df.values
self.lookup_table_ = self.compute_lookup_table_mof(
segment_symbols=segment_symbols, feature_1D=feature_1D
)
return self
def fit_clustering(self, segment_features_df: pd.DataFrame):
# Retrieve features
only_features_df = self.get_feat_df(
segment_features_df=segment_features_df
)
# Scaling:
self.scaler_ = StandardScaler().fit(only_features_df)
scaled_features = self.scaler_.transform(only_features_df)
scaled_features_df = pd.DataFrame(
scaled_features, columns=self.scaler_.feature_names_in_
)
# NEW for SAX-DD-ML-v3
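        # If provided, `features_scaling` re-weights the (z-normalized)
        # segment-length feature so that it counts more or less than the
        # other features in the clustering distance.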
if self.features_scaling is not None:
scaled_features_df["length_feat"] = (
scaled_features_df["length_feat"]
* self.features_scaling["length_feat"]
)
# Fit the clustering model:
batch_size_clustering = int(round(0.2 * len(scaled_features_df), 0))
if self.symb_cluster_method == "kmeans":
self.clustering_model_ = KMeans(
n_clusters=self.n_symbols,
init="k-means++",
n_init=10,
random_state=0,
).fit(scaled_features_df)
        elif self.symb_cluster_method == "kmeans_partial":
            # Shuffle the rows reproducibly, then fit on the first 20%.
            # (`np.random.shuffle` cannot shuffle a DataFrame in place, and
            # positional slicing requires `.iloc`.)
            scaled_features_shuffled_df = scaled_features_df.sample(
                frac=1, random_state=0
            ).reset_index(drop=True)
            self.clustering_model_ = KMeans(
                n_clusters=self.n_symbols,
                init="k-means++",
                n_init=10,
                random_state=0,
            ).fit(scaled_features_shuffled_df.iloc[0:batch_size_clustering])
elif self.symb_cluster_method == "minibatch_kmeans":
self.clustering_model_ = MiniBatchKMeans(
init="k-means++",
n_clusters=self.n_symbols,
batch_size=batch_size_clustering,
n_init=10,
max_no_improvement=10,
verbose=0,
random_state=0,
).fit(scaled_features_df)
elif self.symb_cluster_method in [
"spectral_kmeans",
"spectral_discretize",
"spectral_cluster_qr",
]:
if self.symb_cluster_method == "spectral_kmeans":
assign_labels = "kmeans"
elif self.symb_cluster_method == "spectral_discretize":
assign_labels = "discretize"
elif self.symb_cluster_method == "spectral_cluster_qr":
assign_labels = "cluster_qr"
self.clustering_model_ = SpectralClustering(
n_clusters=self.n_symbols,
assign_labels=assign_labels,
random_state=0,
).fit(scaled_features_df)
# Get the cluster centers (only if K-means variants), scaled or unscaled
# NEW for SAX-DD-ML-v3
if self.symb_cluster_method in [
"kmeans",
"kmeans_partial",
"minibatch_kmeans",
]:
if self.features_scaling is not None:
# The scaling coefficient was only needed to obtain the clusters,
# but let's go back to cluster centers without the coeff
scaled_features_df_new = scaled_features_df.copy()
scaled_features_df_new.length_feat = (
scaled_features_df.length_feat
/ self.features_scaling["length_feat"]
)
scaled_features_df_new[
"segment_symbol"
] = self.clustering_model_.labels_
scaled_cluster_centers = (
scaled_features_df_new.groupby("segment_symbol")
.mean()
.reset_index()
.sort_values(by="segment_symbol")
.drop(columns=["segment_symbol"])
)
# scaled_cluster_centers = self.clustering_model_.cluster_centers_
else:
scaled_cluster_centers = self.clustering_model_.cluster_centers_
self.scaled_cluster_centers_df_ = pd.DataFrame(
scaled_cluster_centers,
columns=self.clustering_model_.feature_names_in_,
)
unscaled_cluster_centers = self.scaler_.inverse_transform(
scaled_cluster_centers
)
self.unscaled_cluster_centers_df_ = pd.DataFrame(
unscaled_cluster_centers,
columns=self.clustering_model_.feature_names_in_,
)
# Compute the look-up table (and eventually the quantified regime lengths):
# TODO: maybe unscale the cluster centers value
if self.lookup_table_type == "eucl_cc":
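            # `pdist` returns the condensed pairwise Euclidean distances
            # between the cluster centers; `squareform` expands them into
            # the dense symmetric (n_symbols, n_symbols) matrix.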
self.lookup_table_ = squareform(pdist(scaled_cluster_centers))
elif self.lookup_table_type == "eucl_ccm":
# Careful: the cluster centers are unscaled here
self.lookup_table_ = squareform(
pdist(
self.unscaled_cluster_centers_df_["mean_feat"]
.to_numpy()
.reshape(-1, 1)
)
)
# Quantification of the segment lengths
self.from_cluster_label_to_seglenquantif_dict_ = dict()
l_quantif_len = self.unscaled_cluster_centers_df_[
"length_feat"
].tolist()
l_scl_quantif_len = [
elem / min(l_quantif_len) for elem in l_quantif_len
]
for i, length in enumerate(l_scl_quantif_len):
self.from_cluster_label_to_seglenquantif_dict_[i] = round(
length
)
return self
def transform(self, segment_features_df: pd.DataFrame):
# Transform to symbols and get them
if self.symb_method == "quantif":
segment_symbols = self.transform_quantif(
segment_features_df=segment_features_df
)
        elif self.symb_method == "cluster":
segment_symbols = self.transform_clustering(
segment_features_df=segment_features_df
)
features_with_symbols_df = segment_features_df.assign(
segment_symbol=segment_symbols
).sort_values(["signal_index", "segment_start"])
# Without numerosity reduction and without quantification of the
# segment lengths (and without reconstruction of course)
_features_with_symbols_nonumreduc_noquantifseglen_df = (
features_with_symbols_df.copy()
)
# Numerosity reduction (or not)
if self.numerosity_reduction:
features_with_symbols_df = self.transform_numerosity_reduction(
features_with_symbols_df=features_with_symbols_df
)
# With (eventual) numerosity reduction and without quantification of the
# segment lengths (and without reconstruction of course)
_features_with_symbols_noquantifseglen_df = (
features_with_symbols_df.copy()
)
# Reconstruction (or not)
if self.reconstruct_bool: # reconstruction
# Quantification of the regime lengths (or not)
if self.n_regime_lengths is not None:
# Replacing (inplace) the `segment_lengths` column by the
# quantified version.
if self.n_regime_lengths == "ccl":
features_with_symbols_df.segment_length = (
features_with_symbols_df.segment_symbol.astype(int).map(
self.from_cluster_label_to_seglenquantif_dict_
)
)
elif type(self.n_regime_lengths) == int:
features_with_symbols_df.segment_length = self.transform_quantif_seglen(
segment_lengths=features_with_symbols_df.segment_length
)
elif type(self.n_regime_lengths) == list:
err_msg = "`segment_symbol` must be a feature"
assert (
"segment_symbol" in features_with_symbols_df.columns
), err_msg
# TODO: why do we need to initialize again, it is already
# done in the fit, which is weird
l_groups = list()
self.seglen_bins_ = dict()
self.from_seglen_label_to_value_dict_ = dict()
                    # Group by the scalar column name so that
                    # `segment_symbol` is an int (grouping by a 1-element
                    # list yields tuple group keys in recent pandas).
                    for (
                        segment_symbol,
                        group,
                    ) in features_with_symbols_df.groupby(by="segment_symbol"):
b_get_quantif_seglen = self.get_quantif_seglen(
segment_lengths=group.segment_length,
n_regime_lengths=self.n_regime_lengths[
segment_symbol
],
seglen_bins_method=self.seglen_bins_method,
)
self.seglen_bins_[
segment_symbol
] = b_get_quantif_seglen.seglen_bins
self.from_seglen_label_to_value_dict_[
segment_symbol
] = b_get_quantif_seglen.from_seglen_label_to_value_dict
group.segment_length = self.apply_quantif_seglen(
segment_lengths=group.segment_length,
seglen_bins=self.seglen_bins_[segment_symbol],
from_seglen_label_to_value_dict=self.from_seglen_label_to_value_dict_[
segment_symbol
],
)
l_groups.append(group)
features_with_symbols_df = pd.concat(
l_groups, ignore_index=True
)
# Reduce the quantified segment lengths (to make the symbolic
# signals shorter)
features_with_symbols_df.segment_length = (
self.shorten_quantif_seglen(
features_with_symbols_df.segment_length
)
)
elif self.n_regime_lengths == "divide_exact":
# Reduce the quantified segment lengths (to make the symbolic
# signals shorter)
features_with_symbols_df.segment_length = (
self.shorten_quantif_seglen(
features_with_symbols_df.segment_length
)
)
# Performing the reconstruction (whether the segment lengths are
# quantified or not)
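            # Each symbol is repeated `segment_length` times; summing the
            # resulting Python lists concatenates them into one flat
            # symbolic signal per `signal_index`.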
list_of_symbolic_signals = list()
for _, group in features_with_symbols_df.groupby("signal_index"):
list_of_symbolic_signals.append(
np.array(
group.segment_symbol.apply(lambda x: [x])
* group.segment_length.astype(int)
).sum()
)
else: # no reconstruction (staying in the reduced space)
list_of_symbolic_signals = (
features_with_symbols_df.groupby("signal_index")
.apply(lambda df: df.segment_symbol.to_numpy())
.tolist()
)
        if self.lookup_table_type == "mof":
            # Mean per feature, in the multivariate case.
df_mof = self.get_feat_df(
features_with_symbols_df.sort_values("segment_symbol")
.groupby("segment_symbol")
.mean()
)
# If a symbol is not used, consider its mean to be nan
df_mof_full = df_mof.reset_index().copy()
# Add a nan row if a symbol does not appear
expected_unique_symbols = list(np.arange(self.n_symbols))
obtained_unique_symbols = sorted(
df_mof_full.segment_symbol.unique().tolist()
)
if len(expected_unique_symbols) != len(obtained_unique_symbols):
for symbol in expected_unique_symbols:
if symbol not in obtained_unique_symbols:
                        # Create a row with nan values for every feature
                        # column (keeping the symbol itself)
                        d_nan_row = {"segment_symbol": symbol}
                        for col in df_mof_full.columns.tolist():
                            if col != "segment_symbol":
                                d_nan_row[col] = np.nan
pd_nan_row = pd.Series(d_nan_row).to_frame().T
# Add the nan row
df_mof_full = pd.concat(
[df_mof_full, pd_nan_row], ignore_index=True
)
df_mof_full = df_mof_full.sort_values(
by="segment_symbol"
).set_index("segment_symbol")
# Compute the look-up table
np_mof_full = df_mof_full.to_numpy()
            # If nan, then the distance between symbols is set to zero
self.lookup_table_ = np.nan_to_num(
squareform(pdist(X=np_mof_full, metric="euclidean"))
)
# Sanity check on the lookup table
lookup_table_shape_obtained = self.lookup_table_.shape
lookup_table_shape_expected = (self.n_symbols, self.n_symbols)
err_msg = (
f"The look up table is of shape {lookup_table_shape_obtained}, "
f"instead of expected {lookup_table_shape_expected}"
)
assert (
lookup_table_shape_obtained == lookup_table_shape_expected
), err_msg
b_transform_symbolization = Bunch(
list_of_symbolic_signals=list_of_symbolic_signals,
lookup_table=self.lookup_table_,
_features_with_symbols_nonumreduc_noquantifseglen_df=_features_with_symbols_nonumreduc_noquantifseglen_df,
_features_with_symbols_noquantifseglen_df=_features_with_symbols_noquantifseglen_df,
_features_with_symbols_df=features_with_symbols_df,
)
return b_transform_symbolization
def transform_quantif(self, segment_features_df: pd.DataFrame):
"""Return the segment symbols using quantification."""
err_msg = "Run `.fit()` first."
assert self.y_quantif_bins_ is not None, err_msg
# Retrieve the features
features = self.get_feat_df(segment_features_df=segment_features_df)
        # Get symbols (ravel to 1D so the symbols can later be assigned as
        # a DataFrame column)
        segment_symbols = np.digitize(
            x=features.to_numpy().ravel(), bins=self.y_quantif_bins_
        )
return segment_symbols
def transform_clustering(self, segment_features_df: pd.DataFrame):
"""Return the segment symbols using clustering."""
err_msg = "Run `.fit()` first."
assert self.scaler_ is not None, err_msg
assert self.clustering_model_ is not None, err_msg
# Retrieve and scale the features
scaled_features = self.scaler_.transform(
self.get_feat_df(segment_features_df=segment_features_df)
)
scaled_features_df = pd.DataFrame(
scaled_features, columns=self.scaler_.feature_names_in_
)
# NEW
if self.features_scaling is not None:
scaled_features_df["length_feat"] = (
scaled_features_df["length_feat"]
* self.features_scaling["length_feat"]
)
# Getting the cluster labels per segment
if self.symb_cluster_method not in [
"spectral_kmeans",
"spectral_discretize",
"spectral_cluster_qr",
]:
segment_symbols = self.clustering_model_.predict(scaled_features_df)
else:
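            # SpectralClustering has no `predict` method, so it is re-fitted
            # on the data to transform (the labels are therefore not
            # guaranteed to be consistent with those obtained during `fit`).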
segment_symbols = self.clustering_model_.fit_predict(
scaled_features_df
)
return segment_symbols
def transform_quantif_seglen(self, segment_lengths: pd.Series):
"""Quantify a series of segment lengths and dividing them by their minimum.
When `type(self.n_regime_lengths) == int`.
"""
err_msg = "Run `.fit()` first."
assert self.from_seglen_label_to_value_dict_ is not None, err_msg
assert self.seglen_bins_ is not None, err_msg
# Get the quantified segment lengths
quantified_seglen = self.apply_quantif_seglen(
segment_lengths=segment_lengths,
seglen_bins=self.seglen_bins_,
from_seglen_label_to_value_dict=self.from_seglen_label_to_value_dict_,
)
# Reduce the quantified segment lengths (to make the symbolic
# signals shorter)
scaled_quantified_seglen = self.shorten_quantif_seglen(
quantified_seglen
)
return scaled_quantified_seglen
@staticmethod
def apply_quantif_seglen(
segment_lengths: pd.Series,
seglen_bins: list,
from_seglen_label_to_value_dict: dict,
):
"""Quantify a series of segment lengths (without dividing them by
their minimum).
"""
# Label each segment length
labeled_segment_lengths = np.digitize(
segment_lengths,
bins=seglen_bins,
)
# Apply the quantification dictionary on the segment length labels
quantified_seglen = np.vectorize(from_seglen_label_to_value_dict.get)(
labeled_segment_lengths
)
return quantified_seglen
@staticmethod
def shorten_quantif_seglen(quantified_seglen: pd.Series):
"""Reduce the segment lengths (to make the symbolic
sequences shorter)"""
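        # E.g. quantified lengths [50, 100, 200] become [1, 2, 4].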
min_quantified_seglen = quantified_seglen.min()
scaled_quantified_seglen = (
(quantified_seglen / min_quantified_seglen).round(0).astype(int)
)
return scaled_quantified_seglen
@staticmethod
def compute_lookup_table_mindist(y_quantif_bins) -> np.ndarray:
"""
Compute the lookup table which is called by the MINDIST function.
"""
n_symbols = len(y_quantif_bins) + 1
lookup_table = np.zeros((n_symbols, n_symbols))
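        # Only symbol pairs at least two apart get a nonzero distance;
        # e.g. with Gaussian breakpoints [-0.43, 0.43] (n_symbols=3),
        # dist(0, 2) = 0.43 - (-0.43) ~= 0.86.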
for i_row in range(n_symbols):
for i_column in range(i_row + 2, n_symbols):
lookup_table[i_row, i_column] = (
y_quantif_bins[i_column - 1] - y_quantif_bins[i_row]
)
lookup_table += lookup_table.T # because the matrix is symmetric
return lookup_table
@staticmethod
def compute_lookup_table_mof(segment_symbols, feature_1D) -> np.ndarray:
"""
Compute the lookup table for the mean of {univariate feature
per segment}.
"""
df = pd.DataFrame(
{
"symbol": segment_symbols.flatten(),
"feature": feature_1D.flatten(),
}
)
# mean of feature (mof)
mof = df.sort_values("symbol").groupby("symbol").mean().to_numpy()
lookup_table = squareform(pdist(X=mof, metric="euclidean"))
return lookup_table
@staticmethod
def transform_numerosity_reduction(
features_with_symbols_df,
) -> pd.DataFrame:
""" "Apply numerosity reduction (fusion of segments)."""
l_index_rows_allsignals = list()
for _, group in features_with_symbols_df.groupby("signal_index"):
# For a signal, get the rows where we should merge adjacent
# segments because they have the same (redundant) symbol
group["segment_symbol_diff"] = (group["segment_symbol"] + 1).diff()
l_index_rows_signal = group.index[
group["segment_symbol_diff"] == 0
].tolist()
l_index_rows_allsignals.append(l_index_rows_signal)
# Update `segment_end` and `segment_length` for the segment to be
# merged
for index_row in sorted(l_index_rows_signal, reverse=True):
features_with_symbols_df.loc[