From 8d49f82304f2e3b58dcc75de288d845de6c7e3d4 Mon Sep 17 00:00:00 2001
From: infer742
Date: Mon, 16 Jan 2023 15:25:23 +0100
Subject: [PATCH] fixed problem with lsk

---
 pyinsights/log_skeleton/log_skeleton.py | 58 +++++++++++++++----------
 1 file changed, 35 insertions(+), 23 deletions(-)

diff --git a/pyinsights/log_skeleton/log_skeleton.py b/pyinsights/log_skeleton/log_skeleton.py
index 83a1425..4521d27 100644
--- a/pyinsights/log_skeleton/log_skeleton.py
+++ b/pyinsights/log_skeleton/log_skeleton.py
@@ -44,7 +44,6 @@ def get_log_skeleton(self, noise_threshold=0):
         """
         log_skeleton = None
         # Get the extended log
-
         equivalence, always_after, always_before, never_together, directly_follows \
             = self._get_relations(noise_threshold)
 
@@ -179,13 +178,16 @@ def _get_equivalence(self, noise_threshold, case_id=None):
 
             occ_max = groups_expanded[max]
             occ_min = groups_expanded[min]
+            occurrences_max = len(groups_expanded[max])
+            # get difference between two profiles
+
 
             differences = occ_max.merge(
                 occ_min, indicator=True, how='left').loc[lambda x: x['_merge'] != 'both']
 
-            # if the two profiles deviate no more than noise * num_cases
+            # if the two profiles deviate no more than noise * size of larger profile
             # they are equivalent
-            if len(differences) <= num_cases*noise_threshold:
+            if len(differences) <= occurrences_max*noise_threshold:
                 equivalence.add(pair)
 
         return equivalence
@@ -244,16 +246,18 @@ def _get_always_after(self, noise_threshold, case_id=None):
 
             # fill nas so they don't screw up max
             pos_per_case.fillna(0, inplace=True)
+            # length of act1's profile
+            num = groups_expanded[pair[0]][case_col].nunique()
+
             # handling if both activities are the same
             if pair[0] == pair[1]:
                 # if they actually occur more than 1 time in every case --> they are in the relation
-                if np.sum(merged[case_col].value_counts() > 1) >= num_cases*(1-noise_threshold):
+                if np.sum(merged[case_col].value_counts() > 1) >= num*(1-noise_threshold):
                     always_after.add(pair)
             # else test if the last occurrence of act2 is after the last occurrence of act1
-            # in at least (1-noise) percent of cases
-            elif np.sum(pos_per_case["order_x"] <= pos_per_case["order_y"]) >= num_cases*(1-noise_threshold):
+            # in at least (1-noise) percent of times act1 occurs
+            elif np.sum(pos_per_case["order_x"] <= pos_per_case["order_y"]) >= num*(1-noise_threshold):
                 always_after.add(pair)
-
         return always_after
 
     def _get_always_before(self, noise_threshold, case_id=None):
@@ -306,15 +310,17 @@ def _get_always_before(self, noise_threshold, case_id=None):
 
             test = grouped.agg({'order_x': 'min', 'order_y': 'min'})
 
+            # length of act1's profile
+            num = groups_expanded[pair[0]][case_col].nunique()
             # handling if both activities are the same
             if pair[0] == pair[1]:
                 # if they actually occur more than 1 time in every case --> they are in the relation
-                if np.sum(merged[case_col].value_counts() > 1) >= num_cases*(1-noise_threshold):
+                if np.sum(merged[case_col].value_counts() > 1) >= num*(1-noise_threshold):
                     always_before.add(pair)
             # else test if the first occurrence of act1 is after the first occurrence of act2
             # in more than (1-noise) times act1 occurs
-            elif np.sum(test["order_x"] >= test["order_y"]) >= num_cases*(1-noise_threshold):
+            elif np.sum(test["order_x"] >= test["order_y"]) >= num*(1-noise_threshold):
                 always_before.add(pair)
 
         return always_before
 
@@ -372,7 +378,7 @@ def _get_never_together(self, noise_threshold, case_id=None):
             num = len(occ_max)
             # check if smaller profile and larger profile do not occur together
             # more than num_cases * noise
-            if np.sum(occ_max.isin(occ_min)) <= num_cases*(noise_threshold):
+            if np.sum(occ_max.isin(occ_min)) <= num*(noise_threshold):
                 never_together.add(pair)
 
         return never_together
 
@@ -417,11 +423,8 @@ def _get_directly_follows(self, noise_threshold, case_id=None):
         i = 1
         bar = tqdm(total=iterations)
         bar.set_description("Calculating directly-follows")
-        for _, row in edge_table.iterrows():
-            directly_follows.add((row["SOURCE"], row["TARGET"]))
-
-            bar.update(1)
-            i += 1
+        directly_follows = set(
+            edge_table[["SOURCE", "TARGET"]].itertuples(name=None, index=False))
 
         return directly_follows
 
@@ -430,6 +433,8 @@ def get_non_conforming_cases(self, noise_threshold=0, cases_to_compare=None):
         Checks for each trace in the log, whether it is fitting or not.
         :return: dataframe with ids of non-conforming cases
         """
+        events = self.connector.events()
+        num_activities = events[act_col].nunique()
         # get lsk
         lsk = self.get_log_skeleton(noise_threshold)
         # get lsk per trace
@@ -439,21 +444,21 @@ def get_non_conforming_cases(self, noise_threshold=0, cases_to_compare=None):
 
         # check for each case if relation is subset of lsk
         non_conforming = {case for relation in lsk_compare_traces.keys(
-        ) for case in lsk_compare_traces[relation].keys() if not self._conforms(lsk_compare_traces, relation, case, lsk)}
+        ) for case in lsk_compare_traces[relation].keys() if not self._conforms(lsk_compare_traces, relation, case, lsk, noise_threshold, num_activities)}
         # return non-conforming cases as df
         df = pd.DataFrame(columns=[case_col], data=non_conforming)
 
         return df
 
-    def _conforms(self, lsk_traces, relation, case, lsk):
+    def _conforms(self, lsk_traces, relation, case, lsk, noise_threshold, num_activities):
         """checks if relation of trace conforms to lsk
+        relaxed check: the difference between the relations may contain up to noise_threshold * number of pairs elements and still conform
 
         Args:
            lsk_traces (_type_): _description_
            relation (_type_): _description_
            case (_type_): _description_
            lsk (_type_): _description_
-
        Returns:
            bool: conformity
        """
@@ -461,8 +466,17 @@ def _conforms(self, lsk_traces, relation, case, lsk):
             for act in lsk[relation].keys():
                 if not lsk_traces[relation][case][act].issubset(lsk[relation][act]):
                     return False
+
+        elif relation not in ['equivalence', 'never_together']:
+            num_pairs = num_activities**2
+            difference = lsk_traces[relation][case].difference(lsk[relation])
+            if len(difference) > num_pairs*noise_threshold:
+                return False
         else:
-            if not lsk_traces[relation][case].issubset(lsk[relation]):
+            num_pairs = math.factorial(
+                num_activities) / math.factorial(num_activities-2)
+            difference = lsk_traces[relation][case].difference(lsk[relation])
+            if len(difference) > num_pairs*noise_threshold:
                 return False
 
         return True
@@ -883,9 +897,7 @@ def _get_directly_follows_per_case(self, case_id=None):
 
         bar = tqdm(total=iterations)
         bar.set_description("Calculating directly- for cases")
-        for _, row in edge_table.iterrows():
-            df_all_cases[row[case_col]].add((row["SOURCE"], row["TARGET"]))
-
-            bar.update(1)
+        df_all_cases = {case: set(zip(group["SOURCE"], group["TARGET"]))
+                        for case, group in edge_table.groupby(case_col)}
 
         return df_all_cases
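
Note (not part of the patch): a minimal standalone sketch of the two ideas the commit leans on, assuming only pandas is available. The toy edge_table, its column names, and the conforms() helper below are illustrative stand-ins, not pyinsights APIs.

    import pandas as pd

    # Vectorized directly-follows extraction in the spirit of the reworked
    # _get_directly_follows: itertuples(name=None, index=False) yields plain
    # (source, target) tuples, so the whole edge table collapses into a set in
    # one call instead of a row-by-row iterrows() loop.
    edge_table = pd.DataFrame({"SOURCE": ["a", "b", "a"], "TARGET": ["b", "c", "c"]})
    directly_follows = set(
        edge_table[["SOURCE", "TARGET"]].itertuples(name=None, index=False))
    print(directly_follows)  # e.g. {('a', 'b'), ('a', 'c'), ('b', 'c')} (set order varies)

    # Noise-tolerant conformance in the spirit of the relaxed _conforms check:
    # a trace-level relation still conforms if it adds at most
    # num_pairs * noise_threshold pairs that are absent from the log-level relation.
    def conforms(trace_relation, log_relation, num_activities, noise_threshold):
        num_pairs = num_activities ** 2  # ordered activity pairs, repetition allowed
        extra = trace_relation.difference(log_relation)
        return len(extra) <= num_pairs * noise_threshold

    trace_rel = {("a", "b"), ("b", "c"), ("c", "a")}
    log_rel = {("a", "b"), ("b", "c")}
    print(conforms(trace_rel, log_rel, num_activities=3, noise_threshold=0.2))  # True: 1 extra pair <= 9 * 0.2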