-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathMBA.py
99 lines (77 loc) · 3.54 KB
/
MBA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
import seaborn
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
import numpy as np
import matplotlib.pyplot as plt
from config import output_directory
from helper_scripts.changes_helper import get_changes
from helper_scripts.file_pair_helper import add_info_to_cochanges
def perform_mba():
    """Run the market-basket analysis over the recorded changes.

    The frame returned by ``get_changes()`` is grouped by ``version``; the
    ``name`` values of each version form one transaction. The transactions
    are encoded with ``TransactionEncoder`` and handed to
    ``generate_basket_rules``.

    Returns:
        A ``(rules, files)`` tuple: ``rules`` is the rule frame from
        ``generate_basket_rules`` extended with ``file1``/``file2`` columns,
        and ``files`` is the ``name``/``package`` slice of the change frame.
    """
    changes = get_changes()

    # One transaction ("shopping list") per version: the names changed in it.
    baskets = (
        changes.groupby('version')['name']
        .apply(list)
        .reset_index(name='shoppingList')['shoppingList']
    )

    # Encode the list of transactions as an item-per-column dataframe.
    encoder = TransactionEncoder()
    encoder.fit(baskets)
    encoded = pd.DataFrame(encoder.transform(baskets), columns=encoder.columns_)

    rules = generate_basket_rules(encoded)

    # Pull one item out of each side of every rule. generate_basket_rules
    # mines with max_len=2, so each side is expected to hold a single name.
    for column, side in (('file1', 'antecedents'), ('file2', 'consequents')):
        rules[column] = [next(iter(items)) for items in rules[side]]

    return rules, changes[['name', 'package']]
def generate_basket_rules(df):
    """Mine pairwise association rules from encoded transactions.

    Intermediate results are pickled into ``output_directory`` and summary
    percentiles are printed via ``print_quartiles``:

    * ``mba_support_0.pkl``     - item sets with min_support=0.0000001
    * ``mba_conf_0_supp_0.pkl`` - rules from those item sets, confidence
      threshold 0.0
    * ``mba_support_2.pkl``     - item sets with min_support=0.02
    * ``mba_conf_0_supp_2.pkl`` - rules from those item sets, confidence
      threshold 0.0
    * ``mba_conf_8.pkl``        - rules from the min_support=0.02 item sets
      with confidence threshold 0.8 (this frame is returned)

    Args:
        df: Encoded transaction dataframe as accepted by ``apriori``.

    Returns:
        The rule dataframe produced with min_support=0.02 and a confidence
        threshold of 0.8.
    """
    # The pickles below need the directory; create it here so the function
    # also works when it is not reached through generate_mba_analysis_files().
    os.makedirs(output_directory, exist_ok=True)

    # Pass 1: practically no support threshold, to inspect the distributions.
    all_itemsets = apriori(df, min_support=0.0000001, use_colnames=True, max_len=2)
    all_itemsets.to_pickle(os.path.join(output_directory, "mba_support_0.pkl"))
    supp0 = sorted(all_itemsets['support'])
    print("----threshold results mba support----")
    print_quartiles(supp0)

    rules = association_rules(all_itemsets, metric="confidence", min_threshold=0.0)
    rules.to_pickle(os.path.join(output_directory, "mba_conf_0_supp_0.pkl"))
    conf_0_supp0 = sorted(rules['confidence'])
    print("----threshold results mba confidence without support threshold----")
    print_quartiles(conf_0_supp0)

    # Pass 2: apply the support threshold, still without a confidence cut.
    frequent_itemsets = apriori(df, min_support=0.02, use_colnames=True, max_len=2)
    frequent_itemsets.to_pickle(os.path.join(output_directory, "mba_support_2.pkl"))

    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.0)
    rules.to_pickle(os.path.join(output_directory, "mba_conf_0_supp_2.pkl"))
    conf_0_supp2 = sorted(rules['confidence'])

    # TODO: extract plotting of the support/confidence distributions
    # (e.g. seaborn violin plots) to a proper function.

    print("----threshold results mba confidence after support threshold----")
    print_quartiles(conf_0_supp2)

    # Pass 3: the final rule set, with both thresholds applied.
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)
    rules.to_pickle(os.path.join(output_directory, "mba_conf_8.pkl"))
    return rules
def print_quartiles(arr):
    """Print selected percentiles of *arr* to stdout.

    For an empty input a notice is printed instead. Otherwise the 10th, 1st,
    90th and 95th percentiles are printed (in that order), followed by one
    line holding the first quartile, the median and the third quartile.

    Args:
        arr: Sized collection of numbers accepted by ``np.percentile``.
    """
    if len(arr) == 0:
        print("Quartiles could not be calculated. array is empty.")
        return

    print("quartile values:")
    lower, middle, upper = (np.percentile(arr, q) for q in (25, 50, 75))

    for pct in (10, 1, 90, 95):
        print(f"{pct}% at threshold: ", np.percentile(arr, pct))

    print(lower, middle, upper)
def generate_mba_analysis_files():
    """Run the market-basket analysis and write ``mba.csv``.

    Ensures ``output_directory`` exists, runs ``perform_mba()``, passes the
    resulting rules and changed-file table through
    ``add_info_to_cochanges()`` and stores the outcome as ``mba.csv`` inside
    ``output_directory``.
    """
    # Create the directory; exist_ok avoids the check-then-create race of a
    # separate os.path.exists() test.
    os.makedirs(output_directory, exist_ok=True)

    rules, changed_files = perform_mba()

    # 1) Build the dataframe containing the co-changes
    rules_with_dates = add_info_to_cochanges(rules, changed_files)
    rules_with_dates.to_csv(os.path.join(output_directory, "mba.csv"))