forked from rubenscheedler-study/CoChangeAnalysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dynamic_Warp.py
77 lines (61 loc) · 2.64 KB
/
dynamic_Warp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
from collections import namedtuple
from operator import attrgetter
import seaborn
from dtw import dtw
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import config
from MBA import print_quartiles
from config import output_directory
from helper_scripts.Commit_date_helper import convert_hashlist_to_datelist
from helper_scripts.changes_helper import get_changes
from helper_scripts.file_pair_helper import add_info_to_cochanges
def perform_dtw():
    """Select component pairs whose change histories are close under DTW.

    Loads the change records with ``get_changes()``, collects the changed
    versions per component name, converts each collection with
    ``convert_hashlist_to_datelist`` and computes the normalized DTW distance
    for every unordered pair of components. All distances are written to
    ``dtw_distances.csv`` inside ``output_directory`` and a short summary is
    printed.

    Returns:
        A tuple ``(warpdf, component_info)``. ``warpdf`` is a DataFrame with
        the columns ``file1`` and ``file2`` containing the pairs whose
        distance lies below the 1% quantile of all distances and in which
        one of the two names equals ``"package-info.java"``.
        ``component_info`` is the ``name`` and ``package`` columns of the
        loaded change records.
    """
    components = get_changes()
    # group versions by name
    grouped_comp = (
        components.groupby('name')['version']
        .apply(list)
        .reset_index(name='changeVersions')
    )
    # generate list of change dates from versions
    grouped_comp['changeMoments'] = [
        convert_hashlist_to_datelist(versions)
        for versions in grouped_comp['changeVersions']
    ]

    Distance = namedtuple('Distance', 'x y dist')

    # Materialise the rows once so each unordered pair is visited exactly
    # once without copying the frame on every outer iteration.
    rows = list(grouped_comp.itertuples())
    distance_list = []
    for position, first in enumerate(rows):
        for second in rows[position + 1:]:
            norm_distance = generate_dtw(first.changeMoments, second.changeMoments)
            distance_list.append(Distance(first.name, second.name, norm_distance))
    distance_list.sort(key=attrgetter('dist'))

    # Explicit columns keep the CSV header stable even when there are no pairs.
    distance_df = pd.DataFrame(distance_list, columns=list(Distance._fields))
    distance_df.to_csv(output_directory + "/dtw_distances.csv", index_label=False)

    if not distance_list:
        # Fewer than two components: there is nothing to compare, and the
        # quantile / ratio computations below would fail on empty input.
        empty_warps = pd.DataFrame([], columns=['file1', 'file2'])
        return empty_warps, components[['name', 'package']]

    # Take the quantile of the numeric distance column only; the frame also
    # holds the two string name columns.
    threshold = distance_df['dist'].quantile(0.01)

    # Compare names by value: identity comparison against a string literal is
    # not a reliable equality test.
    # NOTE(review): this keeps only pairs that involve "package-info.java";
    # confirm that the intent was not to *exclude* such pairs instead.
    return_list = [
        (dist.x, dist.y)
        for dist in distance_list
        if dist.dist < threshold and "package-info.java" in (dist.x, dist.y)
    ]

    made_threshold = len(return_list) / len(distance_list)
    print(made_threshold)
    print("----threshold results DTW----")
    print("quartile values:")
    distances = [dist.dist for dist in distance_list]
    print_quartiles(distances)

    warpdf = pd.DataFrame(return_list, columns=['file1', 'file2'])
    return warpdf, components[['name', 'package']]
def visualise_dtw_distances(distance_list):
    """Show a violin plot of the given DTW distances.

    The plot is drawn with seaborn, labelled with the configured project
    name on the x-axis, and displayed through ``plt.show()``.

    Args:
        distance_list: The data handed to ``seaborn.violinplot``.
    """
    violin_axes = seaborn.violinplot(data=distance_list)
    # Label both axes before the figure is displayed.
    violin_axes.set_xlabel(config.project_name)
    violin_axes.set_ylabel('distance in seconds')
    plt.show()
def generate_dtw(x, y):
    """Return the normalized DTW distance between the series ``x`` and ``y``.

    Args:
        x: First series, passed to ``dtw`` as ``x``.
        y: Second series, passed to ``dtw`` as ``y``.

    Returns:
        The ``normalizedDistance`` attribute of the computed alignment.
    """
    return dtw(x=x, y=y).normalizedDistance
def generate_dtw_analysis_files():
    """Run the DTW analysis and write its result to ``dtw.csv``.

    Ensures ``output_directory`` exists, runs ``perform_dtw()``, passes the
    resulting pairs and component information through
    ``add_info_to_cochanges`` and stores the outcome as ``dtw.csv`` inside
    ``output_directory``.
    """
    # Create the directory; exist_ok avoids the race between checking for
    # the directory and creating it.
    os.makedirs(output_directory, exist_ok=True)

    warps, changed_files = perform_dtw()

    # Add package columns
    warp_with_dates = add_info_to_cochanges(warps, changed_files)

    # Store results in files
    warp_with_dates.to_csv(output_directory + "/dtw.csv", index_label=False)