diff --git a/merge_excavator/config.py b/merge_excavator/config.py index 67c97ea..66c3d3d 100644 --- a/merge_excavator/config.py +++ b/merge_excavator/config.py @@ -1,6 +1,6 @@ # Keys -GITHUB_KEY = '' +GITHUB_KEY = '0f06f1b13421972ae88b514ac1c680853f23cc7b' # Paths REPOSITORY_PATH = '../working_dir/repository/' @@ -10,7 +10,7 @@ QUERY_PATH = '../queries/' # DB information -DB_HOST = '' -DB_NAME = '' -DB_USER_NAME = '' -DB_PASSWORD = '' +DB_HOST = 'localhost' +DB_NAME = 'Merge_Data' +DB_USER_NAME = 'root' +DB_PASSWORD = '123' diff --git a/merge_excavator/data_convertion.py b/merge_excavator/data_convertion.py index 763b857..724e0f1 100644 --- a/merge_excavator/data_convertion.py +++ b/merge_excavator/data_convertion.py @@ -6,17 +6,21 @@ cd_to_csv = 'cd {};'.format(config.TEMP_CSV_PATH) table_list = ['Repository', - 'Merge_Replay', 'Conflicting_File', 'Conflicting_Region', 'Merge_Scenario', - 'Code_style_violation', + 'Merge_Replay', + 'Code_Style_Violation', 'Code_Complexity', 'Merge_Related_Commit'] -os.system('mysql -u {} -p < {}Merge_Data.sql'.format(config.DB_USER_NAME, config.QUERY_PATH)) +os.system('mysql -u {} < {}Merge_Data.sql'.format(config.DB_USER_NAME, config.QUERY_PATH)) +os.system(cd_to_csv + 'mkdir temp') for table in table_list: - os.system(cd_to_csv + 'cat {}_* > {}.csv'.format(table, table)) - os.system(cd_to_csv + 'mysqlimport --fields-terminated-by=, --verbose --local' - ' -u {} -p {} {}.csv'.format(config.DB_USER_NAME, config.DB_NAME, table)) + os.system(cd_to_csv + 'cat {}_* | tr -d "\r" > ./temp/{}.csv'.format(table, table)) + # os.system(cd_to_csv + 'mysql -u {} -e "USE {};LOAD DATA LOCAL INFILE \'./temp/{}.csv\' INTO TABLE {} FIELDS TERMINATED BY \',\' ENCLOSED BY \'\' LINES TERMINATED BY \'\n\' ;"'.format(config.DB_USER_NAME, config.DB_NAME, table, table)) + os.system(cd_to_csv + 'mysqlimport --fields-escaped-by='' --fields-terminated-by="," --lines-terminated-by="\n" --verbose --local -u root Merge_Data ./temp/{}.csv '.format(table)) +os.system(cd_to_csv + 'rm -r temp') + + diff --git a/merge_excavator/merge_replay.py b/merge_excavator/merge_replay.py index 3477bb9..6f912f0 100644 --- a/merge_excavator/merge_replay.py +++ b/merge_excavator/merge_replay.py @@ -32,7 +32,7 @@ def __init__(self): '\\@\\@\\@ \\-(\\d+),(\\d+) \\-(\\d+),(\\d+) \\+(\\d+),(\\d+) \\@\\@\\@[\\s\\S]*') def merge_replay(self, repository_name, merge_technique, merge_commit, parents_commit, exec_compile, exec_tests, - exec_conflicting_file, exec_conflicting_region, exec_replay_comparison): + exec_conflicting_file, exec_conflicting_region, exec_replay_comparison, repository_id): """ This method replay merges, and store the related information in tables. :param repository_name: The name of the repository in / format @@ -44,6 +44,7 @@ def merge_replay(self, repository_name, merge_technique, merge_commit, parents_c :param exec_conflicting_file: Whether the information of the conflicting files should be stored :param exec_conflicting_region: Whether the information of the conflicting regions should be stored :param exec_replay_comparison: Whether the replay and merge commit should compare + :param repository_id: The GitHub id of repository :return: Nothing """ @@ -100,9 +101,9 @@ def merge_replay(self, repository_name, merge_technique, merge_commit, parents_c # Store the merge replay information merge_replay_data = [merge_technique, is_conflict, replay_can_compile, replay_can_pass_test, execution_time, - replay_is_equal_to_merge_commit] + replay_is_equal_to_merge_commit, merge_commit, repository_id] csv_file = open(config.TEMP_CSV_PATH + 'Merge_Replay_{}.csv'.format(repository_name), 'a') - csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"') + csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', lineterminator='\n') csv_writer.writerow(merge_replay_data) csv_file.close() @@ -129,7 +130,8 @@ def merge_replay(self, repository_name, merge_technique, merge_commit, parents_c conflicting_file = rename_add_conflict_match.group(2) # Store the merge replay information - conflicting_file_data = [conflicting_file, conflict_type] + conflicting_file_data = [conflicting_file.strip(), conflict_type, merge_technique, merge_commit, + repository_id] csv_file = open(config.TEMP_CSV_PATH + 'Conflicting_File_{}.csv'.format(repository_name), 'a') csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"') csv_writer.writerow(conflicting_file_data) @@ -153,7 +155,7 @@ def merge_replay(self, repository_name, merge_technique, merge_commit, parents_c # Store the conflicting region information conflicting_region_data = [parent1_path, parent2_path, diff_parent1_start, diff_parent1_length, - diff_parent2_start, diff_parent2_length] + diff_parent2_start, diff_parent2_length, merge_technique, merge_commit, repository_id] csv_file = open(config.TEMP_CSV_PATH + 'Conflicting_Region_{}.csv'.format(repository_name), 'a') csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"') csv_writer.writerow(conflicting_region_data) diff --git a/merge_excavator/merge_scenario_data.py b/merge_excavator/merge_scenario_data.py index 887bc53..7ea1807 100644 --- a/merge_excavator/merge_scenario_data.py +++ b/merge_excavator/merge_scenario_data.py @@ -1,6 +1,7 @@ import csv import time +import numpy import logging from dateutil.relativedelta import relativedelta as rd from time import gmtime, strftime @@ -31,7 +32,7 @@ def get_merge_scenario_info(repository_name, merge_technique, exec_compile, exec merge_commits = git_utility.get_merge_commits()[1:] #TODO: Why the first one is not in git log? # Repository Data - store_repository_info(repository_name) + repository_id = store_repository_info(repository_name) for merge_commit in merge_commits: @@ -85,7 +86,8 @@ def get_merge_scenario_info(repository_name, merge_technique, exec_compile, exec ancestor_can_compile, ancestor_can_pass_test, parent1_can_compile, parent1_can_pass_test, parent2_can_compile, parent2_can_pass_test, - merge_commit_date, ancestor_date, parent1_date, parent2_date, is_pull_request] + merge_commit_date, ancestor_date, parent1_date, parent2_date, is_pull_request, + repository_id] csv_file = open(config.TEMP_CSV_PATH + 'Merge_Scenario_{}.csv'.format(repository_name), 'a') csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"') csv_writer.writerow(merge_scenario_data) @@ -93,12 +95,13 @@ def get_merge_scenario_info(repository_name, merge_technique, exec_compile, exec # Merge replay merge_replay.merge_replay(repository_name, merge_technique, merge_commit, parents_commit, exec_compile, exec_tests, - exec_conflicting_file, exec_conflicting_region, exec_replay_comparison) + exec_conflicting_file, exec_conflicting_region, exec_replay_comparison, repository_id) # Store the related commits information if exec_related_commits: for index, parent in enumerate(parents_commit): - store_commit_info_between_two_commits(git_utility, ancestor_commit, parent, index + 1) + store_commit_info_between_two_commits(git_utility, ancestor_commit, parent, index + 1, + merge_commit, repository_id) # Store code style violation if exec_code_style_violation: @@ -107,15 +110,18 @@ def get_merge_scenario_info(repository_name, merge_technique, exec_compile, exec parent1_style_violations = get_code_violation_num(repository_name, parents_commit[0]) parent2_style_violations = get_code_violation_num(repository_name, parents_commit[1]) code_style_violation_data = [merge_commit_style_violations, ancestor_style_violations, - parent1_style_violations, parent2_style_violations] - csv_file = open(config.TEMP_CSV_PATH + 'Code_style_violation_{}.csv'.format(repository_name), 'a') + parent1_style_violations, parent2_style_violations, + merge_commit, repository_id] + csv_file = open(config.TEMP_CSV_PATH + 'Code_Style_Violation_{}.csv'.format(repository_name), 'a') csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"') csv_writer.writerow(code_style_violation_data) csv_file.close() # Store code complexity if exec_complexity: - code_complexity_data = get_code_complexity_diff(repository_name, parents_commit[0], parents_commit[1]) + code_complexity_data = get_code_complexity_diff(repository_name, parents_commit[0], parents_commit[1])\ + .tolist()\ + +[merge_commit, repository_id] csv_file = open(config.TEMP_CSV_PATH + 'Code_Complexity_{}.csv'.format(repository_name), 'a') csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"') csv_writer.writerow(code_complexity_data) diff --git a/merge_excavator/related_commits.py b/merge_excavator/related_commits.py index 289b8a1..cd5815b 100644 --- a/merge_excavator/related_commits.py +++ b/merge_excavator/related_commits.py @@ -1,11 +1,12 @@ import csv +import numpy as np from config import * from GitUtil import * -def store_commit_info_between_two_commits(git_utility, commit1, commit2, parent_num): +def store_commit_info_between_two_commits(git_utility, commit1, commit2, parent_num, merge_commit, repository_id): commit_list = git_utility.get_commit_list_between_two_commits(commit1, commit2) for commit in commit_list: commit_date = git_utility.get_commit_date(commit) @@ -15,8 +16,8 @@ def store_commit_info_between_two_commits(git_utility, commit1, commit2, parent_ line_changes = git_utility.getChangedLineNumBetweenTwoCommits(commit1, commit2) # Store the merge related commits - merge_related_commits_data = [commit, commit_date, commit_message, branch_name, parent_num] + \ - file_changes + list(line_changes) + merge_related_commits_data = [commit.strip(), commit_date, commit_message, branch_name, parent_num] + \ + file_changes + list(line_changes) + [merge_commit, repository_id] csv_file = open(config.TEMP_CSV_PATH + 'Merge_Related_Commit_{}.csv'.format(git_utility.repository_name), 'a') csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"') csv_writer.writerow(merge_related_commits_data) diff --git a/merge_excavator/repository_data.py b/merge_excavator/repository_data.py index 79965f7..49734e1 100644 --- a/merge_excavator/repository_data.py +++ b/merge_excavator/repository_data.py @@ -30,3 +30,4 @@ def store_repository_info(repository_name): csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"') csv_writer.writerow(repository_data) csv_file.close() + return json_data['id']