Skip to content

Commit

Permalink
Add data conversion (CSV -> SQL)
Browse files Browse the repository at this point in the history
  • Loading branch information
mkareshk committed Sep 1, 2018
1 parent 8ce9dbc commit 0a30b74
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 26 deletions.
10 changes: 5 additions & 5 deletions merge_excavator/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

# Keys
GITHUB_KEY = ''
GITHUB_KEY = '0f06f1b13421972ae88b514ac1c680853f23cc7b'

# Paths
REPOSITORY_PATH = '../working_dir/repository/'
Expand All @@ -10,7 +10,7 @@
QUERY_PATH = '../queries/'

# DB information
DB_HOST = ''
DB_NAME = ''
DB_USER_NAME = ''
DB_PASSWORD = ''
DB_HOST = 'localhost'
DB_NAME = 'Merge_Data'
DB_USER_NAME = 'root'
DB_PASSWORD = '123'
16 changes: 10 additions & 6 deletions merge_excavator/data_convertion.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,21 @@

cd_to_csv = 'cd {};'.format(config.TEMP_CSV_PATH)
table_list = ['Repository',
'Merge_Replay',
'Conflicting_File',
'Conflicting_Region',
'Merge_Scenario',
'Code_style_violation',
'Merge_Replay',
'Code_Style_Violation',
'Code_Complexity',
'Merge_Related_Commit']

os.system('mysql -u {} -p < {}Merge_Data.sql'.format(config.DB_USER_NAME, config.QUERY_PATH))
os.system('mysql -u {} < {}Merge_Data.sql'.format(config.DB_USER_NAME, config.QUERY_PATH))

os.system(cd_to_csv + 'mkdir temp')
for table in table_list:
os.system(cd_to_csv + 'cat {}_* > {}.csv'.format(table, table))
os.system(cd_to_csv + 'mysqlimport --fields-terminated-by=, --verbose --local'
' -u {} -p {} {}.csv'.format(config.DB_USER_NAME, config.DB_NAME, table))
os.system(cd_to_csv + 'cat {}_* | tr -d "\r" > ./temp/{}.csv'.format(table, table))
# os.system(cd_to_csv + 'mysql -u {} -e "USE {};LOAD DATA LOCAL INFILE \'./temp/{}.csv\' INTO TABLE {} FIELDS TERMINATED BY \',\' ENCLOSED BY \'\' LINES TERMINATED BY \'\n\' ;"'.format(config.DB_USER_NAME, config.DB_NAME, table, table))
os.system(cd_to_csv + 'mysqlimport --fields-escaped-by='' --fields-terminated-by="," --lines-terminated-by="\n" --verbose --local -u root Merge_Data ./temp/{}.csv '.format(table))
os.system(cd_to_csv + 'rm -r temp')


12 changes: 7 additions & 5 deletions merge_excavator/merge_replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def __init__(self):
'\\@\\@\\@ \\-(\\d+),(\\d+) \\-(\\d+),(\\d+) \\+(\\d+),(\\d+) \\@\\@\\@[\\s\\S]*')

def merge_replay(self, repository_name, merge_technique, merge_commit, parents_commit, exec_compile, exec_tests,
exec_conflicting_file, exec_conflicting_region, exec_replay_comparison):
exec_conflicting_file, exec_conflicting_region, exec_replay_comparison, repository_id):
"""
This method replays merges and stores the related information in tables.
:param repository_name: The name of the repository in <USER_NAME>/<REPOSITORY_NAME> format
Expand All @@ -44,6 +44,7 @@ def merge_replay(self, repository_name, merge_technique, merge_commit, parents_c
:param exec_conflicting_file: Whether the information of the conflicting files should be stored
:param exec_conflicting_region: Whether the information of the conflicting regions should be stored
:param exec_replay_comparison: Whether the replay and the merge commit should be compared
:param repository_id: The GitHub id of the repository
:return: Nothing
"""

Expand Down Expand Up @@ -100,9 +101,9 @@ def merge_replay(self, repository_name, merge_technique, merge_commit, parents_c

# Store the merge replay information
merge_replay_data = [merge_technique, is_conflict, replay_can_compile, replay_can_pass_test, execution_time,
replay_is_equal_to_merge_commit]
replay_is_equal_to_merge_commit, merge_commit, repository_id]
csv_file = open(config.TEMP_CSV_PATH + 'Merge_Replay_{}.csv'.format(repository_name), 'a')
csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"')
csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', lineterminator='\n')
csv_writer.writerow(merge_replay_data)
csv_file.close()

Expand All @@ -129,7 +130,8 @@ def merge_replay(self, repository_name, merge_technique, merge_commit, parents_c
conflicting_file = rename_add_conflict_match.group(2)

# Store the merge replay information
conflicting_file_data = [conflicting_file, conflict_type]
conflicting_file_data = [conflicting_file.strip(), conflict_type, merge_technique, merge_commit,
repository_id]
csv_file = open(config.TEMP_CSV_PATH + 'Conflicting_File_{}.csv'.format(repository_name), 'a')
csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"')
csv_writer.writerow(conflicting_file_data)
Expand All @@ -153,7 +155,7 @@ def merge_replay(self, repository_name, merge_technique, merge_commit, parents_c

# Store the conflicting region information
conflicting_region_data = [parent1_path, parent2_path, diff_parent1_start, diff_parent1_length,
diff_parent2_start, diff_parent2_length]
diff_parent2_start, diff_parent2_length, merge_technique, merge_commit, repository_id]
csv_file = open(config.TEMP_CSV_PATH + 'Conflicting_Region_{}.csv'.format(repository_name), 'a')
csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"')
csv_writer.writerow(conflicting_region_data)
Expand Down
20 changes: 13 additions & 7 deletions merge_excavator/merge_scenario_data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

import csv
import time
import numpy
import logging
from dateutil.relativedelta import relativedelta as rd
from time import gmtime, strftime
Expand Down Expand Up @@ -31,7 +32,7 @@ def get_merge_scenario_info(repository_name, merge_technique, exec_compile, exec
merge_commits = git_utility.get_merge_commits()[1:] #TODO: Why the first one is not in git log?

# Repository Data
store_repository_info(repository_name)
repository_id = store_repository_info(repository_name)


for merge_commit in merge_commits:
Expand Down Expand Up @@ -85,20 +86,22 @@ def get_merge_scenario_info(repository_name, merge_technique, exec_compile, exec
ancestor_can_compile, ancestor_can_pass_test,
parent1_can_compile, parent1_can_pass_test,
parent2_can_compile, parent2_can_pass_test,
merge_commit_date, ancestor_date, parent1_date, parent2_date, is_pull_request]
merge_commit_date, ancestor_date, parent1_date, parent2_date, is_pull_request,
repository_id]
csv_file = open(config.TEMP_CSV_PATH + 'Merge_Scenario_{}.csv'.format(repository_name), 'a')
csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"')
csv_writer.writerow(merge_scenario_data)
csv_file.close()

# Merge replay
merge_replay.merge_replay(repository_name, merge_technique, merge_commit, parents_commit, exec_compile, exec_tests,
exec_conflicting_file, exec_conflicting_region, exec_replay_comparison)
exec_conflicting_file, exec_conflicting_region, exec_replay_comparison, repository_id)

# Store the related commits information
if exec_related_commits:
for index, parent in enumerate(parents_commit):
store_commit_info_between_two_commits(git_utility, ancestor_commit, parent, index + 1)
store_commit_info_between_two_commits(git_utility, ancestor_commit, parent, index + 1,
merge_commit, repository_id)

# Store code style violation
if exec_code_style_violation:
Expand All @@ -107,15 +110,18 @@ def get_merge_scenario_info(repository_name, merge_technique, exec_compile, exec
parent1_style_violations = get_code_violation_num(repository_name, parents_commit[0])
parent2_style_violations = get_code_violation_num(repository_name, parents_commit[1])
code_style_violation_data = [merge_commit_style_violations, ancestor_style_violations,
parent1_style_violations, parent2_style_violations]
csv_file = open(config.TEMP_CSV_PATH + 'Code_style_violation_{}.csv'.format(repository_name), 'a')
parent1_style_violations, parent2_style_violations,
merge_commit, repository_id]
csv_file = open(config.TEMP_CSV_PATH + 'Code_Style_Violation_{}.csv'.format(repository_name), 'a')
csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"')
csv_writer.writerow(code_style_violation_data)
csv_file.close()

# Store code complexity
if exec_complexity:
code_complexity_data = get_code_complexity_diff(repository_name, parents_commit[0], parents_commit[1])
code_complexity_data = get_code_complexity_diff(repository_name, parents_commit[0], parents_commit[1])\
.tolist()\
+[merge_commit, repository_id]
csv_file = open(config.TEMP_CSV_PATH + 'Code_Complexity_{}.csv'.format(repository_name), 'a')
csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"')
csv_writer.writerow(code_complexity_data)
Expand Down
7 changes: 4 additions & 3 deletions merge_excavator/related_commits.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@

import csv
import numpy as np

from config import *
from GitUtil import *


def store_commit_info_between_two_commits(git_utility, commit1, commit2, parent_num):
def store_commit_info_between_two_commits(git_utility, commit1, commit2, parent_num, merge_commit, repository_id):
commit_list = git_utility.get_commit_list_between_two_commits(commit1, commit2)
for commit in commit_list:
commit_date = git_utility.get_commit_date(commit)
Expand All @@ -15,8 +16,8 @@ def store_commit_info_between_two_commits(git_utility, commit1, commit2, parent_
line_changes = git_utility.getChangedLineNumBetweenTwoCommits(commit1, commit2)

# Store the merge related commits
merge_related_commits_data = [commit, commit_date, commit_message, branch_name, parent_num] + \
file_changes + list(line_changes)
merge_related_commits_data = [commit.strip(), commit_date, commit_message, branch_name, parent_num] + \
file_changes + list(line_changes) + [merge_commit, repository_id]
csv_file = open(config.TEMP_CSV_PATH + 'Merge_Related_Commit_{}.csv'.format(git_utility.repository_name), 'a')
csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"')
csv_writer.writerow(merge_related_commits_data)
Expand Down
1 change: 1 addition & 0 deletions merge_excavator/repository_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ def store_repository_info(repository_name):
csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"')
csv_writer.writerow(repository_data)
csv_file.close()
return json_data['id']

0 comments on commit 0a30b74

Please sign in to comment.