Skip to content

Commit

Permalink
Merge pull request #55 from avancinirodrigo/develop
Browse files Browse the repository at this point in the history
Adding repository snapshot feature
  • Loading branch information
avancinirodrigo committed Sep 11, 2019
2 parents a611071 + bdb1938 commit 6bc663a
Show file tree
Hide file tree
Showing 15 changed files with 388 additions and 108 deletions.
6 changes: 6 additions & 0 deletions .bettercodehub.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
component_depth: 1
languages:
- python
exclude:
- /poc/.*
- /scripts/.*
2 changes: 1 addition & 1 deletion ecolyzer/repository/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
from .person import Person
from .author import Author
from .modification import ModificationInfo, Modification
from .git import Git
from .gitpython import GitPython
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from git import Repo

class Git:
"""Git"""
class GitPython:
"""GitPython"""
def __init__(self, path):
self.repo = Repo(path)
self.git_dir = self.repo.git_dir
Expand Down
24 changes: 21 additions & 3 deletions ecolyzer/repository/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from sqlalchemy.orm.collections import attribute_mapped_collection
from ecolyzer.dataaccess import Base
from .author import Author
from .git import Git
from .commit import Commit
from .gitpython import GitPython

class Repository(Base):
"""Repository"""
Expand All @@ -13,15 +14,17 @@ class Repository(Base):
path = Column(String, nullable=False, unique=True)
_authors = relationship(Author,
collection_class=attribute_mapped_collection('email'))
_commits = relationship(Commit,
collection_class=attribute_mapped_collection('hash'))

def __init__(self, path):
if Git.IsGitRepo(path):
if GitPython.IsGitRepo(path):
self.path = path
else:
raise Exception('Invalid repository path \'{0}\''.format(path))

def add_author(self, author):
if author not in self._authors:
if author.email not in self._authors:
self._authors[author.email] = author
author.repository = self
else:
Expand All @@ -35,3 +38,18 @@ def get_author(self, email):
return self._authors[email]
else:
raise ValueError('Author \'{0}\' not exists'.format(email))

def add_commit(self, commit):
    """Register ``commit`` in this repository, keyed by its hash.

    Raises:
        ValueError: if a commit with the same hash is already registered.
    """
    commit_hash = commit.hash
    if self.commit_exists(commit_hash):
        raise ValueError('Commit \'{0}\' is already present'.format(commit_hash))
    self._commits[commit_hash] = commit

def commit_exists(self, hash):
    """Tell whether a commit with the given hash is already registered.

    Membership is tested against the keys of the ``_commits`` collection.
    """
    registered = self._commits
    return hash in registered

def get_commit(self, hash):
    """Return the registered commit stored under ``hash``.

    Raises:
        ValueError: if no commit with that hash is registered.
    """
    if not self.commit_exists(hash):
        raise ValueError('Commit \'{0}\' not exists'.format(hash))
    return self._commits[hash]
155 changes: 131 additions & 24 deletions ecolyzer/repository/repository_miner.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from git import Repo
from ecolyzer.system import File, SourceFile, Operation
from ecolyzer.parser import StaticAnalyzer
from ecolyzer.dataaccess import NullSession
from .commit import CommitInfo, Commit
from .modification import ModificationInfo, Modification
from .person import Person
Expand All @@ -15,10 +16,9 @@ class RepositoryMiner:
def __init__(self, repo, system):
self.repo = repo
self.system = system
self.source_file_extensions = [
#'c', 'cc', 'cpp', 'h', 'hpp', 'hxx',
'lua',
]
self.source_file_extensions = {}
self.source_file_extensions['lua'] = 'lua'
self.ignore_dir_with = {}
self.from_commit = None
self.to_commit = None
self.from_tag = None
Expand All @@ -35,37 +35,138 @@ def tag_interval(self, from_tag, to_tag):

def extract(self, session, hash=None, max_count=sys.maxsize):
count = 0
for commit_driller in RepositoryMining(self.repo.path,
for commit_driller in RepositoryMining(path_to_repo=[self.repo.path],
only_modifications_with_file_types=self.source_file_extensions,
single=hash,
#from_commit=self.from_commit, to_commit=self.to_commit,
from_tag=self.from_tag, to_tag=self.to_tag,
#filepath='CellularSpace.lua',
only_in_branch=['master'],
only_no_merge=self.only_no_merge).traverse_commits():

commit_info = self._get_commit_info(commit_driller)
author = self._check_author(session, commit_info.author_name, commit_info.author_email)
commit = Commit(commit_info, author, self.repo)
session.add(commit)
for mod_info in commit_info.modifications:
filepath = self._check_filepath(mod_info)
if self._is_source_file_ext(File.Extension(filepath)):
file = self._check_file(mod_info)
mod = Modification(mod_info, file, commit)
if mod.status != 'DELETE':
srcfile = self._check_source_file(file)
code_elements = self._extract_code_elements(srcfile, mod.source_code)
for element in code_elements:
code_element = self._check_code_element(session, srcfile, element, mod)
session.add(mod)
self._extract_from_driller(commit_driller, session)
count += 1
if count == max_count:
session.commit()
return
session.commit()

def _check_code_element(self, session, source_file, element, modification):
def _extract_from_driller(self, commit_driller, session):
# Processes one commit yielded by the mining loop: builds a Commit for it and,
# for every modification whose path passes _is_valid_source, a Modification
# linked to that commit and to the file returned by _check_file.
# NOTE(review): indentation is lost in this view, so the exact nesting of the
# trailing `session.add(mod)` cannot be confirmed from here. `mod` is only
# bound inside the `_is_valid_source` branch, so it must sit at or below it.
commit_info = self._get_commit_info(commit_driller)
# Resolve the Author for the commit's author name/email via _check_author.
author = self._check_author(session, commit_info.author_name, commit_info.author_email)
commit = Commit(commit_info, author, self.repo)
for mod_info in commit_info.modifications:
filepath = self._check_filepath(mod_info)
# Only paths accepted by _is_valid_source are recorded.
if self._is_valid_source(filepath):
file = self._check_file(mod_info)
mod = Modification(mod_info, file, commit)
# Code elements are only extracted when the status is not 'DELETE'.
if mod.status != 'DELETE':
srcfile = self._check_source_file(file)
code_elements = self._extract_code_elements(srcfile, mod.source_code)
for element in code_elements:
# The return value is bound to `code_element` but never read in this block.
code_element = self._check_code_element(session, srcfile, element, mod)
# Only the Modification is handed to the session here; the Commit itself is
# not added explicitly -- presumably it is reached through the Modification's
# relationship; TODO confirm.
session.add(mod)

def extract_last_commits(self, session=NullSession(), rev=None):
# For every source blob found by _repo_file_blobs, looks up the most recent
# commit touching that blob's path (optionally restricted to `rev`) and records
# a Modification tying the file to that commit.
# NOTE(review): the default `session=NullSession()` is evaluated once, when the
# function is defined, so every call that omits `session` shares one instance.
# NOTE(review): indentation is lost in this view; the nesting of the trailing
# `session.add(mod)` / `session.commit()` lines cannot be confirmed from here.
repo = Repo(self.repo.path)
blobs = self._repo_file_blobs(repo)
for blob in blobs:
# `commit` first holds the object returned by _last_commit_from_path ...
commit = self._last_commit_from_path(blob.path, repo, rev)
commit_info = self._get_commit_info_from_gitpython(commit)
author = self._check_author(session, commit_info.author_name, commit_info.author_email)
# ... and is then rebound to whatever _check_commit returns for that info.
commit = self._check_commit(commit_info, author)
mod_info = self._get_modification_from_gitpython(blob)
file = self._check_file(mod_info)
mod = Modification(mod_info, file, commit)
srcfile = self._check_source_file(file)
code_elements = self._extract_code_elements(srcfile, mod.source_code)
for element in code_elements:
# The return value is bound to `code_element` but never read in this block.
code_element = self._check_code_element(session, srcfile, element, mod)
session.add(mod)
session.commit()

def _get_commit_info_from_gitpython(self, commit):
    """Build a ``CommitInfo`` from a GitPython commit object.

    Copies the hash (``hexsha``), the authored date, the message and the
    author identity of ``commit``.

    Args:
        commit: object exposing ``hexsha``, ``authored_datetime``,
            ``message`` and ``author`` (with ``name`` and ``email``).

    Returns:
        The populated ``CommitInfo``.
    """
    commit_info = CommitInfo(commit.hexsha)
    commit_info.date = commit.authored_datetime
    commit_info.msg = commit.message
    # The fields being filled are author_name/author_email and the date is the
    # *authored* date, so read the author identity. The committer can be a
    # different person (e.g. after a rebase, cherry-pick or web merge).
    author = commit.author
    commit_info.author_name = author.name
    commit_info.author_email = author.email
    return commit_info

def _get_modification_from_gitpython(self, blob):
    """Build a ``ModificationInfo`` describing ``blob`` as it exists in the tree.

    Only ``new_path`` and ``source_code`` are set on the result; the
    ``old_path``, ``added``, ``removed`` and ``status`` fields are not
    assigned here.
    """
    blob_path = blob.path
    info = ModificationInfo(blob_path)
    info.new_path = blob_path
    info.source_code = self._get_blob_source_code(blob)
    return info

def _last_commit_from_path(self, fullpath, repo, rev):
return list(repo.iter_commits(rev=rev, paths=fullpath, max_count=1))[0]

def extract_current_files(self, session=NullSession()):
    """Extract the source files currently present in the repository tree.

    Opens the repository at ``self.repo.path``, collects its source blobs
    with ``_repo_file_blobs`` and hands them to ``_extract_current_files``.

    Args:
        session: object receiving the extracted entities. The default
            instance is created once, when the function is defined.
    """
    repo = Repo(self.repo.path)
    # _repo_file_blobs fetches the tree itself, so no separate repo.tree()
    # call is needed here.
    blobs = self._repo_file_blobs(repo)
    self._extract_current_files(blobs, session)

def _repo_file_blobs(self, repo):
tree = repo.tree()
blobs = []
for dir in tree.trees:
self._navigate_dirs(dir.trees, blobs)
self._get_file_blobs(dir, blobs)
return blobs

def _navigate_dirs(self, trees, blobs):
if len(trees) > 0:
for dir in trees:
self._navigate_dirs(dir.trees, blobs)
self._get_file_blobs(dir, blobs)

def _get_file_blobs(self, dir, blobs):
for blob in dir.blobs:
if self._is_valid_source(blob.path):
blobs.append(blob)

def _valid_dir(self, dirpath):
for dir in self.ignore_dir_with:
if dir in dirpath:
return False
return True

def add_ignore_dir_with(self, dir):
    """Register ``dir`` as a path fragment to ignore.

    The fragment is stored in ``self.ignore_dir_with`` as both key and value.
    """
    ignored = self.ignore_dir_with
    ignored[dir] = dir

def _is_valid_source(self, filepath):
    """Tell whether ``filepath`` should be treated as a source file.

    The path is split with ``File.Split``; the extension must pass
    ``_valid_ext`` and the directory part must pass ``_valid_dir``. The
    directory check only runs when the extension check succeeds.
    """
    directory, _name, extension = File.Split(filepath)
    if not self._valid_ext(extension):
        return False
    return self._valid_dir(directory)

def _extract_current_files(self, blobs, session):
# For each blob whose path passes _is_valid_source: registers the file via
# _add_file, obtains its source file via _check_source_file, reads the blob's
# text and runs code-element extraction on it.
# NOTE(review): indentation is lost in this view; the nesting of the trailing
# `session.add(srcfile)` / `session.commit()` lines cannot be confirmed from
# here. `srcfile` is only bound inside the `_is_valid_source` branch.
for blob in blobs:
if self._is_valid_source(blob.path):
file = self._add_file(blob.path)
srcfile = self._check_source_file(file)
srccode = self._get_blob_source_code(blob)
code_elements = self._extract_code_elements(srcfile, srccode)
for element in code_elements:
# Called without a modification argument; the result bound to `code_element`
# is never read in this block.
code_element = self._check_code_element(session, srcfile, element)
session.add(srcfile)
session.commit()

def _get_blob_source_code(self, blob):
data = blob.data_stream.read()
return data.decode('utf-8')

def _create_modification(self, source_file, source_code): #TODO: use in extract_current_files
    """Build a ``ModificationInfo`` for ``source_file`` with ``source_code``.

    Args:
        source_file: object whose ``fullpath()`` gives the file's path.
        source_code: text to store as the modification's source code.

    Returns:
        The populated ``ModificationInfo``.
    """
    # Previously this read `mod.filename` before `mod` was bound, which raised
    # UnboundLocalError, and the object was never returned. The path is used
    # for the constructor argument as well as for `new_path`.
    fullpath = source_file.fullpath()
    mod = ModificationInfo(fullpath)
    mod.new_path = fullpath
    mod.source_code = source_code
    return mod

def _check_code_element(self, session, source_file, element, modification=None):
if not source_file.code_element_exists(element):
source_file.add_code_element(element)
element.modification = modification
Expand Down Expand Up @@ -123,7 +224,13 @@ def _check_author(self, session, name, email):
self.repo.add_author(author)
return author

def _is_source_file_ext(self, ext):
def _check_commit(self, commit_info, author):
if not self.repo.commit_exists(commit_info.hash):
commit = Commit(commit_info, author, self.repo)
self.repo.add_commit(commit)
return self.repo.get_commit(commit_info.hash)

def _valid_ext(self, ext):
return ext in self.source_file_extensions

def get_commit_info(self, hash):
Expand Down Expand Up @@ -165,7 +272,7 @@ def _extract_code_elements(self, source_file, source_code):
return []

def is_source_file(self, file):
return self._is_source_file_ext(file.ext)
return self._valid_ext(file.ext)

def extract_code_elements(self, source_file, modification):
return self._extract_code_elements(source_file, modification.source_code)
Expand Down
39 changes: 22 additions & 17 deletions ecolyzer/system/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,22 +55,7 @@ def fullpath(self, fullpath):
if fullpath == None:
return
self._fullpath = None
path, file_with_ext = os.path.split(fullpath)
filename = ''
ext = ''
if '.' in file_with_ext:
split_list = file_with_ext.split('.')
if len(split_list) > 2:
ext = split_list.pop()
filename = '.'.join(split_list)
else:
if file_with_ext.startswith('.'):
filename = '.' + split_list[1]
else:
filename = split_list[0]
ext = split_list[1]
else:
filename = file_with_ext
path, filename, ext = File.Split(fullpath)
self.path = path
self.name = filename
self.ext = ext
Expand Down Expand Up @@ -99,5 +84,25 @@ def Extension(fullpath):
else:
if not file_with_ext.startswith('.'):
ext = split_list[1]
return ext

@staticmethod
def Split(fullpath):
    """Split ``fullpath`` into ``(path, filename, ext)``.

    ``path`` is the directory part as returned by ``os.path.split``. ``ext``
    is the text after the last dot, without the dot. A name with no dot, or
    a name whose only dot is the leading one (e.g. ``.gitignore``), has an
    empty ``ext`` and keeps the whole name as ``filename``.

    Args:
        fullpath: path of the file to split.

    Returns:
        tuple ``(path, filename, ext)`` of strings.
    """
    path, file_with_ext = os.path.split(fullpath)
    if '.' not in file_with_ext:
        return path, file_with_ext, ''

    pieces = file_with_ext.split('.')
    if len(pieces) > 2:
        # Several dots: only the last piece is the extension.
        ext = pieces.pop()
        return path, '.'.join(pieces), ext
    if file_with_ext.startswith('.'):
        # A single leading dot marks a hidden file, not an extension.
        return path, file_with_ext, ''

    filename, ext = pieces
    # A stale `return ext` used to sit before this return; callers unpack
    # three values, so the 3-tuple is the only result returned.
    return path, filename, ext
40 changes: 40 additions & 0 deletions poc/gitpython.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from git import Repo

def show_tree_files(tree, spaces_count):
# Proof-of-concept helper: walks the blobs of `tree` and prints details for the
# blob whose path is 'packages/base/lua/Cell.lua'.
# NOTE(review): indentation is lost in this view; whether the read/print of the
# blob content sits inside the path check cannot be confirmed from here.
# Indentation prefix made of `spaces_count` blanks.
spaces = ''
spaces += ' ' * spaces_count
for blob in tree.blobs:
#print(blob.name)
if blob.path == 'packages/base/lua/Cell.lua':
print(spaces + '++', blob.name, blob.path, type(blob))
# The content is decoded as ASCII, so non-ASCII bytes raise UnicodeDecodeError.
data = blob.data_stream.read()
print(data.decode('ascii'))

def show_tree_dirs(trees, spaces_count):
    """Recursively show the files of every tree in ``trees``.

    Each tree's files are shown with ``show_tree_files`` before descending
    into its sub-trees; the indentation grows by two at every level.

    Args:
        trees: iterable of tree objects exposing ``trees``.
        spaces_count: indentation width of the current level.
    """
    # The indentation string is built by show_tree_files itself, so the unused
    # local prefix and the redundant emptiness check were dropped.
    for tree in trees:
        show_tree_files(tree, spaces_count + 2)
        show_tree_dirs(tree.trees, spaces_count + 2)

# Proof-of-concept script: opens the repository at the hard-coded relative path
# below, prints its git directory and the type of its root tree, then prints
# each root entry with the first commit iter_commits yields for its path.
path = 'repo/terrame'
repo = Repo(path)
git_dir = repo.git_dir

print(git_dir)

tree = repo.tree()
print(type(tree))
# Earlier experiments, kept commented out:
#for t in tree.trees:
# print(t.name, type(t))
#show_tree_dirs(t.trees, 2)
#show_tree_files(t, 2)
#print(t.__dict__)
#print(' --', t.trees[0].name)
#for b in t.blobs:
# print(' --', b.name, b.path)

# One iter_commits call per root entry, limited to a single commit; indexing [0]
# raises IndexError if nothing is yielded for that path.
for obj in tree:
print(obj, type(obj.path), list(repo.iter_commits(paths=obj.path, max_count=1))[0])
11 changes: 9 additions & 2 deletions poc/pydriller_methods.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from pydriller import RepositoryMining
from pydriller import RepositoryMining, GitRepository

def show_modification(repo_path):
for commit in RepositoryMining(repo_path, only_in_branch=['master'],
Expand Down Expand Up @@ -27,4 +27,11 @@ def show_modification(repo_path):
return

repo_path = 'repo/terrame'
show_modification(repo_path)
#show_modification(repo_path)

repo = GitRepository(repo_path)
files = repo.files()

for f in files:
if f.endswith('.lua'):
print(f, type(f))
Loading

0 comments on commit 6bc663a

Please sign in to comment.