diff --git a/hashtheplanet/config/config.py b/hashtheplanet/config/config.py index 3d1622c..0c85625 100644 --- a/hashtheplanet/config/config.py +++ b/hashtheplanet/config/config.py @@ -1,12 +1,20 @@ """ This module handles the config file. """ +from enum import Enum import json from typing import Dict, List from hashtheplanet.resources.git_resource import GitResource from hashtheplanet.resources.npm_resource import NpmResource +class ConfigField(Enum): + """ + This enum contains every field that can be found in the config file + """ + TARGETS = "targets" + EXCLUDE_REGEX = "exclude_regex" + class Config(): """ This class implements methods to manipulate the config file. @@ -25,15 +33,15 @@ def parse(self, config_path: str): with open(config_path, "r", encoding="utf-8") as file_fp: self._config = json.load(file_fp) - def get_targets(self, resource_name: str) -> List[str]: + def get(self, resource_name: str, config_field: ConfigField): """ - This methods returns the targets used by the given resource. + This method returns the content of a field used by the given resource. 
""" - module_info: Dict = self._config.get(resource_name) + field_content: Dict = self._config.get(resource_name) - if module_info is None: - return [] - return module_info.get("targets") + if not config_field or not field_content: + return None + return field_content.get(config_field.value) def get_used_resources(self) -> List[str]: """ diff --git a/hashtheplanet/core/hashtheplanet.py b/hashtheplanet/core/hashtheplanet.py index ba96a82..19519b1 100644 --- a/hashtheplanet/core/hashtheplanet.py +++ b/hashtheplanet/core/hashtheplanet.py @@ -15,7 +15,7 @@ from sqlalchemy.orm import sessionmaker # project imports -from hashtheplanet.config.config import Config +from hashtheplanet.config.config import Config, ConfigField from hashtheplanet.executor.executor import Executor from hashtheplanet.sql.db_connector import Base, DbConnector, Hash @@ -90,10 +90,11 @@ def compute_hashs(self): self._config.parse(self._input_file) for resource_name in self._config.get_used_resources(): - targets = self._config.get_targets(resource_name) + targets = self._config.get(resource_name, ConfigField.TARGETS) or [] + exclude_regex = self._config.get(resource_name, ConfigField.EXCLUDE_REGEX) for target in targets: - self._executor.execute(resource_name, target) + self._executor.execute(resource_name, target, exclude_regex) logger.info("Computing done") diff --git a/hashtheplanet/executor/executor.py b/hashtheplanet/executor/executor.py index 3cba013..a30c98a 100644 --- a/hashtheplanet/executor/executor.py +++ b/hashtheplanet/executor/executor.py @@ -2,6 +2,7 @@ This module handles the resource executions. 
""" from importlib import import_module +from typing import Optional from loguru import logger @@ -17,7 +18,7 @@ def __init__(self, database: DbConnector, session_scope): self._database = database self._session_scope = session_scope - def execute(self, resource_name: str, target: str): + def execute(self, resource_name: str, target: str, exclude_regex: Optional[str] = None): """ This method executes a resource to compute hashes. """ @@ -31,4 +32,4 @@ def execute(self, resource_name: str, target: str): return resource_instance: Resource = getattr(module, resource_class_name)(self._database) - resource_instance.compute_hashes(self._session_scope, target) + resource_instance.compute_hashes(self._session_scope, target, exclude_regex) diff --git a/hashtheplanet/resources/git_resource.py b/hashtheplanet/resources/git_resource.py index 0faaf08..ade765f 100644 --- a/hashtheplanet/resources/git_resource.py +++ b/hashtheplanet/resources/git_resource.py @@ -6,7 +6,7 @@ import subprocess import tempfile from stat import S_ISDIR, S_ISREG -from typing import List, Tuple +from typing import List, Optional, Tuple # third party imports from git import GitCommandError, Repo @@ -54,10 +54,11 @@ def get_all_files_from_commit(commit: Commit) -> List[Tuple[FilePath, BlobHash]] file_list.append((blob.path, blob.hexsha)) return file_list - @staticmethod def _hash_files( + self, files: List[GitFileMetadata], - repo_dir_path: str + repo_dir_path: str, + exclude_regex: Optional[str] ) -> List[FileMetadata]: """ This method calculates the SHA256 hashes of input files. @@ -69,6 +70,8 @@ def _hash_files( os.chdir(repo_dir_path) for (file_path, tag_name, blob_hash) in files: + if not self.should_save(exclude_regex, file_path): + continue try: # We need to use a subprocess and not the GitPython library # because when we execute "git cat-file -p [blob]" with it, it always removes the \n from the last line. 
@@ -192,7 +195,7 @@ def _filter_stored_tags(stored_versions: List[VersionTable], found_tags: List[Ta result.append(found_tag) return result - def compute_hashes(self, session_scope, target: str): + def compute_hashes(self, session_scope, target: str, exclude_regex: Optional[str]): """ This method clones the repository from url, retrieves tags, compares each tags to retrieve only modified files, computes their hashes and then stores the tags & files information in the database. @@ -203,6 +206,7 @@ def compute_hashes(self, session_scope, target: str): with tempfile.TemporaryDirectory() as tmp_dir_name: try: + logger.info(f"Cloning {target}") repo = self.clone_repository(target, tmp_dir_name) except GitCommandError as error: logger.warning(f"Error while cloning repository on {target}: {error}") @@ -225,7 +229,7 @@ def compute_hashes(self, session_scope, target: str): files += self._get_diff_files(tags) logger.info("Generating hashes ...") - files_info = self._hash_files(files, tmp_dir_name) + files_info = self._hash_files(files, tmp_dir_name, exclude_regex) logger.info("Saving hashes ...") self._save_hashes(session_scope, files_info, tags, technology) diff --git a/hashtheplanet/resources/npm_resource.py b/hashtheplanet/resources/npm_resource.py index 8c301fc..3f2f475 100644 --- a/hashtheplanet/resources/npm_resource.py +++ b/hashtheplanet/resources/npm_resource.py @@ -4,7 +4,7 @@ #standard imports import tarfile import tempfile -from typing import Dict, List, Set, Tuple +from typing import Dict, List, Optional, Set, Tuple import requests # third party imports @@ -61,8 +61,7 @@ def save_tar_to_disk(file_path: str, npm_module_name: str, version: str): with open(file_path, 'wb') as file_fd: file_fd.write(request.content) - @staticmethod - def extract_hashes_from_tar(file_path: str) -> List[FileMetadata]: + def extract_hashes_from_tar(self, file_path: str, exclude_regex: Optional[str]) -> List[FileMetadata]: """ This method returns all hashes of all files contained in a 
tar file. """ @@ -72,7 +71,7 @@ def extract_hashes_from_tar(file_path: str) -> List[FileMetadata]: for member in tar.getmembers(): file = tar.extractfile(member) - if file is None: + if file is None or not self.should_save(exclude_regex, member.path): continue files.append((member.path, Hash.hash_bytes(file.read()))) return files @@ -94,7 +93,7 @@ def _save_hashes( self._database.insert_file(session, npm_module_name, file_path) self._database.insert_or_update_hash(session, file_hash, npm_module_name, [version]) - def compute_hashes(self, session_scope, target: str): + def compute_hashes(self, session_scope, target: str, exclude_regex: Optional[str]): """ This method downloads all versions of an npm module and stores all the versions with their associated files and hashes and stores them in the database. @@ -108,6 +107,6 @@ def compute_hashes(self, session_scope, target: str): file_path = f"{tmp_dir_name}/{target}-{version}.tgz" self.save_tar_to_disk(file_path, target, version) - files_info[version] = self.extract_hashes_from_tar(file_path) + files_info[version] = self.extract_hashes_from_tar(file_path, exclude_regex) self._save_hashes(session_scope, files_info, versions, target) diff --git a/hashtheplanet/resources/resource.py b/hashtheplanet/resources/resource.py index 644e266..e01bfe3 100644 --- a/hashtheplanet/resources/resource.py +++ b/hashtheplanet/resources/resource.py @@ -1,6 +1,9 @@ """ This module contains the base class for the resources. 
""" +import re +from typing import Optional + from hashtheplanet.sql.db_connector import DbConnector class Resource(): # pylint: disable=too-few-public-methods @@ -12,8 +15,19 @@ class Resource(): # pylint: disable=too-few-public-methods def __init__(self, database: DbConnector): self._database = database - def compute_hashes(self, session_scope, target: str): + def compute_hashes(self, session_scope, target: str, exclude_regex: Optional[str]): """ This method computes all the versions and their associated files & hashes and stores them in the database. """ raise NotImplementedError() + + @staticmethod + def should_save(exclude_regex: str, file_path: str): + """ + This method permits to verify if the specified file should be saved in the database or not + """ + if not file_path: + return False + if not exclude_regex: + return True + return not re.search(exclude_regex, file_path) diff --git a/src/tech_list.json b/src/tech_list.json index 1ee0d7d..75547d9 100644 --- a/src/tech_list.json +++ b/src/tech_list.json @@ -7,12 +7,14 @@ "https://github.com/drupal/drupal.git", "https://github.com/magento/magento2.git", "https://github.com/joomla/joomla-cms.git" - ] + ], + "exclude_regex": "\\.php|tests\\/|test\\/|\\.gitignore|\\.package|\\.idea|qunit|\\.editorconfig|vendor\\/|\\.vue|\\.scss|\\.less" }, "npm": { "targets": [ "underscore", "jquery" - ] + ], + "exclude_regex": "tests\\/|test\\/|\\.jshintrc|\\.npmignore|\\.bowerrc|\\.jscsrc|\\.bower.json|\\.eslintrc.json|\\.jshintignore|\\.eslintrc|\\.scss|\\.less|\\.vue|\\.scss|\\.less|\\.idea" } } diff --git a/tests/config/test_config.py b/tests/config/test_config.py index e74ad04..8e4e0c8 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -8,7 +8,7 @@ from unittest.mock import MagicMock, mock_open, patch # project imports -from hashtheplanet.config.config import Config +from hashtheplanet.config.config import Config, ConfigField def get_mock_open(files: Dict[str, str]): def open_mock(filename, *args, 
**kwargs): @@ -61,9 +61,9 @@ def test_get_targets(): config = Config() with patch.dict(config._config, {"git": {"targets": ["target1", "target2"]}}): - assert len(config.get_targets("git")) == 2 + assert len(config.get("git", ConfigField.TARGETS)) == 2 - assert len(config.get_targets("npm")) == 0 + assert not config.get("npm", ConfigField.TARGETS) def test_get_used_resources(): config = Config() diff --git a/tests/resources/test_git_resource.py b/tests/resources/test_git_resource.py index d270faa..712dccb 100644 --- a/tests/resources/test_git_resource.py +++ b/tests/resources/test_git_resource.py @@ -91,10 +91,11 @@ def test_hash_files(): git_resource = GitResource(None) + # when there are no excluded files with mock.patch("subprocess.check_output", subprocess_mock(blobs)) as sp_mock, \ mock.patch("os.getcwd", return_value="/foobar/") as getcwd_mock, \ mock.patch("os.chdir", return_value=None) as chdir_mock: - files_metadata = git_resource._hash_files(git_files_metadata, "repo_dir_path") + files_metadata = git_resource._hash_files(git_files_metadata, "repo_dir_path", None) assert sp_mock.call_count == 3 sp_mock.assert_called_with(['git', 'cat-file', '-p', 'e42f952edc48e2c085c206166bf4f1ead4d4b058'], shell=False) @@ -114,11 +115,31 @@ def test_hash_files(): assert files_metadata[1][1] == "1.2.5" assert files_metadata[1][2] == hashlib.sha256(blobs.get("e42f952edc48e2c085c206166bf4f1ead4d4b058")).hexdigest() + git_resource = GitResource(None) + + # When the *.cfg files are excluded + with mock.patch("subprocess.check_output", subprocess_mock(blobs)) as sp_mock, \ + mock.patch("os.getcwd", return_value="/foobar/") as getcwd_mock, \ + mock.patch("os.chdir", return_value=None) as chdir_mock: + files_metadata = git_resource._hash_files(git_files_metadata, "repo_dir_path", "\\.cfg$") + + assert sp_mock.call_count == 2 + + getcwd_mock.assert_called_once() + + assert chdir_mock.call_count == 2 + chdir_mock.assert_called_with("/foobar/") + + assert len(files_metadata) == 1 + 
+ assert files_metadata[0][0] == "LICENSE" + assert files_metadata[0][1] == "1.2.3" + assert files_metadata[0][2] == hashlib.sha256(blobs.get("d159169d1050894d3ea3b98e1c965c4058208fe1")).hexdigest() with mock.patch("subprocess.check_output", subprocess_mock(blobs)) as sp_mock, \ mock.patch("os.getcwd", return_value="/foobar/") as getcwd_mock, \ mock.patch("os.chdir", return_value=None) as chdir_mock: - files_metadata = git_resource._hash_files([["empty", "1.2.1", "empty"]], "repo_dir_path") + files_metadata = git_resource._hash_files([["empty", "1.2.1", "empty"]], "repo_dir_path", None) assert sp_mock.call_count == 1 sp_mock.assert_called_with(['git', 'cat-file', '-p', 'empty'], shell=False) @@ -134,7 +155,7 @@ def test_hash_files(): with mock.patch.object(subprocess, "check_output", MagicMock(side_effect=ValueError("error"))) as mock_exec, \ mock.patch("os.getcwd", return_value="/foobar/") as getcwd_mock, \ mock.patch("os.chdir", return_value=None) as chdir_mock: - git_resource._hash_files(git_files_metadata, "repo_dir_path") + git_resource._hash_files(git_files_metadata, "repo_dir_path", None) getcwd_mock.assert_called_once() @@ -411,7 +432,7 @@ def mock_tmp_dir(): session = MagicMock() git_resource = GitResource(DbConnector()) - git_resource.compute_hashes(session, repo_url) + git_resource.compute_hashes(session, repo_url, None) # In this situation, we verify that by giving a good repo_url & a good tmp_dir_path # we download the tags, calculate hash & store them in the database @@ -420,7 +441,7 @@ def mock_tmp_dir(): mock_get_tag_files.assert_called_once_with(tags[0]) mock_filter_stored_tags.assert_called_once_with([], tags) mock_get_diff_files.assert_called_once_with(tags) - mock_hash_files.assert_called_once_with([1, 2], tmp_dir_path) + mock_hash_files.assert_called_once_with([1, 2], tmp_dir_path, None) mock_save_hashes.assert_called_once_with(session, "hashed files", tags, "foobar") with patch.object( @@ -435,7 +456,7 @@ def mock_tmp_dir(): 
patch.object(GitResource, "_hash_files", return_value="hashed files") as mock_hash_files, \ patch.object(GitResource, "_save_hashes") as mock_save_hashes, \ patch.object(DbConnector, "get_versions") as mock_get_versions: - git_resource.compute_hashes(MagicMock(), repo_url) + git_resource.compute_hashes(MagicMock(), repo_url, None) mock_clone_repo.assert_called_once_with(repo_url, tmp_dir_path) # In this situation, we verify that by giving a wrong repository we stop the function diff --git a/tests/resources/test_npm_resource.py b/tests/resources/test_npm_resource.py index 32fe2fb..d1fd6cb 100644 --- a/tests/resources/test_npm_resource.py +++ b/tests/resources/test_npm_resource.py @@ -72,7 +72,7 @@ def mocked_get_request(url: str, *args, **kwargs): def test_extract_hashes_from_tar(): path = "./test.tgz" - members = ["a", "b", None, "c"] + members = ["a", "b.txt", None, "c"] class MockedTarMember(): def __init__(self, member) -> None: @@ -105,10 +105,11 @@ def mocked_open_tar(file_path: str) -> MockedTarFile: assert file_path == path return MockedTarFile() + # when there are no excluded files with mock.patch("tarfile.open", MagicMock(side_effect=mocked_open_tar)) as mock_open_tar: npm_resource = NpmResource(MagicMock()) - files = npm_resource.extract_hashes_from_tar(path) + files = npm_resource.extract_hashes_from_tar(path, None) assert mock_open_tar.called is True assert mock_open_tar.call_count == 1 @@ -117,12 +118,27 @@ def mocked_open_tar(file_path: str) -> MockedTarFile: assert files[0][0] == "a" assert files[0][1] == hashlib.sha256("a".encode("utf-8")).hexdigest() - assert files[1][0] == "b" - assert files[1][1] == hashlib.sha256("b".encode("utf-8")).hexdigest() + assert files[1][0] == "b.txt" + assert files[1][1] == hashlib.sha256("b.txt".encode("utf-8")).hexdigest() assert files[2][0] == "c" assert files[2][1] == hashlib.sha256("c".encode("utf-8")).hexdigest() + # When the *.txt files are excluded + with mock.patch("tarfile.open", 
MagicMock(side_effect=mocked_open_tar)) as mock_open_tar: + npm_resource = NpmResource(MagicMock()) + + files = npm_resource.extract_hashes_from_tar(path, "\\.txt$") + assert mock_open_tar.called is True + assert mock_open_tar.call_count == 1 + + assert len(files) == 2 + + assert files[0][0] == "a" + assert files[0][1] == hashlib.sha256("a".encode("utf-8")).hexdigest() + + assert files[1][0] == "c" + assert files[1][1] == hashlib.sha256("c".encode("utf-8")).hexdigest() def test_save_hashes(): npm_module_name = "test" @@ -174,7 +190,7 @@ def __exit__(self, *args): def mock_tmp_dir(): return MockDir() - def mock_extract_hashes_from_tar(file_path: str): + def mock_extract_hashes_from_tar(file_path: str, exclude_regex: str): if file_path == f"{tmp_dir_path}/{target}-1.2.3.tgz": return files_info["1.2.3"] elif file_path == f"{tmp_dir_path}/{target}-1.2.4.tgz": @@ -190,7 +206,7 @@ def mock_extract_hashes_from_tar(file_path: str): mock.patch("tempfile.TemporaryDirectory", MagicMock(side_effect=mock_tmp_dir)): npm_resource = NpmResource(MagicMock()) session = MagicMock() - npm_resource.compute_hashes(session, target) + npm_resource.compute_hashes(session, target, None) mock_versions.assert_called_once() mock_tar.call_count == 2 diff --git a/tests/resources/test_resource.py b/tests/resources/test_resource.py index 5f12086..ba63618 100644 --- a/tests/resources/test_resource.py +++ b/tests/resources/test_resource.py @@ -17,7 +17,21 @@ def test_compute_hashes(): resource = Resource("database") try: - resource.compute_hashes(None, None) + resource.compute_hashes(None, None, None) assert False except NotImplementedError as error: assert True + +def test_should_save(): + resource = Resource("test") + + assert resource.should_save(".php", "test.php") is False + assert resource.should_save(".js", "test.php") is True + assert resource.should_save("^tests/", "tests/foobar.js") is False + +def test_should_save_none(): + resource = Resource("test") + + assert resource.should_save(None, 
"test.php") is True + assert resource.should_save(".js", None) is False + assert resource.should_save(None, None) is False