Skip to content

Commit

Permalink
Break out directory fingerprinting logic #69
Browse files (browse the repository at this point in the history)
Signed-off-by: Jono Yang <jyang@nexb.com>
  • Loading branch information
JonoYang committed Jun 22, 2023
1 parent f6c7fbe commit f8d79b0
Show file tree
Hide file tree
Showing 8 changed files with 43 additions and 35 deletions.
46 changes: 27 additions & 19 deletions matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,30 +69,38 @@ def create_structure_fingerprint(directory, children):
return _create_directory_fingerprint(features)


def compute_directory_fingerprints(codebase):
def compute_directory_fingerprints(directory, codebase):
"""
Compute fingerprints for a directory from `codebase`
Compute fingerprints for `directory` from `codebase`
"""
for resource in codebase.walk(topdown=False):
if resource.is_file or not resource.path:
continue
children = [r for r in resource.walk(codebase) if r.is_file]
if len(children) == 1:
continue
children = [r for r in directory.walk(codebase) if r.is_file]
if len(children) == 1:
return

directory_content_fingerprint = create_content_fingerprint(children)
if hasattr(resource, 'directory_content_fingerprint'):
resource.directory_content_fingerprint = directory_content_fingerprint
else:
resource.extra_data['directory_content'] = directory_content_fingerprint
directory_content_fingerprint = create_content_fingerprint(children)
if hasattr(directory, 'directory_content_fingerprint'):
directory.directory_content_fingerprint = directory_content_fingerprint
else:
directory.extra_data['directory_content'] = directory_content_fingerprint

directory_structure_fingerprint = create_structure_fingerprint(resource, children)
if hasattr(resource, 'directory_structure_fingerprint'):
resource.directory_structure_fingerprint = directory_structure_fingerprint
else:
resource.extra_data['directory_structure'] = create_structure_fingerprint(resource, children)
directory_structure_fingerprint = create_structure_fingerprint(directory, children)
if hasattr(directory, 'directory_structure_fingerprint'):
directory.directory_structure_fingerprint = directory_structure_fingerprint
else:
directory.extra_data['directory_structure'] = create_structure_fingerprint(directory, children)

directory.save(codebase)
return directory

resource.save(codebase)

def compute_codebase_directory_fingerprints(codebase):
    """
    Compute fingerprints for directories from `codebase`

    Walk `codebase` bottom-up (`topdown=False`) and call
    `compute_directory_fingerprints` on every resource that is not a file
    and has a non-empty path.

    Return `codebase` so callers can rebind it to the result.
    """
    for resource in codebase.walk(topdown=False):
        # Skip files and any resource with an empty path.
        if resource.is_file or not resource.path:
            continue
        # The helper's return value is not used here, so it is not bound.
        compute_directory_fingerprints(resource, codebase)
    return codebase


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints

from scanpipe.pipelines.scan_package import ScanPackage
from scanpipe.pipes.codebase import ProjectCodebase
Expand Down Expand Up @@ -63,4 +63,4 @@ def fingerprint_codebase(self):
Compute directory fingerprints for matching purposes
"""
project_codebase = ProjectCodebase(self.project)
compute_directory_fingerprints(project_codebase)
compute_codebase_directory_fingerprints(project_codebase)
4 changes: 2 additions & 2 deletions matchcode-toolkit/src/matchcode_toolkit/plugin_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from commoncode.cliutils import PluggableCommandLineOption
from commoncode.cliutils import POST_SCAN_GROUP
from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from plugincode.post_scan import post_scan_impl
from plugincode.post_scan import PostScanPlugin

Expand Down Expand Up @@ -41,4 +41,4 @@ def is_enabled(self, fingerprint, **kwargs):
return fingerprint

def process_codebase(self, codebase, **kwargs):
codebase = compute_directory_fingerprints(codebase)
codebase = compute_codebase_directory_fingerprints(codebase)
6 changes: 3 additions & 3 deletions matchcode-toolkit/tests/test_fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from matchcode_toolkit.fingerprinting import _create_directory_fingerprint
from matchcode_toolkit.fingerprinting import _get_resource_subpath
from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode_toolkit.fingerprinting import create_content_fingerprint
from matchcode_toolkit.fingerprinting import create_halohash_chunks
from matchcode_toolkit.fingerprinting import create_structure_fingerprint
Expand Down Expand Up @@ -95,10 +95,10 @@ def test_create_halohash_chunks(self):
self.assertEqual(chunk3, expected_chunk3)
self.assertEqual(chunk4, expected_chunk4)

def test_compute_directory_fingerprints(self):
def test_compute_codebase_directory_fingerprints(self):
scan_loc = self.get_test_loc('abbrev-1.0.3-i.json')
vc = VirtualCodebase(location=scan_loc)
vc = compute_directory_fingerprints(vc)
vc = compute_codebase_directory_fingerprints(vc)
directory_content = vc.root.extra_data['directory_content']
directory_structure = vc.root.extra_data['directory_structure']
expected_directory_content = '0000000346ce04751a3c98f00086f16a91d9790b'
Expand Down
4 changes: 2 additions & 2 deletions matchcode/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from commoncode.resource import VirtualCodebase

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode.models import ApproximateDirectoryContentIndex
from matchcode.models import ApproximateDirectoryStructureIndex
from matchcode.models import ExactPackageArchiveIndex
Expand Down Expand Up @@ -150,5 +150,5 @@ def index_package_directories(package):
if not vc:
return 0, 0

vc = compute_directory_fingerprints(vc)
vc = compute_codebase_directory_fingerprints(vc)
return index_directory_fingerprints(vc, package)
4 changes: 2 additions & 2 deletions matchcode/tests/test_index_packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from commoncode.resource import VirtualCodebase

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode_toolkit.fingerprinting import hexstring_to_binarray
from matchcode.indexing import _create_virtual_codebase_from_package_resources
from matchcode.indexing import index_directory_fingerprints
Expand Down Expand Up @@ -155,7 +155,7 @@ def test__create_virtual_codebase_from_package_resources(self):

def test_index_directory_fingerprints(self):
vc = _create_virtual_codebase_from_package_resources(self.test_package1)
vc = compute_directory_fingerprints(vc)
vc = compute_codebase_directory_fingerprints(vc)

# Ensure tables are empty prior to indexing
self.assertFalse(ApproximateDirectoryContentIndex.objects.all())
Expand Down
4 changes: 2 additions & 2 deletions matchcode/tests/test_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from commoncode.resource import VirtualCodebase
from packagedb.models import Package

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode.management.commands.index_packages import index_package_directories
from matchcode.match import EXACT_PACKAGE_ARCHIVE_MATCH
from matchcode.match import APPROXIMATE_DIRECTORY_STRUCTURE_MATCH
Expand All @@ -37,7 +37,7 @@ def run_do_match_from_scan(scan_file_location, match_type):
matched_to=attr.ib(default=attr.Factory(list))
)
)
vc = compute_directory_fingerprints(vc)
vc = compute_codebase_directory_fingerprints(vc)
do_match(vc, match_type)
return vc

Expand Down
6 changes: 3 additions & 3 deletions matchcode/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from packagedb.models import Package
import attr

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode_toolkit.fingerprinting import hexstring_to_binarray
from matchcode.management.commands.index_packages import index_package_directories
from matchcode.models import ApproximateDirectoryContentIndex
Expand Down Expand Up @@ -169,7 +169,7 @@ def test_ApproximateDirectoryStructureIndex_match_subdir(self):
location=scan_location,
resource_attributes=dict(packages=attr.ib(default=attr.Factory(list)))
)
codebase = compute_directory_fingerprints(vc)
codebase = compute_codebase_directory_fingerprints(vc)

# populate codebase with match results
for resource in codebase.walk(topdown=True):
Expand All @@ -192,7 +192,7 @@ def test_ApproximateDirectoryContentIndex_match_subdir(self):
location=scan_location,
resource_attributes=dict(packages=attr.ib(default=attr.Factory(list)))
)
codebase = compute_directory_fingerprints(vc)
codebase = compute_codebase_directory_fingerprints(vc)

# populate codebase with match results
for resource in codebase.walk(topdown=True):
Expand Down

0 comments on commit f8d79b0

Please sign in to comment.