diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..50a06f1 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,32 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] +NIL + +## [0.2.0] - 2019-04-20 +Complies with [Dirhash Standard](https://github.com/andhus/dirhash) Version [0.1.0](https://github.com/andhus/dirhash/releases/v0.1.0) + +### Added +- A first implementation based on the formalized [Dirhash Standard](https://github.com/andhus/dirhash). +- This changelog. +- Results from a new benchmark run after the changes. `benchmark/run.py` now outputs results files whose names include the `dirhash.__version__`. + +### Changed +- **Significant breaking changes** from version 0.1.1 - both regarding the API and the +underlying method/protocol for computing the hash. This means that **hashes +computed with this version will differ from hashes computed with version < 0.2.0 for +the same directory**. +- This dirhash python implementation has moved to +[github.com/andhus/dirhash-python](https://github.com/andhus/dirhash-python) from +the previous repository +[github.com/andhus/dirhash](https://github.com/andhus/dirhash), +which now contains the formal description of the Dirhash Standard. + +### Removed +- All support for the `.dirhashignore` file. This seemed superfluous; please file an +issue if you need this feature. diff --git a/README.md b/README.md index d7b8fae..e9c938b 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,15 @@ -[![Build Status](https://travis-ci.com/andhus/dirhash.svg?branch=master)](https://travis-ci.com/andhus/dirhash) -[![codecov](https://codecov.io/gh/andhus/dirhash/branch/master/graph/badge.svg)](https://codecov.io/gh/andhus/dirhash) +[![Build Status](https://travis-ci.com/andhus/dirhash-python.svg?branch=master)](https://travis-ci.com/andhus/dirhash-python) +[![codecov](https://codecov.io/gh/andhus/dirhash-python/branch/master/graph/badge.svg)](https://codecov.io/gh/andhus/dirhash-python) # dirhash -A lightweight python module and tool for computing the hash of any +A lightweight python module and CLI for computing the hash of any directory based on its files' structure and content. -- Supports any hashing algorithm of Python's built-in `hashlib` module -- `.gitignore` style "wildmatch" patterns for expressive filtering of files to -include/exclude. +- Supports all hashing algorithms of Python's built-in `hashlib` module. +- Glob/wildcard (".gitignore style") path matching for expressive filtering of files to include/exclude. - Multiprocessing for up to [6x speed-up](#performance) +The hash is computed according to the [Dirhash Standard](https://github.com/andhus/dirhash), which is designed to allow for consistent and collision-resistant generation/verification of directory hashes across implementations.
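For illustration, a minimal sketch (not part of the original README) of the generate-then-verify workflow that this cross-implementation consistency enables; the directory path is a placeholder:

```python
from dirhash import dirhash

# Compute and record a checksum for a directory tree.
expected = dirhash("path/to/directory", "sha256")

# Later (or via any other Dirhash Standard implementation), recompute and
# compare to detect any change in the included files' structure or content.
assert dirhash("path/to/directory", "sha256") == expected
```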
+ ## Installation From PyPI: ```commandline pip install dirhash ``` Or directly from source: ```commandline -git clone git@github.com:andhus/dirhash.git +git clone git@github.com:andhus/dirhash-python.git pip install dirhash/ ``` @@ -25,16 +26,16 @@ Python module: ```python from dirhash import dirhash -dirpath = 'path/to/directory' -dir_md5 = dirhash(dirpath, 'md5') -filtered_sha1 = dirhash(dirpath, 'sha1', ignore=['.*', '.*/', '*.pyc']) -pyfiles_sha3_512 = dirhash(dirpath, 'sha3_512', match=['*.py']) +dirpath = "path/to/directory" +dir_md5 = dirhash(dirpath, "md5") +pyfiles_md5 = dirhash(dirpath, "md5", match=["*.py"]) +no_hidden_sha1 = dirhash(dirpath, "sha1", ignore=[".*", ".*/"]) ``` CLI: ```commandline dirhash path/to/directory -a md5 -dirhash path/to/directory -a sha1 -i ".* .*/ *.pyc" -dirhash path/to/directory -a sha3_512 -m "*.py" +dirhash path/to/directory -a md5 --match "*.py" +dirhash path/to/directory -a sha1 --ignore ".*" ".*/" ``` ## Why? @@ -66,7 +67,7 @@ and executing `hashlib` code. The main effort to boost performance is support for multiprocessing, where the reading and hashing is parallelized over individual files. -As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash/blob/master/dirhash/cli.py) +As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/cli.py) with the shell command: `find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` @@ -87,7 +88,7 @@ shell reference | nested_32k_32kB | 6.82 | -> 1.0 `dirhash` | nested_32k_32kB | 3.43 | 2.00 `dirhash`(8 workers)| nested_32k_32kB | 1.14 | **6.00** -The benchmark was run a MacBook Pro (2018), further details and source code [here](https://github.com/andhus/dirhash/tree/master/benchmark). +The benchmark was run on a MacBook Pro (2018); further details and source code [here](https://github.com/andhus/dirhash-python/tree/master/benchmark). ## Documentation -Please refer to `dirhash -h` and the python [source code](https://github.com/andhus/dirhash/blob/master/dirhash/__init__.py). \ No newline at end of file +Please refer to `dirhash -h`, the python [source code](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/__init__.py), and the [Dirhash Standard](https://github.com/andhus/dirhash).
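As a sketch of how the multiprocess speed-up reported in the Performance section is invoked from the CLI, using the `-j`/`--jobs` option introduced in this release (the path is a placeholder; the same can be done from Python via the `jobs` argument of `dirhash`):

```commandline
dirhash path/to/folder -a md5 -j 8
```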
\ No newline at end of file diff --git a/benchmark/results_v0.2.0.csv b/benchmark/results_v0.2.0.csv new file mode 100644 index 0000000..0e783dc --- /dev/null +++ b/benchmark/results_v0.2.0.csv @@ -0,0 +1,51 @@ +,test_case,implementation,algorithm,workers,t_best,t_median,speed-up (median) +0,flat_8_128MB,shell reference,md5,1,2.079,2.083,1.0 +1,flat_8_128MB,dirhash_impl,md5,1,1.734,1.945,1.0709511568123393 +2,flat_8_128MB,dirhash_impl,md5,2,0.999,1.183,1.760777683854607 +3,flat_8_128MB,dirhash_impl,md5,4,0.711,0.728,2.8612637362637368 +4,flat_8_128MB,dirhash_impl,md5,8,0.504,0.518,4.021235521235521 +5,flat_1k_1MB,shell reference,md5,1,3.383,3.679,1.0 +6,flat_1k_1MB,dirhash_impl,md5,1,1.846,1.921,1.9151483602290473 +7,flat_1k_1MB,dirhash_impl,md5,2,1.137,1.158,3.1770293609671847 +8,flat_1k_1MB,dirhash_impl,md5,4,0.74,0.749,4.911882510013351 +9,flat_1k_1MB,dirhash_impl,md5,8,0.53,0.534,6.889513108614231 +10,flat_32k_32kB,shell reference,md5,1,13.827,18.213,1.0 +11,flat_32k_32kB,dirhash_impl,md5,1,13.655,13.808,1.3190179606025494 +12,flat_32k_32kB,dirhash_impl,md5,2,3.276,3.33,5.469369369369369 +13,flat_32k_32kB,dirhash_impl,md5,4,2.409,2.421,7.522924411400249 +14,flat_32k_32kB,dirhash_impl,md5,8,2.045,2.086,8.731064237775648 +15,nested_1k_1MB,shell reference,md5,1,3.284,3.332,1.0 +16,nested_1k_1MB,dirhash_impl,md5,1,1.717,1.725,1.9315942028985504 +17,nested_1k_1MB,dirhash_impl,md5,2,1.026,1.034,3.222437137330754 +18,nested_1k_1MB,dirhash_impl,md5,4,0.622,0.633,5.263823064770932 +19,nested_1k_1MB,dirhash_impl,md5,8,0.522,0.529,6.29867674858223 +20,nested_32k_32kB,shell reference,md5,1,11.898,12.125,1.0 +21,nested_32k_32kB,dirhash_impl,md5,1,13.858,14.146,0.8571327583769263 +22,nested_32k_32kB,dirhash_impl,md5,2,2.781,2.987,4.059256779377302 +23,nested_32k_32kB,dirhash_impl,md5,4,1.894,1.92,6.315104166666667 +24,nested_32k_32kB,dirhash_impl,md5,8,1.55,1.568,7.732780612244897 +25,flat_8_128MB,shell reference,sha1,1,2.042,2.05,1.0 +26,flat_8_128MB,dirhash_impl,sha1,1,1.338,1.354,1.5140324963072376 +27,flat_8_128MB,dirhash_impl,sha1,2,0.79,0.794,2.5818639798488663 +28,flat_8_128MB,dirhash_impl,sha1,4,0.583,0.593,3.456998313659359 +29,flat_8_128MB,dirhash_impl,sha1,8,0.483,0.487,4.209445585215605 +30,flat_1k_1MB,shell reference,sha1,1,2.118,2.129,1.0 +31,flat_1k_1MB,dirhash_impl,sha1,1,1.39,1.531,1.3905943827563685 +32,flat_1k_1MB,dirhash_impl,sha1,2,0.925,0.932,2.2843347639484977 +33,flat_1k_1MB,dirhash_impl,sha1,4,0.614,0.629,3.384737678855326 +34,flat_1k_1MB,dirhash_impl,sha1,8,0.511,0.52,4.094230769230769 +35,flat_32k_32kB,shell reference,sha1,1,10.551,10.97,1.0 +36,flat_32k_32kB,dirhash_impl,sha1,1,4.663,4.76,2.304621848739496 +37,flat_32k_32kB,dirhash_impl,sha1,2,3.108,3.235,3.3910355486862445 +38,flat_32k_32kB,dirhash_impl,sha1,4,2.342,2.361,4.6463362981787375 +39,flat_32k_32kB,dirhash_impl,sha1,8,2.071,2.094,5.2387774594078325 +40,nested_1k_1MB,shell reference,sha1,1,2.11,2.159,1.0 +41,nested_1k_1MB,dirhash_impl,sha1,1,1.436,1.47,1.4687074829931972 +42,nested_1k_1MB,dirhash_impl,sha1,2,0.925,0.937,2.3041622198505864 +43,nested_1k_1MB,dirhash_impl,sha1,4,0.627,0.643,3.357698289269051 +44,nested_1k_1MB,dirhash_impl,sha1,8,0.516,0.527,4.096774193548386 +45,nested_32k_32kB,shell reference,sha1,1,3.982,7.147,1.0 +46,nested_32k_32kB,dirhash_impl,sha1,1,4.114,4.156,1.7196823869104911 +47,nested_32k_32kB,dirhash_impl,sha1,2,2.598,2.616,2.7320336391437308 +48,nested_32k_32kB,dirhash_impl,sha1,4,1.809,1.831,3.9033315128345167 +49,nested_32k_32kB,dirhash_impl,sha1,8,1.552,1.58,4.523417721518987 diff 
--git a/benchmark/results_v0.2.0.json b/benchmark/results_v0.2.0.json new file mode 100644 index 0000000..71a652b --- /dev/null +++ b/benchmark/results_v0.2.0.json @@ -0,0 +1,402 @@ +[ + { + "test_case": "flat_8_128MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 2.079, + "t_median": 2.083 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.734, + "t_median": 1.945 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 0.999, + "t_median": 1.183 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.711, + "t_median": 0.728 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.504, + "t_median": 0.518 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 3.383, + "t_median": 3.679 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.846, + "t_median": 1.921 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 1.137, + "t_median": 1.158 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.74, + "t_median": 0.749 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.53, + "t_median": 0.534 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 13.827, + "t_median": 18.213 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 13.655, + "t_median": 13.808 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 3.276, + "t_median": 3.33 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 2.409, + "t_median": 2.421 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 2.045, + "t_median": 2.086 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 3.284, + "t_median": 3.332 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 1.717, + "t_median": 1.725 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 1.026, + "t_median": 1.034 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 0.622, + "t_median": 0.633 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 0.522, + "t_median": 0.529 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "shell reference", + "algorithm": "md5", + "workers": 1, + "t_best": 11.898, + "t_median": 12.125 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 1, + "t_best": 13.858, + "t_median": 14.146 + }, + { + "test_case": "nested_32k_32kB", + "implementation": 
"dirhash", + "algorithm": "md5", + "workers": 2, + "t_best": 2.781, + "t_median": 2.987 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 4, + "t_best": 1.894, + "t_median": 1.92 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "md5", + "workers": 8, + "t_best": 1.55, + "t_median": 1.568 + }, + { + "test_case": "flat_8_128MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.042, + "t_median": 2.05 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.338, + "t_median": 1.354 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.79, + "t_median": 0.794 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.583, + "t_median": 0.593 + }, + { + "test_case": "flat_8_128MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.483, + "t_median": 0.487 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.118, + "t_median": 2.129 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.39, + "t_median": 1.531 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.925, + "t_median": 0.932 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.614, + "t_median": 0.629 + }, + { + "test_case": "flat_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.511, + "t_median": 0.52 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 10.551, + "t_median": 10.97 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 4.663, + "t_median": 4.76 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 3.108, + "t_median": 3.235 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 2.342, + "t_median": 2.361 + }, + { + "test_case": "flat_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 2.071, + "t_median": 2.094 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 2.11, + "t_median": 2.159 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 1.436, + "t_median": 1.47 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 0.925, + "t_median": 0.937 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 0.627, + "t_median": 0.643 + }, + { + "test_case": "nested_1k_1MB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 0.516, + "t_median": 0.527 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "shell reference", + "algorithm": "sha1", + "workers": 1, + "t_best": 3.982, + "t_median": 
7.147 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 1, + "t_best": 4.114, + "t_median": 4.156 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 2, + "t_best": 2.598, + "t_median": 2.616 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 4, + "t_best": 1.809, + "t_median": 1.831 + }, + { + "test_case": "nested_32k_32kB", + "implementation": "dirhash", + "algorithm": "sha1", + "workers": 8, + "t_best": 1.552, + "t_median": 1.58 + } +] \ No newline at end of file diff --git a/benchmark/run.py b/benchmark/run.py index 5dc5ed3..f930b2e 100644 --- a/benchmark/run.py +++ b/benchmark/run.py @@ -6,6 +6,8 @@ from statistics import median, mean +from dirhash import __version__ + BENCHMARK_ROOT = os.path.abspath( os.path.join(__file__, os.pardir) @@ -117,7 +119,7 @@ def get_reference_shell_cmd(dirpath, algorithm): def get_dirhash_shell_cmd(dirpath, algorithm, workers=1): - return 'dirhash {} -a {} -w {}'.format(dirpath, algorithm, workers) + return 'dirhash {} -a {} -j {}'.format(dirpath, algorithm, workers) def benchmark(dirpath, algorithm, **kwargs): @@ -164,7 +166,9 @@ def benchmark(dirpath, algorithm, **kwargs): result = benchmark(test_case, algorithm=alg, runs=5, repetitions=1) results.extend(result) - with open(os.path.join(BENCHMARK_ROOT, 'results.json'), 'w') as f: + result_fname = 'results_v{}'.format(__version__) + + with open(os.path.join(BENCHMARK_ROOT, result_fname + '.json'), 'w') as f: json.dump(results, f, indent=4) try: @@ -188,6 +192,6 @@ def benchmark(dirpath, algorithm, **kwargs): print(df_hd_1w) print('\nAverage speedup multiprocess (8 workers): {}'.format(mean_speedup_8w)) print(df_hd_8w) - df.to_csv(os.path.join(BENCHMARK_ROOT, 'results.csv')) + df.to_csv(os.path.join(BENCHMARK_ROOT, result_fname + '.csv')) except ImportError: pass diff --git a/setup.py b/setup.py index 05d7790..242919a 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ import os from setuptools import setup, find_packages -VERSION = '0.1.1' +VERSION = '0.2.0' DESCRIPTION = 'Python module and CLI for hashing of file system directories.' @@ -19,14 +19,11 @@ description=DESCRIPTION, long_description=long_description, long_description_content_type="text/markdown", - url='https://github.com/andhus/dirhash', + url='https://github.com/andhus/dirhash-python', author="Anders Huss", author_email="andhus@kth.se", license='MIT', - install_requires=[ - 'pathspec>=0.5.9', - 'scandir>=1.9.0;python_version<"3.5"' - ], + install_requires=['scantree>=0.0.1'], packages=find_packages('src'), package_dir={'': 'src'}, include_package_data=True, diff --git a/src/dirhash/__init__.py b/src/dirhash/__init__.py index 4a4d875..cee8bd6 100644 --- a/src/dirhash/__init__.py +++ b/src/dirhash/__init__.py @@ -1,9 +1,5 @@ #!/usr/bin/env python -"""dirhash - a python module (and CLI) for hashing of file system directories. - -Provides the functions: -- `dirhash` -- `get_included_paths`. +"""dirhash - a python library (and CLI) for hashing of file system directories. 
""" from __future__ import print_function, division @@ -14,445 +10,588 @@ from functools import partial from multiprocessing import Pool -# Use the built-in version of scandir/walk if possible (python > 3.5), -# otherwise use the scandir module version -try: - from os import scandir -except ImportError: # pragma: no cover - from scandir import scandir - -from pathspec import PathSpec -from pathspec import RecursionError as _RecursionError -from pathspec.patterns import GitWildMatchPattern - +from scantree import ( + scantree, + RecursionFilter, + CyclicLinkedDir, +) + +__all__ = [ + '__version__', + 'algorithms_guaranteed', + 'algorithms_available', + 'dirhash', + 'dirhash_impl', + 'included_paths', + 'Filter', + 'get_match_patterns', + 'Protocol' +] __version__ = pkg_resources.require("dirhash")[0].version algorithms_guaranteed = {'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'} algorithms_available = hashlib.algorithms_available -ignorefilename = '.dirhashignore' - def dirhash( directory, algorithm, - match=None, + match=("*",), ignore=None, + linked_dirs=True, + linked_files=True, + empty_dirs=False, + entry_properties=('name', 'data'), + allow_cyclic_links=False, chunk_size=2**20, - content_only=False, - paths_only=False, - follow_links=True, - include_empty=False, - workers=None, - **kwargs + jobs=1 ): """Computes the hash of a directory based on its structure and content. # Arguments - directory (str | pathlib.Path): Path to the directory to hash. - algorithm (str): The name of the hashing algorithm to use. It is also - possible to provide a callable object that returns an instance - implementing the `hashlib._hashlib.HASH` interface. - match ([str] | None): A list (or iterable) of match-patterns for files to - include when computing the hash. Default `None` which is equivalent to - `['*']`, i.e. everything is included. See "Path Selection and Filtering" + directory: Union[str, pathlib.Path] - Path to the directory to hash. + algorithm: str - The name of the hashing algorithm to use. See + `dirhash.algorithms_available` for the available options. + match: Iterable[str] - An iterable of glob/wildcard match-patterns for paths + to include when computing the hash. Default is ["*"] which means that all + files and directories are matched. To e.g. only include python source + files, use: `match=["*.py"]`. See "Path Selection and Filtering" section below for further details. - ignore ([str] | None): A list (or iterable) of match-patterns for files to - ignore when computing the hash. Default `None` (no ignore patterns). See - "Path Selection and Filtering" below for further details. - chunk_size (int): The number of bytes to read in one go from files while + ignore: Optional[Iterable[str]] - An iterable of glob/wildcard match-patterns + for paths to ignore when computing the hash. Default `None` (no ignore + patterns). To e.g. exclude hidden files and directories use: + `ignore=[".*/", ".*"]`. See "Path Selection and Filtering" section below + for further details. + linked_dirs: bool - If `True` (default), follow symbolic links to other + *directories* and include these and their content in the hash + computation. + linked_files: bool - If `True` (default), include symbolic linked files in + the hash computation. + empty_dirs: bool - If `True`, include empty directories when computing the + hash. A directory is considered empty if it does not contain any files + that *matches provided matching criteria*. Default `False`, i.e. 
empty + directories are ignored (as is done in git version control). + entry_properties: Iterable[str] - A set (i.e. order does not matter) of the + file/directory properties to consider when computing the hash. Supported + properties are {"name", "data", "is_link"} where at least one of + "name" and "data" must be included. Default is ["name", "data"] which + means that the content (actual data) as well as the path relative to the + root `directory` of files will affect the hash value. See "Entry + Properties Interpretation" section below for further details. + allow_cyclic_links: bool - If `False` (default) a `SymlinkRecursionError` is + raised on presence of cyclic symbolic links. If set to `True` the + dirhash value for the directory causing the cyclic link is replaced with the + hash function hexdigest of the relative path from the link to the target. + chunk_size: int - The number of bytes to read in one go from files while being hashed. A too small size will slow down the processing and a larger size consumes more working memory. Default 2**20 byte = 1 MiB. - content_only (bool): Compute the hash only based on the content of files - - *not* their names or the names of their parent directories. Default - `False`. - NOTE that the tree structure in which files are organized under the - the `directory` root still influences the computed hash. As longs as all - files have the same content and are organised the same way in relation to - all other files in the Directed Acyclic Graph representing the file-tree, - the hash will remain the same (but the "name of nodes" does not matter). - This option can e.g. be used to verify that that data is unchanged after - renaming files (change extensions etc.). - paths_only (bool): Compute the hash only based on the name and location of - files in the file tree under the `directory` root. Default `False`. - This option can e.g. be used to check if any files have been - added/moved/removed, ignoring the content of each file. This is - considerably faster than including content. - follow_links (bool): If true, follow symbolic links to other *directories*. - NOTE that symbolic links to other *files* are always included (as if the - link was the actual file). Default `False`. - include_empty (bool): Include empty directories when computing the hash. A - directory is considered empty if it does not contain any files *matching - provided matching criteria*. Default `False`, i.e. empty directories are - ignored (as with git version control). - workers (int | None): The number of processes to use when computing the hash. - Default `None`, equivalent to `1`, which means no multiprocessing is - used. NOTE that using multiprocessing can significantly speed-up - execution, see `https://github.com/andhus/dirhash/tree/master/benchmark` - for further details. + jobs: int - The number of processes to use when computing the hash. + Default `1`, which means that a single (the main) process is used. NOTE + that using multiprocessing can significantly speed up execution, see + `https://github.com/andhus/dirhash-python/tree/master/benchmark` for further + details. # Returns - The hash/checksum as a string the of hexadecimal digits (the result of + str - The hash/checksum as a string of the hexadecimal digits (the result of `hexdigest` method of the hashlib._hashlib.HASH object corresponding to the provided `algorithm`). # Raises - ValueError: For incorrectly provided arguments. + TypeError/ValueError: For incorrectly provided arguments.
SymlinkRecursionError: In case the `directory` contains symbolic links that - lead to (infinite) recursion. + lead to (infinite) recursion and `allow_cyclic_links=False` (default). # Path Selection and Filtering - Provided match-patterns determine what paths within the `directory` to - include when computing the hash value. These follow the ".gitignore - wildcard style" of path matching. Paths *relative to the root `directory` - (excluding the name of the directory itself) are matched against the - patterns. + Provided glob/wildcard (".gitignore style") match-patterns determine what + paths within the `directory` to include when computing the hash value. Paths + *relative to the root `directory`* (i.e. excluding the name of the root + directory itself) are matched against the patterns. The `match` argument represents what should be *included* - as opposed - to `ignore` patterns for which matches are *excluded*. Using `ignore` is + to the `ignore` argument for which matches are *excluded*. Using `ignore` is just short for adding the same patterns to the `match` argument with the prefix "!", i.e. the calls below are equivalent: - - `dirhash(..., match=['*', '!<pattern>'])` - `dirhash(..., match=['*', '!<pattern>'], ignore=[])` - `dirhash(..., match=['*'], ignore=['<pattern>'])` - `dirhash(..., ignore=['<pattern>'])` - - If a file named ".dirhashignore" (available by the `dirhash.ignorefilename` - module attribute) exists *directly* under the provided `directory`, then each - line (not starting with "#") of this file is added to the ignore patterns. - - The following kwargs can also be used (possibly together with `match` and/or - `ignore`): - `ignore_extensions` ([str]): list (iterable) of file extensions to - exclude. Short for adding `'*[.]<extension>'` to the `ignore` patterns - where the dot [.] is added if <extension> does not already start with - a dot. - `ignore_hidden` (bool): Short for adding `['.*', '.*/']` to the `ignore` - patterns, which will exclude hidden files and directories. - - To validate which paths are included, call `dirhash.get_included_paths` with - the same values for the arguments: `match`, `ignore` `follow_links`, - `include_empty`, `ignore_extensions` and `ignore_hidden` to get a list of all - paths that will be included when computing the hash by this function. + `dirhash(..., match=["*", "!<pattern>"])` + `dirhash(..., ignore=["<pattern>"])` + To validate which paths are included, call `dirhash.included_paths` with + the same values for the arguments: `match`, `ignore`, `linked_dirs`, + `linked_files` and `empty_dirs` to get a list of all paths that will be + included when computing the hash by this function. + + # Entry Properties Interpretation + - ["name", "data"] (Default) - The name as well as data is included. Due to + the recursive nature of the dirhash computation, "name" implies that the + path relative to the root `directory` of each file/directory affects the + computed hash value. + - ["data"] - Compute the hash only based on the data of files - + *not* their names or the names of their parent directories. NOTE that + the tree structure in which files are organized under the `directory` + root still influences the computed hash. As long as all files have + the same content and are organised the same way in relation to all + other files in the Directed Acyclic Graph representing the file-tree, + the hash will remain the same (but the "name of nodes" does not + matter). This option can e.g. be used to verify that data is + unchanged after renaming files (change extensions etc.).
+ - ["name"] - Compute the hash only based on the name and location of + files in the file tree under the `directory` root. This option can + e.g. be used to check if any files have been added/moved/removed, + ignoring the content of each file. + - "is_link" - if this options is added to any of the cases above the + hash value is also affected by whether a file or directory is a + symbolic link or not. NOTE: with this property added, the hash + will be different than without it even if there are no symbolic links + in the directory. + + # References + See https://github.com/andhus/dirhash/README.md for a formal + description of how the returned hash value is computed. """ - abspath = os.path.abspath(directory) - _verify_is_directory(abspath) + filter_ = Filter( + match_patterns=get_match_patterns(match=match, ignore=ignore), + linked_dirs=linked_dirs, + linked_files=linked_files, + empty_dirs=empty_dirs + ) + protocol = Protocol( + entry_properties=entry_properties, + allow_cyclic_links=allow_cyclic_links + ) + return dirhash_impl( + directory=directory, + algorithm=algorithm, + filter_=filter_, + protocol=protocol, + chunk_size=chunk_size, + jobs=jobs + ) + + +def dirhash_impl( + directory, + algorithm, + filter_=None, + protocol=None, + chunk_size=2**20, + jobs=1 +): + """Computes the hash of a directory based on its structure and content. - if content_only and paths_only: - raise ValueError( - 'only one of arguments `content_only` and `paths_only` can be True') + In contrast to `dirhash.dirhash`, this function accepts custom implementations of + the `dirhash.Filter` and `dirhash.Protocol` classes. + # Arguments + directory: Union[str, pathlib.Path] - Path to the directory to hash. + algorithm: str - The name of the hashing algorithm to use. See + `dirhash.algorithms_available` for the available options. + It is also possible to provide a callable object that returns an instance + implementing the `hashlib._hashlib.HASH` interface. + filter_: dirhash.Filter - Determines what files and directories to include + when computing the hash. See docs of `dirhash.Filter` for further + details. + protocol: dirhash.Protocol - Determines (mainly) what properties of files and + directories to consider when computing the hash value. + chunk_size: int - The number of bytes to read in one go from files while + being hashed. A too small size will slow down the processing and a larger + size consumes more working memory. Default 2**20 byte = 1 MiB. + jobs: int - The number of processes to use when computing the hash. + Default `1`, which means that a single (the main) process is used. NOTE + that using multiprocessing can significantly speed-up execution, see + `https://github.com/andhus/dirhash/tree/master/benchmark` for further + details. + + # Returns + str - The hash/checksum as a string of the hexadecimal digits (the result of + `hexdigest` method of the hashlib._hashlib.HASH object corresponding to the + provided `algorithm`). + + # Raises + TypeError/ValueError: For incorrectly provided arguments. + SymlinkRecursionError: In case the `directory` contains symbolic links that + lead to (infinite) recursion and the protocol option `allow_cyclic_links` + is `False`. + + # References + See https://github.com/andhus/dirhash/README.md for a formal + description of how the returned hash value is computed. 
+ """ + def get_instance(value, cls_, argname): + if isinstance(value, cls_): + return value + if value is None: + return cls_() + raise TypeError('{} must be an instance of {} or None'.format(argname, cls_)) + + filter_ = get_instance(filter_, Filter, 'filter_') + protocol = get_instance(protocol, Protocol, 'protocol') hasher_factory = _get_hasher_factory(algorithm) - match_filter = _get_match_filter(directory, match=match, ignore=ignore, **kwargs) - cache = {} + def dir_apply(dir_node): + if not filter_.empty_dirs: + if dir_node.path.relative == '' and dir_node.empty: + # only check if root node is empty (other empty dirs are filter + # before `dir_apply` with `filter_.empty_dirs=False`) + raise ValueError('{}: Nothing to hash'.format(directory)) + descriptor = protocol.get_descriptor(dir_node) + _dirhash = hasher_factory(descriptor.encode('utf-8')).hexdigest() + + return dir_node.path, _dirhash + + if jobs == 1: + cache = {} + + def file_apply(path): + return path, _get_filehash( + path.real, + hasher_factory, + chunk_size=chunk_size, + cache=cache + ) - if workers is not None and workers > 1: - # extract all (unique) files - _, file_realpaths = _get_leafs( - abspath=abspath, - match_filter=match_filter, - follow_links=follow_links, - include_empty=False, + _, dirhash_ = scantree( + directory, + recursion_filter=filter_, + file_apply=file_apply, + dir_apply=dir_apply, + follow_links=True, + allow_cyclic_links=protocol.allow_cyclic_links, + cache_file_apply=False, + include_empty=filter_.empty_dirs, + jobs=1 + ) + else: # multiprocessing + real_paths = set() + + def extract_real_paths(path): + real_paths.add(path.real) + return path + + root_node = scantree( + directory, + recursion_filter=filter_, + file_apply=extract_real_paths, + follow_links=True, + allow_cyclic_links=protocol.allow_cyclic_links, + cache_file_apply=False, + include_empty=filter_.empty_dirs, + jobs=1 ) + real_paths = list(real_paths) # hash files in parallel - pool = Pool(workers) - try: - file_hashes = pool.map( - partial( - _get_filehash, - hasher_factory=hasher_factory, - chunk_size=chunk_size - ), - file_realpaths - ) - finally: - pool.close() - # prepare the cache with precomputed file hashes - cache = dict(zip(file_realpaths, file_hashes)) - - dirhash = _get_dirhash( - abspath=abspath, - relpath='', - hasher_factory=hasher_factory, - content_only=content_only, - paths_only=paths_only, - chunk_size=chunk_size, - match_filter=match_filter, - follow_links=follow_links, - include_empty=include_empty, - included_leafs=[], - included_file_realpaths=set(), - visited_dirs={}, - cache=cache - ) - if dirhash is _EMPTY: - if include_empty: - return hasher_factory(_empty_dir_descriptor.encode('utf-8')).hexdigest() - else: - raise ValueError('{}: Nothing to hash'.format(directory)) + file_hashes = _parmap( + partial( + _get_filehash, + hasher_factory=hasher_factory, + chunk_size=chunk_size + ), + real_paths, + jobs=jobs + ) + # prepare the mapping with precomputed file hashes + real_path_to_hash = dict(zip(real_paths, file_hashes)) + + def file_apply(path): + return path, real_path_to_hash[path.real] + + _, dirhash_ = root_node.apply(file_apply=file_apply, dir_apply=dir_apply) - return dirhash + return dirhash_ -def get_included_paths( +def included_paths( directory, - match=None, + match=("*",), ignore=None, - follow_links=True, - include_empty=False, - **kwargs + linked_dirs=True, + linked_files=True, + empty_dirs=False, + allow_cyclic_links=False, ): """Inspect what paths are included for the corresponding arguments to 
the `dirhash.dirhash` function. # Arguments: This function accepts the following subset of the function `dirhash.dirhash` - arguments: `directory`, `match`, `ignore` `follow_links`, `include_empty`, - `ignore_extensions` and `ignore_hidden`, with the same meaning. See docs of - `dirhash.dirhash` for further details. + arguments: `directory`, `match`, `ignore`, `linked_dirs`, `linked_files`, + `empty_dirs` and `allow_cyclic_links`, *with the same interpretation*. See + docs of `dirhash.dirhash` for further details. # Returns - A sorted list of the paths ([str]) that would be included in computing the - hash of `directory` given the provided arguments. + List[str] - A sorted list of the paths that would be included when computing + the hash of the `directory` using `dirhash.dirhash` and the same arguments. """ - abspath = os.path.abspath(directory) - _verify_is_directory(abspath) - match_filter = _get_match_filter(abspath, match=match, ignore=ignore, **kwargs) - included_leafs, _ = _get_leafs( - abspath=abspath, - match_filter=match_filter, - follow_links=follow_links, - include_empty=include_empty, + filter_ = Filter( + match_patterns=get_match_patterns(match=match, ignore=ignore), + linked_dirs=linked_dirs, + linked_files=linked_files, + empty_dirs=empty_dirs ) + protocol = Protocol(allow_cyclic_links=allow_cyclic_links) - return sorted(included_leafs) - + leafpaths = scantree( + directory, + recursion_filter=filter_, + follow_links=True, + allow_cyclic_links=protocol.allow_cyclic_links, + include_empty=filter_.empty_dirs + ).leafpaths() -def _get_leafs( - abspath, - match_filter, - follow_links=True, - include_empty=False, -): - """An inexpensive "dry-run" of the `_get_dirhash` function to get the leaf-paths - that will be included in computing the hash. - """ - included_leafs = [] - included_file_realpaths = set() - _get_dirhash( - abspath=abspath, - relpath='', - hasher_factory=_PlaceHolderHasher, # avoid computing any hash - content_only=False, - paths_only=True, # avoid opening files! - chunk_size=None, # never used - match_filter=match_filter, - follow_links=follow_links, - include_empty=include_empty, - included_leafs=included_leafs, - included_file_realpaths=included_file_realpaths, - visited_dirs={} - ) - return included_leafs, included_file_realpaths + return [ + path.relative if path.is_file() else os.path.join(path.relative, '.') + for path in leafpaths + ] -_null_chr = '\000' -_component_separator = _null_chr -_descriptor_separator = _null_chr * 2 -_dirs_files_separator = _null_chr * 3 -_empty_dir_descriptor = _dirs_files_separator +class Filter(RecursionFilter): + """Specification of what files and directories to include for the `dirhash` + computation. + # Arguments + match: Iterable[str] - An iterable of glob/wildcard (".gitignore style") + match patterns for selection of which files and directories to include. + Paths *relative to the root `directory`* (i.e. excluding the name of the + root directory itself) are matched against the provided patterns. For + example, to include all files, except for hidden ones use: + `match=['*', '!.*']` Default `None` which is equivalent to `['*']`, + i.e. everything is included. + linked_dirs: bool - If `True` (default), follow symbolic links to other + *directories* and include these and their content in the hash + computation. + linked_files: bool - If `True` (default), include symbolic linked files in + the hash computation. + empty_dirs: bool - If `True`, include empty directories when computing the + hash. 
A directory is considered empty if it does not contain any files + that *match the provided matching criteria*. Default `False`, i.e. empty + directories are ignored (as is done in git version control). + """ + def __init__( + self, + match_patterns=None, + linked_dirs=True, + linked_files=True, + empty_dirs=False + ): + super(Filter, self).__init__( + linked_dirs=linked_dirs, + linked_files=linked_files, + match=match_patterns + ) + self.empty_dirs = empty_dirs -def _verify_is_directory(directory): - if not os.path.exists(directory): - raise ValueError('{}: No such directory'.format(directory)) - if not os.path.isdir(directory): - raise ValueError('{}: Is not a directory'.format(directory)) +def get_match_patterns( + match=None, + ignore=None, + ignore_extensions=None, + ignore_hidden=False, +): + """Helper to compose a list of glob/wildcard (".gitignore style") match + patterns based on options dedicated to a few standard use-cases. -def _get_match_filter(dir_abspath, ignore, **kwargs): - """Helper to construct a function for filtering of paths. + # Arguments + match: Optional[List[str]] - A list of match-patterns for files to *include*. + Default `None` which is equivalent to `['*']`, i.e. everything is + included (unless excluded by arguments below). + ignore: Optional[List[str]] - A list of match-patterns for files to + *ignore*. Default `None` (no ignore patterns). + ignore_extensions: Optional[List[str]] - A list of file extensions to + ignore. Short for `ignore=['*.<extension>', ...]`. Default `None` (no + extensions ignored). + ignore_hidden: bool - If `True`, ignore hidden files and directories. Short + for `ignore=['.*', '.*/']`. Default `False`. """ + match = ['*'] if match is None else list(match) ignore = [] if ignore is None else list(ignore) - ignore = _parse_ignorefile(dir_abspath) + ignore - - match_spec = _get_match_spec(ignore=ignore, **kwargs) - path_spec = PathSpec.from_lines(GitWildMatchPattern, match_spec) + ignore_extensions = [] if ignore_extensions is None else list(ignore_extensions) - return path_spec.match_files + if ignore_hidden: + ignore.extend(['.*', '.*/']) + for ext in ignore_extensions: + if not ext.startswith('.'): + ext = '.' + ext + ext = '*' + ext + ignore.append(ext) -def _get_dirhash(abspath, *args, **kwargs): - """Entry point of the recursive dirhash implementation, with the only purpose to - provide a more informative error in case of (infinite) symlink recursion. - """ - try: - return _get_dirhash_recursive(os.path.realpath(abspath), *args, **kwargs) - except SymlinkRecursionError as e: - raise SymlinkRecursionError( - real_path=e.real_path, - first_path=os.path.join(abspath, e.first_path), - second_path=os.path.join(abspath, e.second_path) - ) + match_spec = match + ['!' + ign for ign in ignore] + def deduplicate(items): + items_set = set([]) + dd_items = [] + for item in items: + if item not in items_set: + dd_items.append(item) + items_set.add(item) -def _get_dirhash_recursive( - realpath, - relpath, - hasher_factory, - content_only, - paths_only, - chunk_size, - match_filter, - include_empty, - follow_links, - included_leafs, - included_file_realpaths, - visited_dirs, - cache=None -): - """Recursive implementation for computing the hash of a directory based on its - structure and content. # Arguments - realpath (str): Real path to the current directory to hash. - relpath (str): Path to the current directory relative to the root directory - (entry point) for the recursion.
- hasher_factory (f: f() -> hashlib._hashlib.HASH): Callable that returns a - instance of the hashlib._hashlib.HASH interface. - match_filter (f: f(str) -> bool): Function for filtering leaf paths (files - and possibly empty directories) to include. - included_leafs ([str]): Continuously appended list of leaf paths (files - and possibly empty directories) that are included. Used by - `dirhash.get_included_paths`. - included_file_realpaths ({str}): Continuously updated set of the real paths - to included files. Used by `dirhash.dirhash` when files are hashed using - multiprocessing. - visited_dirs ({str: str}): Mapping from real path to root-relative path of - directories visited previously in *current branch* of recursion. Used to - detect if there are symlinks leading to infinite recursion. - cache ({str: str}): Mapping from real file path to hash value of already - hashed files. Used to avoid duplicating hash computations in the case of - repeated occurrence of files by symlinks, as well as to inject - precomputed hashes by the multiprocessing implementation - - For args: `content_only`, `paths_only`, `chunk_size`, `include_empty` and - `follow_links` see docs of `dirhash.dirhash`. + return deduplicate(match_spec) - # Raises - SymlinkRecursionError: in case the current directory has already been - visited in current branch of recursion (i.e. would get infinite recursion - if continuing). - # Side-effects - Continuously updates arguments: `included_leafs`, `included_file_realpaths`, - `visited_dirs` and `cache`. +class Protocol(object): + """Specifications of which file and directory properties to consider when + computing the `dirhash` value. - # Returns - The hash/checksum as a string the of hexadecimal digits of the current - `directory` or `hahsdir._EMPTY` if there are no files or directories to - include. + # Arguments + entry_properties: Iterable[str] - A combination of the supported properties + {"name", "data", "is_link"} where at least one of "name" and "data" is + included. Interpretation: + - ["name", "data"] (Default) - The name as well as data is included. Due + to the recursive nature of the dirhash computation, "name" implies + that the path relative to the root `directory` of each file/directory + affects the computed hash value. + - ["data"] - Compute the hash only based on the data of files - + *not* their names or the names of their parent directories. NOTE that + the tree structure in which files are organized under the `directory` + root still influences the computed hash. As long as all files have + the same content and are organised the same way in relation to all + other files in the Directed Acyclic Graph representing the file-tree, + the hash will remain the same (but the "name of nodes" does not + matter). This option can e.g. be used to verify that data is + unchanged after renaming files (change extensions etc.). + - ["name"] - Compute the hash only based on the name and location of + files in the file tree under the `directory` root. This option can + e.g. be used to check if any files have been added/moved/removed, + ignoring the content of each file. + - "is_link" - if this option is added to any of the cases above the + hash value is also affected by whether a file or directory is a + symbolic link or not. NOTE: with this property added, the hash + will be different than without it even if there are no symbolic links + in the directory.
+ allow_cyclic_links: bool - If `False` (default) a `SymlinkRecursionError` is + raised on presence of cyclic symbolic links. If set to `True` the the + dirhash value for directory causing the cyclic link is replaced with the + hash function hexdigest of the relative path from the link to the target. """ - fwd_kwargs = vars() - del fwd_kwargs['realpath'] - del fwd_kwargs['relpath'] - - if follow_links: - if realpath in visited_dirs: - raise SymlinkRecursionError( - real_path=realpath, - # below will be replaced by full abspath in `_get_dirhash` - first_path=visited_dirs[realpath], - second_path=relpath - ) - visited_dirs[realpath] = relpath - - subdirs, files = [], [] - symlink_files = set() - for dir_entry in scandir(realpath): - if dir_entry.is_dir(follow_symlinks=follow_links): - subdirs.append(dir_entry) - elif dir_entry.is_file(follow_symlinks=True): - files.append(dir_entry) - if dir_entry.is_symlink(): - symlink_files.add(dir_entry.name) - - subdir_descriptors = [] - for subdir in subdirs: - if subdir.is_symlink(): - sub_realpath = os.path.realpath(subdir.path) - else: - sub_realpath = subdir.path - sub_relpath = os.path.join(relpath, subdir.name) - sub_dirhash = _get_dirhash_recursive(sub_realpath, sub_relpath, **fwd_kwargs) - if sub_dirhash is _EMPTY: - if not include_empty: - continue - if next(match_filter([sub_relpath]), None) is None: - # dir is not a match - continue - # included empty (leaf) directories represented as `path/to/directory/.` - included_leafs.append(os.path.join(sub_relpath, '.')) - sub_dirhash = hasher_factory( - _empty_dir_descriptor.encode('utf-8') - ).hexdigest() - - if content_only: - subdir_descriptor = sub_dirhash - else: - subdir_descriptor = _component_separator.join([sub_dirhash, subdir.name]) - subdir_descriptors.append(subdir_descriptor) - - subdirs_descriptor = _descriptor_separator.join(sorted(subdir_descriptors)) - - file_descriptors = [] - for file_relpath in match_filter( - os.path.join(relpath, file_.name) for file_ in files + class EntryProperties(object): + NAME = 'name' + DATA = 'data' + IS_LINK = 'is_link' + options = {NAME, DATA, IS_LINK} + _DIRHASH = 'dirhash' + + _entry_property_separator = '\000' + _entry_descriptor_separator = '\000\000' + + def __init__( + self, + entry_properties=('name', 'data'), + allow_cyclic_links=False ): - filename = os.path.basename(file_relpath) - file_realpath = os.path.join(realpath, filename) - if filename in symlink_files: - file_realpath = os.path.realpath(file_realpath) - included_leafs.append(file_relpath) - included_file_realpaths.add(file_realpath) + entry_properties = set(entry_properties) + if not entry_properties.issubset(self.EntryProperties.options): + raise ValueError( + 'entry properties {} not supported'.format( + entry_properties - self.EntryProperties.options) + ) + if not ( + self.EntryProperties.NAME in entry_properties or + self.EntryProperties.DATA in entry_properties + ): + raise ValueError( + 'at least one of entry properties `name` and `data` must be used' + ) + self.entry_properties = entry_properties + self._include_name = self.EntryProperties.NAME in entry_properties + self._include_data = self.EntryProperties.DATA in entry_properties + self._include_is_link = self.EntryProperties.IS_LINK in entry_properties + + if not isinstance(allow_cyclic_links, bool): + raise ValueError( + 'allow_cyclic_link must be a boolean, ' + 'got {}'.format(allow_cyclic_links) + ) + self.allow_cyclic_links = allow_cyclic_links + + def get_descriptor(self, dir_node): + if isinstance(dir_node, 
CyclicLinkedDir): + return self._get_cyclic_linked_dir_descriptor(dir_node) + + entries = dir_node.directories + dir_node.files + entry_descriptors = [ + self._get_entry_descriptor( + self._get_entry_properties(path, entry_hash) + ) for path, entry_hash in entries + ] + return self._entry_descriptor_separator.join(sorted(entry_descriptors)) + + @classmethod + def _get_entry_descriptor(cls, entry_properties): + entry_strings = [ + '{}:{}'.format(name, value) + for name, value in entry_properties + ] + return cls._entry_property_separator.join(sorted(entry_strings)) + + def _get_entry_properties(self, path, entry_hash): + properties = [] + if path.is_dir(): + properties.append((self.EntryProperties._DIRHASH, entry_hash)) + elif self._include_data: # path is file + properties.append((self.EntryProperties.DATA, entry_hash)) + + if self._include_name: + properties.append((self.EntryProperties.NAME, path.name)) + if self._include_is_link: + properties.append((self.EntryProperties.IS_LINK, path.is_symlink)) + + return properties + + def _get_cyclic_linked_dir_descriptor(self, dir_node): + relpath = dir_node.path.relative + target_relpath = dir_node.target_path.relative + path_to_target = os.path.relpath( + # the extra '.' is needed if link back to root, because + # an empty path ('') is not supported by os.path.relpath + os.path.join('.', target_relpath), + os.path.join('.', relpath) + ) + # TODO normalize posix! + return path_to_target - if paths_only: - file_descriptors.append(filename) - continue - filehash = _get_filehash(file_realpath, hasher_factory, chunk_size, cache) +def _get_hasher_factory(algorithm): + """Returns a "factory" of hasher instances corresponding to the given algorithm + name. Bypasses input argument `algorithm` if it is already a hasher factory + (verified by attempting calls to required methods). + """ + if algorithm in algorithms_guaranteed: + return getattr(hashlib, algorithm) - if content_only: - file_descriptors.append(filehash) - else: - file_descriptors.append(_component_separator.join([filehash, filename])) + if algorithm in algorithms_available: + return partial(hashlib.new, algorithm) - files_descriptor = _descriptor_separator.join(sorted(file_descriptors)) + try: # bypass algorithm if already a hasher factory + hasher = algorithm(b'') + hasher.update(b'') + hasher.hexdigest() + return algorithm + except: + pass - is_empty = (subdirs_descriptor == '' and files_descriptor == '') - if is_empty: - return _EMPTY + raise ValueError( + '`algorithm` must be one of: {}`'.format(algorithms_available)) - descriptor = ''.join( - [subdirs_descriptor, _dirs_files_separator, files_descriptor] - ) - dirhash = hasher_factory(descriptor.encode('utf-8')).hexdigest() +def _parmap(func, iterable, jobs=1): + """Map with multiprocessing.Pool""" + if jobs == 1: + return [func(element) for element in iterable] - if follow_links: - del visited_dirs[realpath] + pool = Pool(jobs) + try: + results = pool.map(func, iterable) + finally: + pool.close() - return dirhash + return results def _get_filehash(filepath, hasher_factory, chunk_size, cache=None): - """Compute the hash for given filepath. + """Compute the hash of the given filepath. # Arguments - filepath (str): Path to the file to hash. - hasher_factory (f: f() -> hashlib._hashlib.HASH): Callable that returns an + filepath: str - Path to the file to hash. + hasher_factory: (f: f() -> hashlib._hashlib.HASH): Callable that returns an instance of the `hashlib._hashlib.HASH` interface. 
chunk_size (int): The number of bytes to read in one go from files while being hashed. @@ -479,111 +618,3 @@ def _get_filehash(filepath, hasher_factory, chunk_size, cache=None): hasher.update(chunk) return hasher.hexdigest() - - -class SymlinkRecursionError(_RecursionError): - """Raised when symlinks cause a cyclic graph of directories. - - Extends the `pathspec.util.RecursionError` but with a different name (avoid - overriding the built-in error!) and with a more informative string representation - (used in `dirhash.cli`). - """ - def __str__(self): - # _RecursionError.__str__ prints args without context - return 'Symlink recursion: {}'.format(self.message) - - -class _Empty(object): - """The single instance of this class, `_EMPTY` below, is used as return value for - `_get_dirhash_recursive` in the case of an empty directory. - """ - pass - - -_EMPTY = _Empty() - - -def _get_hasher_factory(algorithm): - """Returns a "factory" of hasher instances corresponding to the given algorithm - name. Bypasses input argument `algorithm` if it is already a hasher factory - (verified by attempting calls to required methods). - """ - if algorithm in algorithms_guaranteed: - return getattr(hashlib, algorithm) - - if algorithm in algorithms_available: - return partial(hashlib.new, algorithm) - - try: # bypass algorithm if already a hasher factory - hasher = algorithm(b'') - hasher.update(b'') - hasher.hexdigest() - return algorithm - except: - pass - - raise ValueError( - '`algorithm` must be one of: {}`'.format(algorithms_available)) - - -class _PlaceHolderHasher(object): - """A hasher that does nothing and always returns an empty string. - - Used in the `_get_leafs` "dry-run" of the `_get_dirhash_recursive` function. - """ - - def __init__(self, *args, **kwargs): - pass - - def hexdigest(self): - return '' - - -def _get_match_spec( - match=None, - ignore=None, - ignore_extensions=None, - ignore_hidden=False, -): - """Combines the different arguments for providing match/ignore-patterns into a - single list of match-patterns. - """ - match = ['*'] if match is None else list(match) - ignore = [] if ignore is None else list(ignore) - ignore_extensions = [] if ignore_extensions is None else list(ignore_extensions) - - if ignore_hidden: - ignore.extend(['.*', '.*/']) - - for ext in ignore_extensions: - if not ext.startswith('.'): - ext = '.' + ext - ext = '*' + ext - ignore.append(ext) - - match_spec = match + ['!' 
+ ign for ign in ignore] - - def deduplicate(items): - items_set = set([]) - dd_items = [] - for item in items: - if item not in items_set: - dd_items.append(item) - items_set.add(item) - - return dd_items - - return deduplicate(match_spec) - - -def _parse_ignorefile(directory): - """Parse ignore file in `directory` (if exists) and return a list of ignore - patterns.""" - ignorefilepath = os.path.join(directory, ignorefilename) - if not os.path.exists(ignorefilepath): - return [] - - with open(ignorefilepath) as f: - ignore = [p for p in f.read().splitlines() if not p.startswith('#')] - - return ignore diff --git a/src/dirhash/cli.py b/src/dirhash/cli.py index 80fcb02..06e4044 100644 --- a/src/dirhash/cli.py +++ b/src/dirhash/cli.py @@ -3,7 +3,6 @@ """ from __future__ import print_function -import os import sys import argparse @@ -11,7 +10,25 @@ def main(): - parser = argparse.ArgumentParser(description='Determine the hash for directory.') + try: + kwargs = get_kwargs(sys.argv[1:]) + if kwargs.pop('list'): + # kwargs below have no effect when listing + for k in ['algorithm', 'chunk_size', 'jobs', 'entry_properties']: + kwargs.pop(k) + for leafpath in dirhash.included_paths(**kwargs): + print(leafpath) + else: + print(dirhash.dirhash(**kwargs)) + except Exception as e: # pragma: no cover (not picked up by coverage) + sys.stderr.write('dirhash: {}\n'.format(e)) + sys.exit(1) + + +def get_kwargs(args): + parser = argparse.ArgumentParser( + description='Determine the hash for a directory.' + ) parser.add_argument( '-v', '--version', action='version', @@ -26,146 +43,137 @@ def main(): choices=dirhash.algorithms_available, default='md5', help=( - 'Hashing algorithm to use. Always available: {}. Additionally available ' - 'on current platform: {}. Note that the same algorithm may appear ' - 'multiple times in this set under different names (thanks to ' - 'OpenSSL) [https://docs.python.org/2/library/hashlib.html]'.format( + 'Hashing algorithm to use, by default "md5". Always available: {}. ' + 'Additionally available on current platform: {}. Note that the same ' + 'algorithm may appear multiple times in this set under different names ' + '(thanks to OpenSSL) ' + '[https://docs.python.org/2/library/hashlib.html]'.format( sorted(dirhash.algorithms_guaranteed), sorted(dirhash.algorithms_available - dirhash.algorithms_guaranteed) ) ), metavar='' ) - parser.add_argument( + + filter_options = parser.add_argument_group( + title='Filtering options', + description=( + 'Specify what files and directories to include. All files and ' + 'directories (including symbolic links) are included by default. The ' + '--match/--ignore arguments allows for selection using glob/wildcard ' + '(".gitignore style") path matching. Paths relative to the root ' + '`directory` (i.e. excluding the name of the root directory itself) are ' + 'matched against the provided patterns. For example, to only include ' + 'python source files, use: `dirhash path/to/dir -m "*.py"` or to ' + 'exclude hidden files and directories use: ' + '`dirhash path/to.dir -i ".*" ".*/"` which is short for ' + '`dirhash path/to.dir -m "*" "!.*" "!.*/"`. By adding the --list ' + 'argument, all included paths, for the given filtering arguments, are ' + 'returned instead of the hash value. For further details see ' + 'https://github.com/andhus/dirhash/README.md#filtering' + ) + ) + filter_options.add_argument( '-m', '--match', - type=str, - default='*', - help='String of match-patterns, separated by blank space.' 
+ nargs='+', + default=['*'], + help=( + 'One or several patterns for paths to include. NOTE: patterns ' + 'with an asterisk must be in quotes ("*") or the asterisk ' + 'preceded by an escape character (\*).' + ), + metavar='' ) - parser.add_argument( + filter_options.add_argument( '-i', '--ignore', - type=str, + nargs='+', default=None, - help='String of ignore-patterns, separated by blank space.', + help=( + 'One or several patterns for paths to exclude. NOTE: patterns ' + 'with an asterisk must be in quotes ("*") or the asterisk ' + 'preceded by an escape character (\*).' + ), + metavar='' ) - parser.add_argument( - '-d', '--ignore-hidden', + filter_options.add_argument( + '--empty-dirs', action='store_true', default=False, - help='Ignore hidden ("dot") files and directories (short for ' - '`-ignore ".*, "`).' + help='Include empty directories (containing no files that meet the matching ' + 'criteria and no non-empty sub directories).' ) - parser.add_argument( - '-x', '--ignore-extensions', + filter_options.add_argument( + '--no-linked-dirs', + dest='linked_dirs', + action='store_false', + help='Do not include symbolic links to other directories.' + ) + filter_options.add_argument( + '--no-linked-files', + dest='linked_files', + action='store_false', + help='Do not include symbolic links to files.' + ) + parser.set_defaults(linked_dirs=True, linked_files=True) + + protocol_options = parser.add_argument_group( + title='Protocol options', + description=( + 'Specify what properties of files and directories to include and ' + 'whether to allow cyclic links. For further details see ' + 'https://github.com/andhus/dirhash/DIRHASH_STANDARD.md#protocol' + ) + ) + protocol_options.add_argument( + '-p', '--properties', nargs='+', - help='List of file extensions to ignore.', + dest='entry_properties', + default=['data', 'name'], + help=( + 'List of file/directory properties to include in the hash. Available ' + 'properties are: {} and at least one of name and data must be ' + 'included. Default is [data name] which means that both the name/paths' + ' and content (actual data) of files and directories will be included' + ).format(list(dirhash.Protocol.EntryProperties.options)), metavar='' ) - - target_group = parser.add_mutually_exclusive_group(required=False) - target_group.add_argument( - '-c', '--content-only', - action='store_true', + protocol_options.add_argument( + '-c', '--allow-cyclic-links', default=False, - help='Hash only the content of files, not the name and location of files ' - 'within the directory. NOTE (!) the hash will be different if the ' - '(alpha numerical) order of file paths changes.' - ) - target_group.add_argument( - '-p', '--paths-only', action='store_true', - default=False, - help='Hash only the file paths, i.e. the name and location of files ' - 'within the directory.' + help=( + 'Allow presence of cyclic links (by hashing the relative path to the ' + 'target directory).' + ) ) - parser.add_argument( - '--no-follow-links', - dest='follow_links', - action='store_false', - help='Do not follow symbolic links to other *directories*. NOTE: directly ' - 'linked files are always included.' + implementation_options = parser.add_argument_group( + title='Implementation options', + description='' ) - parser.set_defaults(follow_links=True) - parser.add_argument( - '--include-empty', - action='store_true', - default=False, - help='Include empty directories (containing no files that meet the matching ' - 'criteria). 
Note that the path to the directory itself must still meet ' - 'the matching criteria (matched as if it was a file).' - ) - parser.add_argument( + implementation_options.add_argument( '-s', '--chunk-size', default=2**20, type=int, - help='The chunk size (in bytes) for reading fo files.' + help='The chunk size (in bytes) for reading of files.' ) - parser.add_argument( - '-w', '--workers', + implementation_options.add_argument( + '-j', '--jobs', type=int, - default=1, - help='Number of workers (parallel processes) to use.' + default=1, # TODO make default number of cores? + help='Number of jobs (parallel processes) to use.' ) - parser.add_argument( + + special_options = parser.add_argument_group(title='Special options') + special_options.add_argument( '-l', '--list', action='store_true', default=False, - help='List the file paths that will be taken into account, followed by the ' - 'hash of directory structure' + help='List the file paths that will be taken into account, given the ' + 'provided filtering options.' ) - args = parser.parse_args() - - try: - kwargs = preprocess_kwargs(vars(args)) - if kwargs.pop('list'): - # kwargs below have no effect when listing - for k in [ - 'chunk_size', 'content_only', 'paths_only', 'algorithm', 'workers' - ]: - kwargs.pop(k) - for leafpath in dirhash.get_included_paths(**kwargs): - print(leafpath) - else: - print(dirhash.dirhash(**kwargs)) - except Exception as e: - sys.stderr.write('dirhash: {}\n'.format(e)) - sys.exit(1) - - -def preprocess_kwargs(kwargs): - kwargs['match'] = parse_string_arg(kwargs['match']) - kwargs['ignore'] = parse_string_arg(kwargs['ignore']) - # for consistency with `match` and `ignore`, we allow ignore_extensions to be a - # space separate string (not the recommended usages). - x = 'ignore_extensions' - if kwargs[x] is not None: - if len(kwargs[x]) == 1: - kwargs[x] = parse_string_arg(kwargs[x][0]) - else: - kwargs[x] = [] - - remote_ignorefile = os.environ.get('DIRHASH_IGNORE', None) - root_ignorefile_path = os.path.join(kwargs['directory'], dirhash.ignorefilename) - if os.path.exists(root_ignorefile_path): - kwargs['ignore'] = ( - dirhash._parse_ignorefile(kwargs['directory']) + list(kwargs['ignore'])) - elif remote_ignorefile: - if not os.path.exists(remote_ignorefile): - raise ValueError( - 'DIRHASH_IGNORE={}: No such file'.format(remote_ignorefile) - ) - with open(remote_ignorefile) as f: - kwargs['ignore'] = f.readlines() + kwargs['ignore'] - - return kwargs - - -def parse_string_arg(string_arg): - if string_arg is None or string_arg == '': - return [] - return string_arg.split(' ') + return vars(parser.parse_args(args)) if __name__ == '__main__': # pragma: no cover diff --git a/tests/test_cli.py b/tests/test_cli.py index 558f9cd..3886fb9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -66,9 +66,92 @@ def create_default_tree(tmpdir): class TestCLI(object): + @pytest.mark.parametrize( + 'argstring, non_default_kwargs', + [ + ( + '. -a md5', + {} + ), + ( + '.. -a md5', + {'directory': '..'} + ), + ( + 'target-dir -a md5', + {'directory': 'target-dir'} + ), + ( + '. -a sha256', + {'algorithm': 'sha256'} + ), + # Filtering options + ( + '. -a md5 -m "*" "!.*"', + {'match': ['*', '!.*']} + ), + ( + '. -a md5 --match "d1/*" "d2/*" --ignore "*.txt"', + {'match': ['d1/*', 'd2/*'], 'ignore': ['*.txt']} + ), + ( + '. -a md5 --empty-dirs', + {'empty_dirs': True} + ), + ( + '. -a md5 --no-linked-dirs', + {'linked_dirs': False} + ), + ( + '. 
-a md5 --no-linked-files', + {'linked_files': False} + ), + # Protocol options + ( + '. -a md5 --allow-cyclic-links', + {'allow_cyclic_links': True} + + ), + ( + '. -a md5 --properties name', + {'entry_properties': ['name']} - def test_preprocess_kwargs(self): - pass + ), + ( + '. -a md5 --properties name data', + {'entry_properties': ['name', 'data']} + + ), + # Implementation + ( + '. -a md5 -j 10', + {'jobs': 10} + ), + ( + '. -a md5 -s 32000', + {'chunk_size': 32000} + ), + ] + ) + def test_get_kwargs(self, argstring, non_default_kwargs): + from dirhash.cli import get_kwargs + kwargs_expected = { + 'list': False, + 'directory': '.', + 'algorithm': 'md5', + 'match': ['*'], + 'ignore': None, + 'empty_dirs': False, + 'linked_dirs': True, + 'linked_files': True, + 'entry_properties': ['data', 'name'], + 'allow_cyclic_links': False, + 'chunk_size': 2 ** 20, + 'jobs': 1 + } + kwargs_expected.update(non_default_kwargs) + kwargs = get_kwargs(shlex.split(argstring)) + assert kwargs == kwargs_expected @pytest.mark.parametrize( 'description, argstrings, output', @@ -78,9 +161,8 @@ def test_preprocess_kwargs(self): '. --list', '. -a md5 --list', '. -a sha256 --list', - '. --content-only --list', - '. --paths-only --list', - '. --workers 2 --list', + '. --properties name --list', + '. --jobs 2 --list', '. --chunk-size 2 --list'], ('.dir/file\n' '.file\n' @@ -89,37 +171,30 @@ def test_preprocess_kwargs(self): 'file.ext1\n' 'file.ext2\n')), ('IGNORE EXTENSION', - ['. -x .ext1 --list', - '. --ignore-extensions .ext1 --list', - '. -i "*.ext1" --list', + ['. -i "*.ext1" --list', '. --ignore "*.ext1" --list', - '. -m "* !*.ext1" --list', - '. --match "* !*.ext1" --list'], + '. -m "*" "!*.ext1" --list', + '. --match "*" "!*.ext1" --list'], ('.dir/file\n' '.file\n' 'dir/file\n' 'file\n' 'file.ext2\n')), ('IGNORE MULTIPLE EXTENSIONS', - ['. -x .ext1 .ext2 --list', - '. -x ".ext1 .ext2" --list', - '. --ignore-extensions .ext1 .ext2 --list', - '. -i "*.ext1 *.ext2" --list', + ['. -i "*.ext1" "*.ext2" --list', '. -i "*.ext*" --list'], ('.dir/file\n' '.file\n' 'dir/file\n' 'file\n')), ('IGNORE HIDDEN', - ['. -d --list', - '. --ignore-hidden --list', - '. -i ".* .*/" --list'], + ['. -i ".*" ".*/" --list'], ('dir/file\n' 'file\n' 'file.ext1\n' 'file.ext2\n')), ('INCLUDE EMPTY', - ['. --include-empty --list'], + ['. --empty-dirs --list'], ('.dir/file\n' '.file\n' 'dir/file\n' @@ -138,104 +213,20 @@ def test_list(self, description, argstrings, output, tmpdir): assert error == '' assert o == output - def test_root_dirhashignore(self, tmpdir): - create_default_tree(tmpdir) - with tmpdir.as_cwd(): - output, error, returncode = dirhash_run('. --list') - assert returncode == 0 - assert error == '' - assert output == ( - '.dir/file\n' - '.file\n' - 'dir/file\n' - 'file\n' - 'file.ext1\n' - 'file.ext2\n' - ) - - tmpdir.join(dirhash.ignorefilename).write('*.ext*') - with tmpdir.as_cwd(): - output, error, returncode = dirhash_run('. --list') - assert returncode == 0 - assert error == '' - assert output == ( - '.dir/file\n' - '.dirhashignore\n' - '.file\n' - 'dir/file\n' - 'file\n' - ) - - tmpdir.join(dirhash.ignorefilename).write('*.ext*\n#comment\n.*/\n') - with tmpdir.as_cwd(): - output, error, returncode = dirhash_run('. 
--list') - assert returncode == 0 - assert error == '' - assert output == ( - '.dirhashignore\n' - '.file\n' - 'dir/file\n' - 'file\n' - ) - - def test_remote_dirhashignore(self, tmpdir): - rootdir = tmpdir.mkdir('root') - create_default_tree(rootdir) - remote_dirhashignore = tmpdir.join('my_hashignore') - remote_dirhashignore.write('*.ext*\n#comment\n.*/\n') - - with rootdir.as_cwd(): - output, error, returncode = dirhash_run('. --list') - assert returncode == 0 - assert error == '' - assert output == ( - '.dir/file\n' - '.file\n' - 'dir/file\n' - 'file\n' - 'file.ext1\n' - 'file.ext2\n' - ) - - with rootdir.as_cwd(): - output, error, returncode = dirhash_run( - '. --list', add_env={'DIRHASH_IGNORE': str(remote_dirhashignore)} - ) - assert returncode == 0 - assert error == '' - assert output == ( - '.file\n' - 'dir/file\n' - 'file\n' - ) - - def test_error_on_remote_dirhashignore_does_not_exist(self, tmpdir): - rootdir = tmpdir.mkdir('root') - create_default_tree(rootdir) - remote_dirhashignore = tmpdir.join('non_existing_hashignore') - with rootdir.as_cwd(): - output, error, returncode = dirhash_run( - '. --list', add_env={'DIRHASH_IGNORE': str(remote_dirhashignore)} - ) - assert returncode == 1 - assert error.startswith('dirhash: DIRHASH_IGNORE=') - assert error.endswith(': No such file\n') - assert output == '' - @pytest.mark.parametrize( 'argstring, kwargs, expected_hashes', [ ('. -a md5', {'algorithm': 'md5'}, - ['e0d03dd48ab90d232ffabc0da9f08745', - 'fd1cc95ac2207c3f7d72c18fe01c675e', - '0e4a5d4f8c1e4fda174a04c5693c6ea1'] + ['594c48dde0776b03eddeeb0232190be7', + 'd8ab965636d48e407b73b9dbba4cb928', + '050e7bc9ffcb09c15186c04e0f8026df'] ), ('. -a sha256', {'algorithm': 'sha256'}, - ['f25c5dd69d60c1f127481407829c23e2be87df9d28d3c3e9d353b68cd4f7462d', - 'd444e19712ed1e318917b73a3623b9360e8489854d65586d3b74a6894e980b42', - '8ab8e97f1bca5491c355c22f5f0236079f774e5d19454020d76becaf0c03c346']), + ['23a04964149889e932ba3348fe22442f4f6a3b3fec616a386a70579ee857ab7b', + '7b76bac43e963f9561f37b96b92d7a174094bff230c6efbf1d8bf650e8b40b7a', + '7156da2b2e5a2926eb4b72e65f389343cb6aca0578f0aedcd6f7457abd67d8f5']), ] ) def test_hash_result(self, argstring, kwargs, expected_hashes, tmpdir): @@ -244,8 +235,12 @@ def test_hash_result(self, argstring, kwargs, expected_hashes, tmpdir): create_default_tree(tmpdir) with tmpdir.as_cwd(): for add_argstring, add_kwargs, expected_hash in zip( - ['', ' --content-only', ' --paths-only'], - [{}, {'content_only': True}, {'paths_only': True}], + ['', ' -p data', ' -p name'], + [ + {}, + {'entry_properties': ['data']}, + {'entry_properties': ['name']}, + ], expected_hashes ): # run CLI @@ -257,7 +252,7 @@ def test_hash_result(self, argstring, kwargs, expected_hashes, tmpdir): cli_hash = cli_out[:-1] # run CLI multiproc - full_argstring_mp = argstring + add_argstring + ' --workers 2' + full_argstring_mp = argstring + add_argstring + ' --jobs 2' cli_out_mp, error_mp, returncode_mp = dirhash_run(full_argstring_mp) assert error_mp == '' assert returncode_mp == 0 @@ -270,3 +265,9 @@ def test_hash_result(self, argstring, kwargs, expected_hashes, tmpdir): lib_hash = dirhash.dirhash(str(tmpdir), **full_kwargs) assert cli_hash == cli_hash_mp == lib_hash == expected_hash + + def test_error_bad_argument(self, tmpdir): + with tmpdir.as_cwd(): + o, error, returncode = dirhash_run('. 
--chunk-size not_an_int') + assert returncode > 0 + assert error != '' diff --git a/tests/test_dirhash.py b/tests/test_dirhash.py index 2e9afba..0111d78 100644 --- a/tests/test_dirhash.py +++ b/tests/test_dirhash.py @@ -7,17 +7,20 @@ from time import sleep, time import pytest -from pathspec import RecursionError from dirhash import ( _get_hasher_factory, - _get_match_spec, - get_included_paths, + get_match_patterns, + included_paths, dirhash, algorithms_available, algorithms_guaranteed, - _empty_dir_descriptor + Protocol, + _parmap, + Filter, + dirhash_impl ) +from scantree import SymlinkRecursionError class TestGetHasherFactory(object): @@ -75,55 +78,55 @@ def hexdigest(self): assert hasher_factory is MockHasher -class TestGetMatchSpec(object): +class TestGetMatchPatterns(object): def test_default_match_all(self): - ms = _get_match_spec() + ms = get_match_patterns() assert ms == ['*'] def test_only_match(self): - ms = _get_match_spec(match=['a*', 'b*']) + ms = get_match_patterns(match=['a*', 'b*']) assert ms == ['a*', 'b*'] def test_only_ignore(self): - ms = _get_match_spec(ignore=['a*', 'b*']) + ms = get_match_patterns(ignore=['a*', 'b*']) assert ms == ['*', '!a*', '!b*'] def test_match_and_ignore(self): - ms = _get_match_spec(match=['a*'], ignore=['*.ext']) + ms = get_match_patterns(match=['a*'], ignore=['*.ext']) assert ms == ['a*', '!*.ext'] def test_ignore_hidden(self): - ms = _get_match_spec(ignore_hidden=True) + ms = get_match_patterns(ignore_hidden=True) assert ms == ['*', '!.*', '!.*/'] # should not duplicate if present in (general) ignore - ms = _get_match_spec(ignore=['.*'], ignore_hidden=True) + ms = get_match_patterns(ignore=['.*'], ignore_hidden=True) assert ms == ['*', '!.*', '!.*/'] - ms = _get_match_spec(ignore=['.*/'], ignore_hidden=True) + ms = get_match_patterns(ignore=['.*/'], ignore_hidden=True) assert ms == ['*', '!.*/', '!.*'] - ms = _get_match_spec(ignore=['.*', '.*/'], ignore_hidden=True) + ms = get_match_patterns(ignore=['.*', '.*/'], ignore_hidden=True) assert ms == ['*', '!.*', '!.*/'] def test_ignore_extensions(self): - ms = _get_match_spec(ignore_extensions=['.ext']) + ms = get_match_patterns(ignore_extensions=['.ext']) assert ms == ['*', '!*.ext'] # automatically adds '.' 
- ms = _get_match_spec(ignore_extensions=['ext']) + ms = get_match_patterns(ignore_extensions=['ext']) assert ms == ['*', '!*.ext'] # mixed also works - ms = _get_match_spec(ignore_extensions=['ext1', '.ext2']) + ms = get_match_patterns(ignore_extensions=['ext1', '.ext2']) assert ms == ['*', '!*.ext1', '!*.ext2'] # should not duplicate if present in (general) ignore - ms = _get_match_spec(ignore=['*.ext'], ignore_extensions=['.ext']) + ms = get_match_patterns(ignore=['*.ext'], ignore_extensions=['.ext']) assert ms == ['*', '!*.ext'] - ms = _get_match_spec(ignore=['*.ext'], ignore_extensions=['ext']) + ms = get_match_patterns(ignore=['*.ext'], ignore_extensions=['ext']) assert ms == ['*', '!*.ext'] @@ -169,11 +172,11 @@ def test_basic(self): self.mkfile('root/d2/f1') expected_filepaths = ['d1/d11/f1', 'd1/f1', 'd2/f1', 'f1'] - filepaths = get_included_paths(self.path_to('root')) + filepaths = included_paths(self.path_to('root')) assert filepaths == expected_filepaths # end with '/' or not should not matter - filepaths = get_included_paths(self.path_to('root/')) + filepaths = included_paths(self.path_to('root/')) assert filepaths == expected_filepaths def test_not_a_directory(self): @@ -181,9 +184,9 @@ def test_not_a_directory(self): self.mkfile('root/f1') # does not exist with pytest.raises(ValueError): - get_included_paths(self.path_to('wrong_root')) + included_paths(self.path_to('wrong_root')) with pytest.raises(ValueError): - get_included_paths(self.path_to('root/f1')) + included_paths(self.path_to('root/f1')) def test_symlinked_file(self): self.mkdirs('root') @@ -191,12 +194,20 @@ def test_symlinked_file(self): self.mkfile('linked_file') self.symlink('linked_file', 'root/f2') - # NOTE `follow_links` hash no effect if only the file is linked (as is the - # case here), linked _files_ are always included. 
- filepaths = get_included_paths(self.path_to('root'), follow_links=False) + filepaths = included_paths( + self.path_to('root'), + linked_files=True + ) assert filepaths == ['f1', 'f2'] - filepaths = get_included_paths(self.path_to('root'), follow_links=True) + filepaths = included_paths( + self.path_to('root'), + linked_files=False + ) + assert filepaths == ['f1'] + + # default is 'linked_files': True + filepaths = included_paths(self.path_to('root'), ) assert filepaths == ['f1', 'f2'] def test_symlinked_dir(self): @@ -207,26 +218,45 @@ def test_symlinked_dir(self): self.mkfile('linked_dir/f2') self.symlink('linked_dir', 'root/d1') - filepaths = get_included_paths(self.path_to('root'), follow_links=False) + filepaths = included_paths( + self.path_to('root'), + linked_dirs=False + ) assert filepaths == ['f1'] - filepaths = get_included_paths(self.path_to('root'), follow_links=True) + filepaths = included_paths( + self.path_to('root'), + linked_dirs=True + ) assert filepaths == ['d1/f1', 'd1/f2', 'f1'] - # default is `follow_links=True` - filepaths = get_included_paths(self.path_to('root')) + # default is 'linked_dirs': True + filepaths = included_paths(self.path_to('root')) assert filepaths == ['d1/f1', 'd1/f2', 'f1'] - def test_raise_on_infinite_recursion(self): + def test_cyclic_link(self): self.mkdirs('root/d1') self.symlink('root', 'root/d1/link_back') - with pytest.raises(RecursionError) as exc_info: - get_included_paths(self.path_to('root'), follow_links=True) + with pytest.raises(SymlinkRecursionError) as exc_info: + included_paths( + self.path_to('root'), + allow_cyclic_links=False + ) assert exc_info.value.real_path == os.path.realpath(self.path_to('root')) assert exc_info.value.first_path == self.path_to('root/') assert exc_info.value.second_path == self.path_to('root/d1/link_back') assert str(exc_info.value).startswith('Symlink recursion:') + filepaths = included_paths( + self.path_to('root'), + allow_cyclic_links=True + ) + assert filepaths == ['d1/link_back/.'] + + # default is 'allow_cyclic_links': False + with pytest.raises(SymlinkRecursionError): + filepaths = included_paths(self.path_to('root')) + def test_ignore_hidden_files(self): self.mkdirs('root/d1') self.mkdirs('root/.d2') @@ -238,16 +268,20 @@ def test_ignore_hidden_files(self): self.mkfile('root/.d2/f1') # no ignore - filepaths = get_included_paths(self.path_to('root')) + filepaths = included_paths(self.path_to('root')) assert filepaths == ['.d2/f1', '.f2', 'd1/.f2', 'd1/f1', 'f1'] # with ignore - filepaths = get_included_paths(self.path_to('root'), match=['*', '!.*']) + filepaths = included_paths( + self.path_to('root'), + match=['*', '!.*'] + ) assert filepaths == ['.d2/f1', 'd1/f1', 'f1'] def test_exclude_hidden_dirs(self): self.mkdirs('root/d1') self.mkdirs('root/.d2') + self.mkdirs('root/d1/.d1') self.mkfile('root/f1') self.mkfile('root/.f2') @@ -256,11 +290,14 @@ def test_exclude_hidden_dirs(self): self.mkfile('root/.d2/f1') # no ignore - filepaths = get_included_paths(self.path_to('root')) - assert filepaths == ['.d2/f1', '.f2', 'd1/.f2', 'd1/f1', 'f1'] + filepaths = included_paths(self.path_to('root'), empty_dirs=True) + assert filepaths == ['.d2/f1', '.f2', 'd1/.d1/.', 'd1/.f2', 'd1/f1', 'f1'] # with ignore - filepaths = get_included_paths(self.path_to('root'), match=['*', '!.*/']) + filepaths = included_paths( + self.path_to('root'), + match=['*', '!.*/'] + ) assert filepaths == ['.f2', 'd1/.f2', 'd1/f1', 'f1'] def test_exclude_hidden_dirs_and_files(self): @@ -274,11 +311,11 @@ def 
test_exclude_hidden_dirs_and_files(self): self.mkfile('root/.d2/f1') # no ignore - filepaths = get_included_paths(self.path_to('root')) + filepaths = included_paths(self.path_to('root')) assert filepaths == ['.d2/f1', '.f2', 'd1/.f2', 'd1/f1', 'f1'] # using ignore - filepaths = get_included_paths( + filepaths = included_paths( self.path_to('root'), match=['*', '!.*/', '!.*'] ) @@ -298,7 +335,7 @@ def test_exclude_extensions(self): self.mkfile('root/d1/f.txt') self.mkfile('root/d1/f.skip1') - filepaths = get_included_paths( + filepaths = included_paths( self.path_to('root'), match=['*', '!*.skip1', '!*.skip2'] ) @@ -314,14 +351,20 @@ def test_empty_dirs_include_vs_exclude(self): self.mkfile('root/d1/f') self.mkfile('root/d3/d31/f') - filepaths = get_included_paths(self.path_to('root'), include_empty=False) + filepaths = included_paths( + self.path_to('root'), + empty_dirs=False + ) assert filepaths == ['d1/f', 'd3/d31/f'] # `include_empty=False` is default - filepaths = get_included_paths(self.path_to('root')) + filepaths = included_paths(self.path_to('root')) assert filepaths == ['d1/f', 'd3/d31/f'] - filepaths = get_included_paths(self.path_to('root'), include_empty=True) + filepaths = included_paths( + self.path_to('root'), + empty_dirs=True + ) assert filepaths == ['d1/f', 'd2/.', 'd3/d31/f', 'd4/d41/.'] def test_empty_dirs_because_of_filter_include_vs_exclude(self): @@ -331,63 +374,63 @@ def test_empty_dirs_because_of_filter_include_vs_exclude(self): self.mkfile('root/d1/f') self.mkfile('root/d2/.f') - filepaths = get_included_paths( + filepaths = included_paths( self.path_to('root'), match=['*', '!.*'], - include_empty=False + empty_dirs=False ) assert filepaths == ['d1/f'] # `include_empty=False` is default - filepaths = get_included_paths( + filepaths = included_paths( self.path_to('root'), match=['*', '!.*'], ) assert filepaths == ['d1/f'] - filepaths = get_included_paths( + filepaths = included_paths( self.path_to('root'), match=['*', '!.*'], - include_empty=True + empty_dirs=True ) assert filepaths == ['d1/f', 'd2/.'] - def test_empty_dir_not_included_due_to_not_match(self): + def test_empty_dir_inclusion_not_affected_by_match(self): self.mkdirs('root/d1') self.mkdirs('root/.d2') - filepaths = get_included_paths( + # NOTE that empty dirs are not excluded by match_patterns: + + filepaths = included_paths( self.path_to('root'), match=['*', '!.*'], - include_empty=True + empty_dirs=True ) - assert filepaths == ['d1/.'] + assert filepaths == ['.d2/.', 'd1/.'] - # NOTE that empty dirs are matched as is they were files (leafs!) - # TODO better option? 
- filepaths = get_included_paths( + filepaths = included_paths( self.path_to('root'), match=['*', '!.*/'], - include_empty=True + empty_dirs=True ) assert filepaths == ['.d2/.', 'd1/.'] - filepaths = get_included_paths( + filepaths = included_paths( self.path_to('root'), - match=['*', '!d1/'], - include_empty=True + match=['*', '!d1'], + empty_dirs=True ) assert filepaths == ['.d2/.', 'd1/.'] def dirhash_mp_comp(*args, **kwargs): res = dirhash(*args, **kwargs) - res_mp = dirhash(workers=2, *args, **kwargs) + res_mp = dirhash(jobs=2, *args, **kwargs) assert res == res_mp return res -class Testdirhash(TempDirTest): +class TestDirhash(TempDirTest): def test_guaranteed_algorithms(self): self.mkdirs('root/d1/d11') @@ -398,22 +441,49 @@ def test_guaranteed_algorithms(self): self.mkfile('root/d2/f1', 'd') for algorithm, expected_hash in [ - ('md5', '23315916fc3a935b5ed3e120a202aea4'), - ('sha1', '6119b22d2916a4af7032802cdb95c742a217fe9f'), - ('sha224', 'cdb3a780741c08d6c4ffc6aa0725787f6fbef3e80d81c8850215ef61'), - ('sha256', '6fa5594ea7fb6a05fd36c152e6576522' - 'a5f37b07c2d797f2ed96527ae18f3fe3'), - ('sha384', '453ebd36d95e24149f184589df49f69b' - 'f289af3e889c916cc93f0e02367f4d48' - 'aef2593ef29f0ecdf3b6e05572e90066'), - ('sha512', 'f52ac9eeeb5160637afa91f1f20f1a60' - 'ce80a55ac3757f8bb9225e10edc131b4' - '2da10497706ef4f06d36f13dae77540b' - 'c0e5484c7f79f87a83c76ae103fff4fa') + ('md5', '3c631c7f5771468a2187494f802fad8f'), + ('sha1', '992aa2d00d2ed94f0c19eff7f151f5c6a7e0cc41'), + ('sha224', '18013e1df933d5781b2eddb94aceeb7ab689643f1df24060fb478999'), + ('sha256', 'ef7e95269fbc0e3478ad31fddd1c7d08' + '907d189c61725332e8a2fd14448fe175'), + ('sha384', '64ef4360c172bc68250f9326ea231cd1' + '46a7fa1afe9d386cee0cae0e9f1b4ad2' + '1df050d1df436cff792bbe81d6698026'), + ('sha512', '7854226eb0278bc136056998890a8399' + 'f85ca383f7c54665026358d28b5dc716' + '0ec654d2bcebf5d60974f82ed820600d' + '8e807ea53d57578d076ec1c82f501208') ]: hash_value = dirhash_mp_comp(self.path_to('root'), algorithm) assert hash_value == expected_hash + def test_recursive_descriptor(self): + self.mkdirs('root/d1') + self.mkdirs('root/d2') + self.mkfile('root/f1', 'a') + self.mkfile('root/d1/f12', 'b') + + f1_desc = 'data:a\000name:f1' + f12_desc = 'data:b\000name:f12' + d1_desc = 'dirhash:{}\000name:d1'.format(f12_desc) + d2_desc = 'dirhash:\000name:d2' + + empty_dirs_false_expected = '\000\000'.join([f1_desc, d1_desc]) + empty_dirs_true_expected = '\000\000'.join([f1_desc, d2_desc, d1_desc]) + + empty_dirs_false = dirhash( + self.path_to('root'), + algorithm=IdentityHasher + ) + assert empty_dirs_false == empty_dirs_false_expected + + empty_dirs_true = dirhash( + self.path_to('root'), + algorithm=IdentityHasher, + empty_dirs=True + ) + assert empty_dirs_true == empty_dirs_true_expected + def test_symlinked_file(self): self.mkdirs('root1') self.mkfile('root1/f1', 'a') @@ -424,16 +494,20 @@ def test_symlinked_file(self): self.mkfile('root2/f1', 'a') self.mkfile('root2/f2', 'b') - root1_follow_true = dirhash_mp_comp( - self.path_to('root1'), algorithm='md5', follow_links=True) - root1_follow_false = dirhash_mp_comp( - self.path_to('root1'), algorithm='md5', follow_links=False) + root1_linked_files_true = dirhash_mp_comp( + self.path_to('root1'), algorithm='md5' + ) + root1_linked_files_false = dirhash_mp_comp( + self.path_to('root1'), algorithm='md5', + linked_files=False + ) + root2 = dirhash_mp_comp( - self.path_to('root2'), algorithm='md5') + self.path_to('root2'), algorithm='md5' + ) - # NOTE `follow_links` hash no effect if only the 
file is linked (as is the - # case here), linked _files_ are always included. - assert root1_follow_false == root1_follow_true == root2 + assert root1_linked_files_false != root1_linked_files_true + assert root1_linked_files_true == root2 def test_symlinked_dir(self): self.mkdirs('root1') @@ -449,15 +523,22 @@ def test_symlinked_dir(self): self.mkfile('root2/d1/f1', 'b') self.mkfile('root2/d1/f2', 'c') - root1_follow_true = dirhash_mp_comp( - self.path_to('root1'), algorithm='md5', follow_links=True) - root1_follow_false = dirhash_mp_comp( - self.path_to('root1'), algorithm='md5', follow_links=False) + root1_linked_dirs_true = dirhash_mp_comp( + self.path_to('root1'), + algorithm='md5', + linked_dirs=True + ) + root1_linked_dirs_false = dirhash_mp_comp( + self.path_to('root1'), + algorithm='md5', + linked_dirs=False + ) root2 = dirhash_mp_comp( - self.path_to('root2'), algorithm='md5') + self.path_to('root2'), algorithm='md5' + ) - assert root1_follow_false != root1_follow_true - assert root1_follow_true == root2 + assert root1_linked_dirs_false != root1_linked_dirs_true + assert root1_linked_dirs_true == root2 def test_cache_used_for_symlinks(self): @@ -480,11 +561,13 @@ def test_raise_on_empty_root_without_include_empty(self): def test_empty_root_include_empty(self): self.mkdirs('root') - dirhash = dirhash_mp_comp(self.path_to('root'), 'sha256', include_empty=True) - expected_dirhash = hashlib.sha256( - _empty_dir_descriptor.encode('utf-8') - ).hexdigest() - assert dirhash == expected_dirhash + dirhash_ = dirhash_mp_comp( + self.path_to('root'), + 'sha256', + empty_dirs=True + ) + expected_dirhash = hashlib.sha256(''.encode('utf-8')).hexdigest() + assert dirhash_ == expected_dirhash def test_include_empty(self): self.mkdirs('root/d1') @@ -492,9 +575,15 @@ def test_include_empty(self): self.mkfile('root/d1/f') args = (self.path_to('root'), 'sha256') - dirhash = dirhash_mp_comp(*args, include_empty=False) - dirhash_empty = dirhash_mp_comp(*args, include_empty=True) - assert dirhash != dirhash_empty + dirhash_ = dirhash_mp_comp( + *args, + empty_dirs=False + ) + dirhash_empty = dirhash_mp_comp( + *args, + empty_dirs=True + ) + assert dirhash_ != dirhash_empty def test_chunksize(self): self.mkdirs('root') @@ -502,12 +591,13 @@ def test_chunksize(self): hash_value = dirhash_mp_comp(self.path_to('root'), 'sha256') for chunk_size in [2**4, 2**8, 2**16]: - assert ( - dirhash_mp_comp(self.path_to('root'), 'sha256', chunk_size=chunk_size) == - hash_value - ) + assert dirhash_mp_comp( + self.path_to('root'), + 'sha256', + chunk_size=chunk_size + ) == hash_value - def test_content_only(self): + def test_data_only(self): self.mkdirs('root1') self.mkfile('root1/a.txt', 'abc') self.mkfile('root1/b.txt', 'def') @@ -519,13 +609,18 @@ def test_content_only(self): hash2 = dirhash_mp_comp(self.path_to('root2'), 'sha256') assert hash1 != hash2 - # with `content_only` hash remains the same as long as order of files is the - # same (based on sorting of file paths) - chash1 = dirhash_mp_comp(self.path_to('root1'), 'sha256', content_only=True) - chash2 = dirhash_mp_comp(self.path_to('root2'), 'sha256', content_only=True) - assert chash1 == chash2 + # with entry hash remains the same as long as order of files is the + # same + [dhash1, dhash2] = [ + dirhash_mp_comp( + self.path_to(root), + 'sha256', + entry_properties=['data'] + ) for root in ['root1', 'root2'] + ] + assert dhash1 == dhash2 - def test_paths_only(self): + def test_name_only(self): self.mkdirs('root1') self.mkfile('root1/a.txt', 'abc') 
self.mkfile('root1/b.txt', 'def') @@ -537,61 +632,59 @@ def test_paths_only(self): hash2 = dirhash_mp_comp(self.path_to('root2'), 'sha256') assert hash1 != hash2 - chash1 = dirhash_mp_comp(self.path_to('root1'), 'sha256', paths_only=True) - chash2 = dirhash_mp_comp(self.path_to('root2'), 'sha256', paths_only=True) - assert chash1 == chash2 - - def test_raise_on_content_only_and_paths_only(self): - self.mkdirs('root1') - self.mkfile('root1/a.txt', 'abc') - dirhash_mp_comp(self.path_to('root1'), 'sha256') # ok! - with pytest.raises(ValueError): + [dhash1, dhash2] = [ dirhash_mp_comp( - self.path_to('root1'), + self.path_to(root), 'sha256', - content_only=True, - paths_only=True - ) + entry_properties=['name'] + ) for root in ['root1', 'root2'] + ] + assert dhash1 == dhash2 - def test_collision_attempt(self): + def test_is_link_property(self): self.mkdirs('root1') - self.mkfile('root1/ab') - self.mkfile('root1/c') - hash1 = dirhash_mp_comp(self.path_to('root1'), 'sha256') - + self.mkfile('root1/a.txt', 'abc') + self.mkfile('root1/b.txt', 'def') self.mkdirs('root2') - self.mkfile('root2/a') - self.mkfile('root2/bc') - hash2 = dirhash_mp_comp(self.path_to('root2'), 'sha256') + self.mkfile('b_target', 'def') + self.mkfile('root2/a.txt', 'abc') + self.symlink('b_target', 'root2/b.txt') - assert not hash1 == hash2 + hash1 = dirhash_mp_comp(self.path_to('root1'), 'sha256') + hash2 = dirhash_mp_comp(self.path_to('root2'), 'sha256') + assert hash1 == hash2 - def test_ignorefile(self): + for entry_properties in [ + ['name', 'data', 'is_link'], + ['name', 'is_link'], + ['data', 'is_link'], + ]: + [hash1, hash2] = [ + dirhash_mp_comp( + self.path_to(root), + 'sha256', + entry_properties=entry_properties + ) for root in ['root1', 'root2'] + ] + assert hash1 != hash2 + + def test_raise_on_not_at_least_one_of_name_and_data(self): self.mkdirs('root1') - self.mkdirs('root2') - for fname in ['a', '.b', 'c.txt']: - self.mkfile(os.path.join('root1', fname)) - self.mkfile(os.path.join('root2', fname)) + self.mkfile('root1/a.txt', 'abc') + dirhash_mp_comp(self.path_to('root1'), 'sha256') # check ok + with pytest.raises(ValueError): + dirhash_mp_comp( + self.path_to('root1'), + 'sha256', + entry_properties=[] + ) - ignorefile = ( - '# my dirhash ignore patterns\n' - '.*\n' - ) - self.mkfile('root1/.dirhashignore', ignorefile) - assert ( - dirhash_mp_comp(self.path_to('root1'), 'sha256') == - dirhash_mp_comp(self.path_to('root2'), 'sha256', ignore=['.*']) - ) - assert ( - dirhash_mp_comp(self.path_to('root1'), 'sha256', ignore=['*.txt']) == - dirhash_mp_comp(self.path_to('root2'), 'sha256', ignore=['.*', '*.txt']) - ) - # ignore file should _not_ be ignored by default: - self.mkfile('root1/.dirhashignore', '# empty ignorefile') - assert ( - dirhash_mp_comp(self.path_to('root1'), 'sha256') != - dirhash_mp_comp(self.path_to('root2'), 'sha256') - ) + with pytest.raises(ValueError): + dirhash_mp_comp( + self.path_to('root1'), + 'sha256', + entry_properties=['is_link'] + ) def test_multiproc_speedup(self): @@ -609,14 +702,126 @@ def test_multiproc_speedup(self): assert elapsed_sequential > expected_min_elapsed start = time() - dirhash(self.path_to('root'), algorithm=SlowHasher, workers=num_files) + dirhash(self.path_to('root'), algorithm=SlowHasher, jobs=num_files) end = time() elapsed_muliproc = end - start - assert elapsed_muliproc < expected_min_elapsed / 2 # at least half! 
+ assert elapsed_muliproc < expected_min_elapsed + # just check "any speedup", the overhead varies (and is high on Travis) + + def test_cache_by_real_path_speedup(self, tmpdir): + num_links = 10 + + # reference run without links + root1 = tmpdir.join('root1') + root1.ensure(dir=True) + for i in range(num_links): + file_i = root1.join('file_{}'.format(i)) + file_i.write('< one chunk content', ensure=True) + + wait_time = SlowHasher.wait_time + expected_min_elapsed = wait_time * num_links + start = time() + dirhash(root1, algorithm=SlowHasher) + end = time() + elapsed_sequential = end - start + assert elapsed_sequential > expected_min_elapsed + overhead = elapsed_sequential - expected_min_elapsed + + # all links to same file + root2 = tmpdir.join('root2') + root2.ensure(dir=True) + target_file = tmpdir.join('target_file') + target_file.ensure() + for i in range(num_links): + root2.join('link_{}'.format(i)).mksymlinkto(target_file) + + overhead_margin_factor = 1.5 + expected_max_elapsed = overhead * overhead_margin_factor + wait_time + assert expected_max_elapsed < expected_min_elapsed + start = time() + dirhash(root2, algorithm=SlowHasher) + end = time() + elapsed_cache = end - start + assert elapsed_cache < expected_max_elapsed + + def test_cache_together_with_multiprocess_speedup(self, tmpdir): + target_file_names = ['target_file_1', 'target_file_2'] + num_links_per_file = 10 + num_links = num_links_per_file * len(target_file_names) + + # reference run without links + root1 = tmpdir.join('root1') + root1.ensure(dir=True) + for i in range(num_links): + file_i = root1.join('file_{}'.format(i)) + file_i.write('< one chunk content', ensure=True) + + jobs = 2 + wait_time = SlowHasher.wait_time + expected_min_elapsed = wait_time * num_links / jobs + start = time() + dirhash(root1, algorithm=SlowHasher, jobs=jobs) + end = time() + elapsed_sequential = end - start + assert elapsed_sequential > expected_min_elapsed + overhead = elapsed_sequential - expected_min_elapsed + + root2 = tmpdir.join('root2') + root2.ensure(dir=True) + for i, target_file_name in enumerate(target_file_names): + target_file = tmpdir.join(target_file_name) + target_file.write('< one chunk content', ensure=True) + for j in range(num_links_per_file): + root2.join('link_{}_{}'.format(i, j)).mksymlinkto(target_file) + + overhead_margin_factor = 1.5 + expected_max_elapsed = overhead * overhead_margin_factor + wait_time * 2 + assert expected_max_elapsed < expected_min_elapsed + start = time() + dirhash(root2, algorithm=SlowHasher, jobs=jobs) + end = time() + elapsed_mp_cache = end - start + assert elapsed_mp_cache < expected_max_elapsed + + def test_hash_cyclic_link_to_root(self): + self.mkdirs('root/d1') + self.symlink('root', 'root/d1/link_back') + dirhash( + self.path_to('root'), + 'sha256', + allow_cyclic_links=True + ) + + def test_hash_cyclic_link(self): + self.mkdirs('root/d1/d2') + self.symlink('root/d1', 'root/d1/d2/link_back') + dirhash( + self.path_to('root'), + 'sha256', + allow_cyclic_links=True + ) + + def test_pass_filtering_instance(self): + self.mkdirs('root') + self.mkfile('root/f1', '') + dirhash_impl(self.path_to('root'), 'sha256', filter_=Filter()) + + def test_pass_protocol_instance(self): + self.mkdirs('root') + self.mkfile('root/f1', '') + dirhash_impl(self.path_to('root'), 'sha256', protocol=Protocol()) + + def test_raise_on_wrong_type(self): + self.mkdirs('root') + self.mkfile('root/f1', '') + with pytest.raises(TypeError): + dirhash_impl(self.path_to('root'), 'sha256', filter_='') + with 
pytest.raises(TypeError): + dirhash_impl(self.path_to('root'), 'sha256', protocol='') class SlowHasher(object): - wait_time = 0.1 + wait_time = 0.05 def __init__(self, *args, **kwargs): pass @@ -627,3 +832,36 @@ def update(self, data): def hexdigest(self): return '' + + +class IdentityHasher(object): + + def __init__(self, initial_data=b''): + self.datas = [initial_data.decode('utf-8')] + + def update(self, data): + self.datas.append(data.decode('utf-8')) + + def hexdigest(self): + return ''.join(self.datas) + + +class TestProtocol(object): + + def test_raise_for_invalid_entry_properties(self): + with pytest.raises(ValueError): + Protocol(entry_properties=['not-valid']) + + def test_raise_for_invalid_allow_cyclic_links(self): + with pytest.raises(ValueError): + Protocol(allow_cyclic_links='not-valid') + + +def mock_func(x): + return x * 2 + + +@pytest.mark.parametrize('jobs', [1, 2, 4]) +def test_parmap(jobs): + inputs = [1, 2, 3, 4] + assert _parmap(mock_func, inputs, jobs=jobs) == [2, 4, 6, 8]
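
For reference, below is a minimal usage sketch of the renamed library API that the updated CLI and tests above exercise (`included_paths`, `dirhash`, `dirhash_impl`, `Filter`, `Protocol`, and the new `empty_dirs`, `linked_files`/`linked_dirs`, `entry_properties`, `allow_cyclic_links` and `jobs` arguments). The directory path and variable names are made up for illustration, and passing both `filter_` and `protocol` to `dirhash_impl` in a single call is assumed to be supported (the tests only pass them one at a time); consult `dirhash -h` and the source for the authoritative signatures.

```python
# Usage sketch only: names and keyword arguments are taken from the calls in
# the tests above; the example path is hypothetical.
from dirhash import Filter, Protocol, dirhash, dirhash_impl, included_paths

root = "path/to/some/directory"  # hypothetical example path

# List the paths that would be included, given the filtering arguments
# (replaces the removed `get_included_paths` and `include_empty`).
paths = included_paths(root, match=["*", "!.*"], empty_dirs=True)

# Hash file content only (replaces `--content-only`/`content_only=True`) and
# read/hash files in two parallel processes (replaces `workers=2`).
data_hash = dirhash(root, "sha256", entry_properties=["data"], jobs=2)

# Tolerate cyclic symlinks: the relative path back to the link target is
# hashed instead of raising SymlinkRecursionError.
cyclic_hash = dirhash(root, "sha256", allow_cyclic_links=True)

# Exclude symlinked files and directories (replaces the removed `follow_links`).
no_links_hash = dirhash(root, "md5", linked_files=False, linked_dirs=False)

# Lower-level entry point that accepts explicit Filter/Protocol instances
# (assumed to accept both at once).
explicit_hash = dirhash_impl(root, "sha256", filter_=Filter(), protocol=Protocol())
```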