From d606c9aa5fa8620597afc00a33ab6386148b633d Mon Sep 17 00:00:00 2001 From: Brian Thorne Date: Tue, 25 Apr 2023 09:49:14 +1200 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9D=20Update=20docs=20and=20set=20vers?= =?UTF-8?q?ion=20to=200.18.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 7 +++++++ README.md | 18 ++++++++++++++---- clkhash/bloomfilter.py | 26 ++++++++++---------------- clkhash/validate_data.py | 4 ++-- pyproject.toml | 2 +- 5 files changed, 34 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 21eac8e8..3a6ab3e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ ## new version +## 0.18.0 + +- Performance improvements by caching hashes of tokens. #664 +- Switch to using `blakeHash` for benchmarking. #664 +- Remove implicit dependency on `setuptools`. #663 +- Migrate to pyproject.toml for dependency management and packaging. #659 + ## 0.17.0 - Remove use of bitarray fork as upstream project now publishes wheels. #557, #567, #573 diff --git a/README.md b/README.md index efe8cde7..32a350ac 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,10 @@ # CLK Hash -Python implementation of cryptographic longterm key hashing as described by Rainer Schnell, Tobias Bachteler, and Jörg Reiher in -[A Novel Error-Tolerant Anonymous Linking Code](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3549247). +

+ Clkhash Logo +

-`clkhash` supports Python versions 3.6+ +
[![codecov](https://codecov.io/gh/data61/clkhash/branch/master/graph/badge.svg)](https://codecov.io/gh/data61/clkhash) [![Documentation Status](https://readthedocs.org/projects/clkhash/badge/?version=latest)](http://clkhash.readthedocs.io/en/latest/?badge=latest) @@ -11,6 +12,11 @@ Python implementation of cryptographic longterm key hashing as described by Rain [![Typechecking](https://github.com/data61/clkhash/actions/workflows/typechecking.yml/badge.svg)](https://github.com/data61/clkhash/actions/workflows/typechecking.yml) [![Downloads](https://pepy.tech/badge/clkhash)](https://pepy.tech/project/clkhash) +
+ +**clkhash** is a Python implementation of cryptographic linkage key hashing as described by _Rainer Schnell, Tobias Bachteler, and Jörg Reiher_ in +[A Novel Error-Tolerant Anonymous Linking Code](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3549247). + ## Installation Install clkhash with all dependencies using pip: @@ -23,7 +29,7 @@ Install clkhash with all dependencies using pip: [https://clkhash.readthedocs.io](https://clkhash.readthedocs.io/en/latest/) -## clkhash api +## Python API To hash a CSV file of entities using the default schema: @@ -33,6 +39,10 @@ fake_pii_schema = randomnames.NameList.SCHEMA clks = clk.generate_clk_from_csv(open('fake-pii-out.csv','r'), 'secret', fake_pii_schema) ``` +## Command Line Interface + +See [Anonlink Client](https://github.com/data61/anonlink-client) for a command line interface to clkhash. + ## Citing Clkhash, and the wider Anonlink project is designed, developed and supported by diff --git a/clkhash/bloomfilter.py b/clkhash/bloomfilter.py index 24f3e8c9..6f42f71e 100644 --- a/clkhash/bloomfilter.py +++ b/clkhash/bloomfilter.py @@ -37,10 +37,7 @@ def double_hash_encode_ngrams(ngrams: Iterable[str], ) -> bitarray: """ Computes the double hash encoding of the ngrams with the given keys. - Using the method from: - Schnell, R., Bachteler, T., & Reiher, J. (2011). - A Novel Error-Tolerant Anonymous Linking Code. - http://grlc.german-microsimulation.de/wp-content/uploads/2017/05/downloadwp-grlc-2011-02.pdf + Using the method from [Schnell2011]_. :param ngrams: list of n-grams to be encoded :param keys: hmac secret keys for md5 and sha1 as bytes @@ -60,7 +57,7 @@ def double_hash_encode_ngrams_non_singular(ngrams: Iterable[str], l: int, encoding: str ) -> bitarray: - """ computes the double hash encoding of the n-grams with the given keys. + """ Computes the double hash encoding of the n-grams with the given keys. The original construction of [Schnell2011]_ displays an abnormality for certain inputs: @@ -108,7 +105,7 @@ def _double_hash_encode_ngrams(ngrams: Tuple[str, ...], ks: Tuple[int, ...], l: int, encoding: str, - non_singular + non_singular: bool ) -> bitarray: key_sha1, key_md5 = keys bf = bitarray(l) @@ -117,9 +114,9 @@ def _double_hash_encode_ngrams(ngrams: Tuple[str, ...], for m, k in zip(ngrams, ks): m_bytes = m.encode(encoding=encoding) if non_singular: - md5hm, sha1hm = _double_hash_token_non_singular(m.encode(encoding=encoding), l, key_sha1, key_md5) + md5hm, sha1hm = _double_hash_token_non_singular(m_bytes, l, key_sha1, key_md5) else: - md5hm, sha1hm = _double_hash_token(m.encode(encoding=encoding), l, key_sha1, key_md5) + md5hm, sha1hm = _double_hash_token(m_bytes, l, key_sha1, key_md5) for i in range(k): gi = (sha1hm + i * md5hm) % l bf[gi] = 1 @@ -160,11 +157,9 @@ def blake_encode_ngrams(ngrams: Iterable[str], ) -> bitarray: """ Computes the encoding of the ngrams using the BLAKE2 hash function. - We deliberately do not use the double hashing scheme as proposed in [ - Schnell2011]_, because this - would introduce an exploitable structure into the Bloom filter. For more - details on the - weakness, see [Kroll2015]_. + We deliberately do not use the double hashing scheme as proposed in + [Schnell2011]_, because this would introduce an exploitable structure + into the Bloom filter. For more details on the weakness, see [Kroll2015]_. In short, the double hashing scheme only allows for :math:`l^2` different encodings for any possible n-gram, @@ -318,13 +313,12 @@ def crypto_bloom_filter(record: Sequence[str], ) -> Tuple[bitarray, str, int]: """ Computes the composite Bloom filter encoding of a record. - Using the method from - http://www.record-linkage.de/-download=wp-grlc-2011-02.pdf + Based on the method from [Schnell2011]_. :param record: plaintext record tuple. E.g. (index, name, dob, gender) :param comparators: A list of comparators. They provide a 'tokenize' function to turn string into appropriate tokens. - :param schema: Schema + :param schema: The Linkage Schema describing how to encode plaintext identifiers. :param keys: Keys for the hash functions as a tuple of lists of bytes. :return: 3-tuple: diff --git a/clkhash/validate_data.py b/clkhash/validate_data.py index 1846dc54..2d11ecf9 100644 --- a/clkhash/validate_data.py +++ b/clkhash/validate_data.py @@ -12,8 +12,8 @@ class EntryError(ValueError): """ An entry is invalid. """ - row_index = None # type: Optional[int] - field_spec = None # type: Optional[FieldSpec] + row_index: Optional[int] = None + field_spec: Optional[FieldSpec] = None class FormatError(ValueError): diff --git a/pyproject.toml b/pyproject.toml index 0ccd86e4..b912517f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "clkhash" -version = "0.17.1" +version = "0.18.0" description = "Encoding utility to create Cryptographic Linkage Keys" license = "Apache" authors = ["Brian Thorne", "Wilko Henecka", "Guillaume Smith"]