diff --git a/README.md b/README.md index 2d96d16..fc2c0e3 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,12 @@ [![codecov](https://codecov.io/gh/data61/blocklib/branch/master/graph/badge.svg)](https://codecov.io/gh/data61/blocklib) +[![Documentation Status](https://readthedocs.org/projects/blocklib/badge/?version=latest)](http://blocklib.readthedocs.io/en/latest/?badge=latest) ![Build Status](https://dev.azure.com/data61/Anonlink/_apis/build/status/data61.blocklib?branchName=master) [![Requirements Status](https://requires.io/github/data61/blocklib/requirements.svg?branch=master)](https://requires.io/github/data61/blocklib/requirements/?branch=master) [![Downloads](https://pepy.tech/badge/blocklib)](https://pepy.tech/project/blocklib) -#Blocklib +# Blocklib Python implementations of record linkage blocking techniques. Blocking is a technique that makes @@ -16,35 +17,35 @@ conducted to find which pairs of records should be linked. `blocklib` is part of the **Anonlink** project for privacy preserving record linkage. -###Installation +### Installation Install with pip: pip install blocklib -###Documents +### Documents -You can find comprehensive documentation and tutorials in folder `docs/`. +You can find comprehensive documentation and tutorials in [readthedocs](http://blocklib.readthedocs.io/en/latest) -###Tests +### Tests Run unit tests with `pytest`:: $ pytest -###Discussion +### Discussion -If you run into bugs, you can file them in our `issue tracker `__ +If you run into bugs, you can file them in our [issue tracker](https://github.com/data61/blocklib/issues) on GitHub. -There is also an `anonlink mailing list `__ +There is also an [anonlink mailing list](https://groups.google.com/forum/#!forum/anonlink) for development discussion and release announcements. -Wherever we interact, we strive to follow the `Python Community Code of Conduct `__. 
+Wherever we interact, we strive to follow the [Python Community Code of Conduct](https://www.python.org/psf/codeofconduct/) -###License and Copyright +### License and Copyright `blocklib` is copyright (c) Commonwealth Scientific and Industrial Research Organisation (CSIRO). diff --git a/blocklib/__init__.py b/blocklib/__init__.py index c47bb04..37b4a1d 100644 --- a/blocklib/__init__.py +++ b/blocklib/__init__.py @@ -10,6 +10,6 @@ from .evaluation import assess_blocks_2party try: - __version__ = pkg_resources.get_distribution('anonlinkclient').version + __version__ = pkg_resources.get_distribution('blocklib').version except pkg_resources.DistributionNotFound: __version__ = "development" \ No newline at end of file diff --git a/docs/blocking-schema.rst b/docs/blocking-schema.rst index d7eb419..99c85be 100644 --- a/docs/blocking-schema.rst +++ b/docs/blocking-schema.rst @@ -8,7 +8,7 @@ features to use in generating blocks and hyperparameters etc. Currently we support two blocking methods: -* "`p-sig`": Probability signature +* "`p-sig`": Probabilistic signature * "`lambda-fold`": LSH based :math:`\lambda`-fold @@ -82,8 +82,8 @@ Next we will detail the specific configuration for supported blocking methods. Specific configuration of supported blocking methods can be found here: -- `config of p-sig ` -- `config of lambda-fold ` +- :ref:`config of p-sig <blocking-schema/p-sig>` +- :ref:`config of lambda-fold <blocking-schema/lambda-fold>` .. _blocking-schema/p-sig: diff --git a/docs/development.rst b/docs/development.rst index dffefc6..aab2d5a 100644 --- a/docs/development.rst +++ b/docs/development.rst @@ -21,7 +21,7 @@ Type Checking ------------- -``anonlink-client`` uses static typechecking with ``mypy``. +``blocklib`` uses static typechecking with ``mypy``. 
To run the type checker (in Python 3.5 or later):: $ pip install mypy $ mypy blocklib --ignore-missing-imports --strict-optional --no-implicit-optional --disallow-untyped-calls diff --git a/tests/test_simmeasure.py b/tests/test_simmeasure.py index 6de1913..8422c7f 100644 --- a/tests/test_simmeasure.py +++ b/tests/test_simmeasure.py @@ -1,6 +1,5 @@ from blocklib.simmeasure import EditSim, DiceSim - def test_editsim(): """Test Edit similarity measure.""" # with minimum threshold @@ -10,8 +9,11 @@ def test_editsim(): # without minimum threshold sim = EditSim({}) - score = sim.sim('Joyce', 'Jone') - assert score > 0 + score1 = sim.sim('Joyce', 'Joyyce') + score2 = sim.sim('Joyce', 'Jyoce') + + # The pair (Joyce, Joyyce) needs only 1 deletion, while the pair (Joyce, Jyoce) needs 2 substitutions + assert score1 > score2 def test_dicesim(): @@ -19,5 +21,9 @@ def test_dicesim(): config = dict(ngram_len=2, ngram_padding=1, padding_start_char='a', padding_end_char='z') sim = DiceSim(config) - score = sim.sim('Joyce wears a T-shrt today', 'Jayce wear an T-shrt today', cache=True) - assert score > 0.8 \ No newline at end of file + # Dice similarity should give a higher score here since it is based on bi-grams + s1 = 'JoJo isis' + s2 = 'Jo is' + score_dice = sim.sim(s1, s2, cache=True) + score_edit = EditSim({}).sim(s1, s2) + assert score_dice > score_edit \ No newline at end of file