Merge branch 'master' of github.com:datascopeanalytics/scrubadub

LeapBeyond · Oct 31, 2015 · c6a6e48 · c6a6e48
2 parents e7be528 + 1fbb132
commit c6a6e48
Show file tree

Hide file tree

Showing 35 changed files with 920 additions and 235 deletions.
diff --git a/design/basic_usage.py b/design/basic_usage.py
@@ -0,0 +1,22 @@
+"""This is the basic usage of the scrubadub module. It exposes three different
+methods for obfuscating personally identifiable information and uses high
+recall methods for identifying filth. Precision can be improved by further
+customization.
+"""
+
+import scrubadub
+
+# this should have very smart defaults, with high recall and relatively low
+# precision. the placeholder method is default and uses {{}} notation to
+# signify when text has been obfuscated
+clean_text = scrubadub.clean(text)
+clean_text = scrubadub.clean(text, replace_with="placeholder")
+
+# the surrogate replacement method makes it easy to replace phone numbers with
+# fake phone numbers, for example. this makes it easy to read the content
+clean_text = scrubadub.clean(text, replace_with="surrogate")
+
+# the identifier replacement method replaces the personal information
+# associated with each person in lookup with the same unique id to make it easy
+# to detect the same person across document records.
+clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup)
diff --git a/design/customize_filth_detection.py b/design/customize_filth_detection.py
@@ -0,0 +1,29 @@
+"""scrubadub has some very conservative defaults (high recall) for identifying
+filth. One of the key ways in which scrubadub can be customized is in improving
+the precision of filth detection.
+
+For example, if a user knows that the word 'iPhone' is not a person's name, but
+a product, then a user should be able to easily adapt how scrubadub identifies
+names.
+"""
+
+import scrubadub
+
+# fine-tune how scrubadub detects names and omit product names
+# https://github.com/deanmalmgren/scrubadub/issues/6
+class MyNameDetector(scrubadub.detectors.NameDetector):
+    def iter_filth(self, text):
+        for filth in super(MyNameDetector, self).iter_filth(text):
+            if filth != "iPhone":
+                yield filth
+
+# instantiate a scrubber and change the name detector to use our custom class
+scrubber = scrubadub.Scrubber()
+scrubber.detectors['name'] = MyNameDetector()
+
+# calling these methods on a Scrubber object should have identical
+# behavior to the scrubadub.clean convenience function
+clean_text = scrubber.clean(text)
+clean_text = scrubber.clean(text, replace_with="placeholder")
+clean_text = scrubber.clean(text, replace_with="surrogate")
+clean_text = scrubber.clean(text, replace_with="identifier", lookup=lookup)
diff --git a/design/customize_replacement_strings.py b/design/customize_replacement_strings.py
@@ -0,0 +1,44 @@
+"""scrubadub uses {{}} notation by default to identify filth, but a user may
+prefer to fine-tune how the filth is removed.
+
+For example, if the input text is html, then a user may want the filth to be
+included in a <span> tag that has a particular class on it to make it easy to
+style these things.
+
+Another example is a situation when a user wants to retain the domain name on a
+URL but not the path.
+"""
+
+import scrubadub
+
+# fine tune the prefix and suffix for all scrubadub objects. because this is
+# changing a class attribute on the base class, this should propagate to all
+# filth
+scrubadub.filth.Filth.prefix = '<span class="scrubadub filth">'
+scrubadub.filth.Filth.suffix = '</span>'
+
+# these methods should now all have that prefix and suffix
+clean_text = scrubadub.clean(text)
+clean_text = scrubadub.clean(text, replace_with="placeholder")
+clean_text = scrubadub.clean(text, replace_with="surrogate")
+clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup)
+
+# and so should these
+scrubber = scrubadub.Scrubber()
+clean_text = scrubber.clean(text)
+clean_text = scrubber.clean(text, replace_with="placeholder")
+clean_text = scrubber.clean(text, replace_with="surrogate")
+clean_text = scrubber.clean(text, replace_with="identifier", lookup=lookup)
+
+
+# reconfigure back to the old prefix and suffix combination and now keep the
+# domain on UrlFilth
+scrubadub.filth.Filth.prefix = '{{'
+scrubadub.filth.Filth.suffix = '}}'
+scrubadub.filth.UrlFilth.keep_domain = True
+
+# these methods should now use {{}} notation again and keep the URL domain
+clean_text = scrubadub.clean(text)
+clean_text = scrubadub.clean(text, replace_with="placeholder")
+clean_text = scrubadub.clean(text, replace_with="surrogate")
+clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup)
diff --git a/design/customize_via_training.py b/design/customize_via_training.py
@@ -0,0 +1,45 @@
+"""scrubadub currently removes personally identifiable information with some
+regular expression and natural language processing techniques. These techniques
+work very well in a wide range of circumstances, but they also tend to make
+mistakes.
+
+For example, the first sentence should obfuscate the name 'April' and
+the second sentence should not obfuscate the month 'April'.
+
+April is a good friend of mine. I hope to see her in April.
+
+To make this possible, scrubadub needs to be able to incorporate some
+techniques for training a classifier to identify filth. The training interface
+is important and probably not something that is best done in a terminal, but it
+is important that the technical infrastructure is there for it to work.
+"""
+
+import scrubadub
+
+# a TrainedScrubber can be taught what is dirty about a particular document.
+scrubber = scrubadub.TrainedScrubber()
+for document in training_documents:
+
+    # TrainedScrubber.detect_filth just returns a list of filth objects that
+    # are returned by Scrubber.iter_filth. This is used to help make
+    # classification easy for end users.
+    filth_list = scrubber.detect_filth(document)
+
+    # The filth_list is then refined by human input. It is very difficult to
+    # imagine doing this in a terminal in an effective way (although `git add
+    # -i` might be a decent example). I imagine that person_identifies_filth is
+    # a web interface where users can easily brush text to improve recall and
+    # adjust the preliminary filth_list to improve precision.
+    filth_list = person_identifies_filth(document, filth_list)
+
+    # The TrainedScrubber.train method should incorporate the filth_list into
+    # its classifier and further return a cleaned document with the filth
+    # removed in an appropriate way.
+    cleaned_document = scrubber.train(document, filth_list)
+
+# the TrainedScrubber.predict (or maybe just TrainedScrubber.clean?) method is
+# then used to use the classifier to selectively clean filth based on the human
+# input. This way, you might only have to train ~1000 documents to do a good
+# job of scrubbing the rest (imagine having to do this for 1mm documents)
+for document in test_documents:
+    clean_document = scrubber.predict(document)
diff --git a/docs/index.rst b/docs/index.rst
@@ -39,17 +39,17 @@ incorporating it into your python scripts like this:
 
     # Replace names with {{NAME}} placeholder. This is the scrubadub default
     # because it maximally omits any information about people.
-    >>> placeholder_text = scrubadub.clean_with_placeholders(text)
+    >>> placeholder_text = scrubadub.clean(text)
     >>> placeholder_text
     u"{{NAME}} is a cat"
 
 ..    # Replace names with {{NAME-ID}} anonymous, but consistent IDs.
-    >>> identifier_text = scrubadub.clean_with_identifiers(text)
+    >>> identifier_text = scrubadub.clean(text, replace_with='identifier')
     >>> identifier_text
     u"{{NAME-1287}} is a cat"
 
 ..    # Replace names with random, gender-consistent names
-    >>> surrogate_text = scrubadub.clean_with_surrogates(text)
+    >>> surrogate_text = scrubadub.clean(text, replace_with='surrogate')
     >>> surrogate_text
     u"Billy is a cat"
 
@@ -62,7 +62,7 @@ incorporating it into your python scripts like this:
     ...         return text
     ...
     >>> text = u"John's email address is cat@gmail.com"
-    >>> text = scrubadub.clean_with_placeholders(text, cls=NoEmailScrubber)
+    >>> text = scrubadub.clean(text, cls=NoEmailScrubber)
     >>> text
     u"{{NAME}}'s email address is cat@gmail.com'"
 

diff --git a/requirements/python-dev b/requirements/python-dev
@@ -13,3 +13,6 @@ nose
 # for documentation
 sphinx
 sphinx_rtd_theme
+
+# for convenience
+ipdb
diff --git a/scrubadub/__init__.py b/scrubadub/__init__.py
@@ -1,15 +1,16 @@
 
 # convenient imports
 from .scrubbers import Scrubber
-
+from . import filth
+from . import detectors
 
 __version__ = VERSION = "0.1.0"
 
 
-def clean_with_placeholders(text, cls=None):
+def clean(text, cls=None, **kwargs):
     """Public facing function to clean ``text`` using the scrubber ``cls`` by
     replacing all personal information with ``{{PLACEHOLDERS}}``.
     """
     cls = cls or Scrubber
     scrubber = cls()
-    return scrubber.clean_with_placeholders(text)
+    return scrubber.clean(text, **kwargs)
diff --git a/scrubadub/detectors/__init__.py b/scrubadub/detectors/__init__.py
@@ -0,0 +1,17 @@
+from .name import NameDetector
+from .email import EmailDetector
+from .url import UrlDetector
+from .phone import PhoneDetector
+from .credential import CredentialDetector
+from .skype import SkypeDetector
+
+
+# convenience object for instantiating all of the detectors at once
+types = {
+    "name": NameDetector,
+    "email": EmailDetector,
+    "url": UrlDetector,
+    "phone": PhoneDetector,
+    "credential": CredentialDetector,
+    "skype": SkypeDetector,
+}
diff --git a/scrubadub/detectors/base.py b/scrubadub/detectors/base.py
@@ -0,0 +1,23 @@
+import re
+
+from .. import exceptions
+from ..filth import RegexFilth
+
+
+class Detector(object):
+    def iter_filth(self, text):
+        raise NotImplementedError('must be overridden by base classes')
+
+
+class RegexDetector(Detector):
+    filth_cls = RegexFilth
+
+    def iter_filth(self, text):
+        if not issubclass(self.filth_cls, RegexFilth):
+            raise exceptions.UnexpectedFilth(
+                'RegexFilth required for RegexDetector'
+            )
+        if self.filth_cls.regex is None:
+            raise StopIteration
+        for match in self.filth_cls.regex.finditer(text):
+            yield self.filth_cls(match)
diff --git a/scrubadub/detectors/credential.py b/scrubadub/detectors/credential.py
@@ -0,0 +1,9 @@
+
+from .base import RegexDetector
+from ..filth import CredentialFilth
+
+
+class CredentialDetector(RegexDetector):
+    """Remove username/password combinations from dirty dirty ``text``.
+    """
+    filth_cls = CredentialFilth
diff --git a/scrubadub/detectors/email.py b/scrubadub/detectors/email.py
@@ -0,0 +1,12 @@
+import re
+
+from .base import RegexDetector
+from ..filth import EmailFilth
+
+
+class EmailDetector(RegexDetector):
+    """Use regular expression magic to remove email addresses from dirty
+    dirty ``text``. This method also catches email addresses like ``john at
+    gmail.com``.
+    """
+    filth_cls = EmailFilth
diff --git a/scrubadub/detectors/name.py b/scrubadub/detectors/name.py
@@ -0,0 +1,44 @@
+import re
+
+import textblob
+
+from .base import RegexDetector
+from ..filth import NameFilth
+from ..utils import CanonicalStringSet
+
+
+class NameDetector(RegexDetector):
+    """Use part of speech tagging to clean proper nouns out of the dirty dirty
+    ``text``. Disallow particular nouns by adding them to the
+    ``NameDetector.disallowed_nouns`` set.
+    """
+    filth_cls = NameFilth
+
+    disallowed_nouns = CanonicalStringSet(["skype"])
+
+    def iter_filth(self, text):
+
+        if not isinstance(self.disallowed_nouns, CanonicalStringSet):
+            raise TypeError(
+                'NameDetector.disallowed_nouns must be CanonicalStringSet'
+            )
+
+        # find the set of proper nouns using textblob.
+        proper_nouns = set()
+        blob = textblob.TextBlob(text)
+        for word, part_of_speech in blob.tags:
+            is_proper_noun = part_of_speech in ("NNP", "NNPS")
+            if is_proper_noun and word.lower() not in self.disallowed_nouns:
+                proper_nouns.add(word)
+
+        # use a regex to replace the proper nouns by first escaping any
+        # lingering punctuation in the regex
+        # http://stackoverflow.com/a/4202559/564709
+        if proper_nouns:
+            re_list = []
+            for proper_noun in proper_nouns:
+                re_list.append(r'\b' + re.escape(proper_noun) + r'\b')
+            self.filth_cls.regex = re.compile('|'.join(re_list))
+        else:
+            self.filth_cls.regex = None
+        return super(NameDetector, self).iter_filth(text)
diff --git a/scrubadub/detectors/phone.py b/scrubadub/detectors/phone.py
@@ -0,0 +1,26 @@
+import phonenumbers
+
+from .base import Detector
+from ..filth import PhoneFilth
+
+
+class PhoneDetector(Detector):
+    """Remove phone numbers from dirty dirty ``text`` using
+    `python-phonenumbers
+    <https://github.com/daviddrysdale/python-phonenumbers>`_, a port of a
+    Google project to correctly format phone numbers in text.
+
+    ``region`` specifies the best guess region to start with (default:
+    ``"US"``). Specify ``None`` to only consider numbers with a leading
+    ``+``.
+    """
+    region = 'US'
+
+    def iter_filth(self, text):
+        # yield a PhoneFilth for every phone number match found in text
+        for match in phonenumbers.PhoneNumberMatcher(text, self.region):
+            yield PhoneFilth(
+                beg=match.start,
+                end=match.end,
+                text=match.raw_string,
+            )