Skip to content

Commit

Permalink
Merge branch 'master' of github.com:datascopeanalytics/scrubadub
Browse files Browse the repository at this point in the history
  • Loading branch information
Dean Malmgren committed Oct 31, 2015
2 parents e7be528 + 1fbb132 commit c6a6e48
Show file tree
Hide file tree
Showing 35 changed files with 920 additions and 235 deletions.
22 changes: 22 additions & 0 deletions design/basic_usage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""This is the basic usage of the scrubadub module. It exposes three different
methods for obfuscating personally identifiable information and uses high
recall methods for identifying filth. Precision can be improved by further
customization.
"""

import scrubadub

# NOTE: this is a design sketch; ``text`` and ``lookup`` are not defined in
# this file and are assumed to be supplied by the caller.

# The defaults should be very smart, with high recall and relatively low
# precision. The placeholder method is the default and uses {{}} notation to
# signify where text has been obfuscated, so these two calls are equivalent.
clean_text = scrubadub.clean(text)
clean_text = scrubadub.clean(text, replace_with="placeholder")

# The surrogate replacement method replaces filth with realistic fake values
# (phone numbers with fake phone numbers, for example), which keeps the
# cleaned content easy to read.
clean_text = scrubadub.clean(text, replace_with="surrogate")

# The identifier replacement method replaces the personal information
# associated with each person in ``lookup`` with the same unique id, which
# makes it easy to detect the same person across document records.
clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup)
29 changes: 29 additions & 0 deletions design/customize_filth_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""scrubadub has some very conservative defaults (high recall) for identifying
filth. One of the key ways in which scrubadub can be customized is in improving
the precision of filth detection.
For example, if a user knows that the word 'iPhone' is not a person's name, but
a product, then a user should be able to easily adapt how scrubadub identifies
names.
"""

import scrubadub

# fine-tune how scrubadub detects names and omit product names
# https://github.com/deanmalmgren/scrubadub/issues/6
class MyNameDetector(scrubadub.detectors.NameDetector):
    """Name detector that does not report the product name 'iPhone'."""

    def iter_filth(self, text):
        """Yield every filth found by ``NameDetector`` except 'iPhone'.

        ``text`` is the dirty text to scan.
        """
        for filth in super(MyNameDetector, self).iter_filth(text):
            # Compare the matched text rather than the filth object itself;
            # a filth object is not a string, so ``filth != "iPhone"`` would
            # not filter anything unless Filth defines string equality.
            # NOTE(review): assumes every filth exposes ``.text`` -- confirm
            # against scrubadub.filth.
            if filth.text != "iPhone":
                yield filth

# instantiate a scrubber and swap in our custom class as its name detector
scrubber = scrubadub.Scrubber()
scrubber.detectors['name'] = MyNameDetector()

# these methods on a Scrubber object should behave identically to the
# scrubadub.clean convenience function
clean_text = scrubber.clean(text)
clean_text = scrubber.clean(text, replace_with="placeholder")
clean_text = scrubber.clean(text, replace_with="surrogate")
clean_text = scrubber.clean(text, replace_with="identifier", lookup=lookup)
44 changes: 44 additions & 0 deletions design/customize_replacement_strings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""scrubadub uses {{}} notation by default to identify filth, but a user may
prefer to fine-tune how the filth is removed.
For example, if the input text is html, then a user may want the filth to be
included in a <span> tag that has a particular class on it to make it easy to
style these things.
Another example is a situation when a user wants to retain the domain name on a
URL but not the path.
"""

import scrubadub

# fine-tune the prefix and suffix for all filth. because this changes a class
# attribute on the Filth base class, it should propagate to every filth
# subclass
scrubadub.filth.Filth.prefix = '<span class="scrubadub filth">'
scrubadub.filth.Filth.suffix = '</span>'

# these calls should now all wrap filth in that prefix and suffix
clean_text = scrubadub.clean(text)
clean_text = scrubadub.clean(text, replace_with="placeholder")
clean_text = scrubadub.clean(text, replace_with="surrogate")
clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup)

# and so should these calls on an explicitly constructed Scrubber
scrubber = scrubadub.Scrubber()
clean_text = scrubber.clean(text)
clean_text = scrubber.clean(text, replace_with="placeholder")
clean_text = scrubber.clean(text, replace_with="surrogate")
clean_text = scrubber.clean(text, replace_with="identifier", lookup=lookup)


# reconfigure back to the default {{ }} prefix and suffix combination and now
# keep the domain on UrlFilth
scrubadub.filth.Filth.prefix = '{{'
scrubadub.filth.Filth.suffix = '}}'
scrubadub.filth.UrlFilth.keep_domain = True

# these calls should now use the {{ }} delimiters again and retain the domain
# name (but not the path) of any URL that is scrubbed
clean_text = scrubadub.clean(text)
clean_text = scrubadub.clean(text, replace_with="placeholder")
clean_text = scrubadub.clean(text, replace_with="surrogate")
clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup)
45 changes: 45 additions & 0 deletions design/customize_via_training.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""scrubadub currently removes personally identifiable information with some
regular expression and natural language processing techniques. These techniques
work very well in a wide range of circumstances, but they also tend to make
mistakes.
For example, the first sentence should obfuscate the name 'April' and
the second sentence should not obfuscate the month 'April'.
April is a good friend of mine. I hope to see her in April.
To make this possible, scrubadub needs to be able to incorporate some
techniques for training a classifier to identify filth. The training interface
is important and probably not something that is best done in a terminal, but it
is important that the technical infrastructure is there for it to work.
"""

import scrubadub

# NOTE: design sketch; ``training_documents``, ``test_documents`` and
# ``person_identifies_filth`` are not defined in this file.

# a TrainedScrubber can be taught what is dirty about a particular document.
scrubber = scrubadub.TrainedScrubber()
for document in training_documents:

    # TrainedScrubber.detect_filth just returns a list of the filth objects
    # that are yielded by Scrubber.iter_filth. This preliminary list is used
    # to make classification easy for end users.
    filth_list = scrubber.detect_filth(document)

    # The filth_list is then refined by human input. It is very difficult to
    # imagine doing this in a terminal in an effective way (although `git add
    # -i` might be a decent example). I imagine that person_identifies_filth is
    # a web interface where users can easily brush text to improve recall and
    # adjust the preliminary filth_list to improve precision.
    filth_list = person_identifies_filth(document, filth_list)

    # The TrainedScrubber.train method should incorporate the filth_list into
    # its classifier and also return a cleaned document with the filth
    # removed in an appropriate way.
    cleaned_document = scrubber.train(document, filth_list)

# the TrainedScrubber.predict (or maybe just TrainedScrubber.clean?) method
# then uses the classifier to selectively clean filth based on the human
# input. This way, you might only have to train on ~1000 documents to do a good
# job of scrubbing the rest (imagine having to do this by hand for 1 million
# documents)
for document in test_documents:
    clean_document = scrubber.predict(document)
8 changes: 4 additions & 4 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,17 @@ incorporating it into your python scripts like this:
# Replace names with {{NAME}} placeholder. This is the scrubadub default
# because it maximally omits any information about people.
>>> placeholder_text = scrubadub.clean_with_placeholders(text)
>>> placeholder_text = scrubadub.clean(text)
>>> placeholder_text
u"{{NAME}} is a cat"
.. # Replace names with {{NAME-ID}} anonymous, but consistent IDs.
>>> identifier_text = scrubadub.clean_with_identifiers(text)
>>> identifier_text = scrubadub.clean(text, replace_with='identifier')
>>> identifier_text
u"{{NAME-1287}} is a cat"
.. # Replace names with random, gender-consistent names
>>> surrogate_text = scrubadub.clean_with_surrogates(text)
>>> surrogate_text = scrubadub.clean(text, replace_with='surrogate')
>>> surrogate_text
u"Billy is a cat"
Expand All @@ -62,7 +62,7 @@ incorporating it into your python scripts like this:
... return text
...
>>> text = u"John's email address is cat@gmail.com"
>>> text = scrubadub.clean_with_placeholders(text, cls=NoEmailScrubber)
>>> text = scrubadub.clean(text, cls=NoEmailScrubber)
>>> text
u"{{NAME}}'s email address is cat@gmail.com"
Expand Down
3 changes: 3 additions & 0 deletions requirements/python-dev
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@ nose
# for documentation
sphinx
sphinx_rtd_theme

# for convenience
ipdb
7 changes: 4 additions & 3 deletions scrubadub/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@

# convenient imports
from .scrubbers import Scrubber

from . import filth
from . import detectors

__version__ = VERSION = "0.1.0"


def clean(text, cls=None, **kwargs):
    """Public facing function to clean ``text`` using the scrubber ``cls``.

    ``cls`` is the scrubber class to instantiate (with no arguments); it
    defaults to ``Scrubber`` when omitted or falsy.

    Any additional keyword arguments (for example
    ``replace_with='surrogate'``) are passed through unchanged to the
    scrubber's ``clean`` method, whose result is returned.
    """
    cls = cls or Scrubber
    scrubber = cls()
    return scrubber.clean(text, **kwargs)
17 changes: 17 additions & 0 deletions scrubadub/detectors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from .name import NameDetector
from .email import EmailDetector
from .url import UrlDetector
from .phone import PhoneDetector
from .credential import CredentialDetector
from .skype import SkypeDetector


# convenience mapping from a short detector name to its detector class, so
# that all of the detectors can be looked up and instantiated at once
types = {
    "name": NameDetector,
    "email": EmailDetector,
    "url": UrlDetector,
    "phone": PhoneDetector,
    "credential": CredentialDetector,
    "skype": SkypeDetector,
}
23 changes: 23 additions & 0 deletions scrubadub/detectors/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import re

from .. import exceptions
from ..filth import RegexFilth


class Detector(object):
    """Base class for detectors; subclasses must implement ``iter_filth``."""

    def iter_filth(self, text):
        """Iterate over the filth found in ``text``.

        This base implementation always raises ``NotImplementedError``.
        """
        # the original message said "base classes", but it is the derived
        # classes that have to provide the implementation
        raise NotImplementedError('must be overridden by derived classes')


class RegexDetector(Detector):
    """Detector that finds filth with the regular expression stored on
    ``filth_cls.regex``.
    """
    filth_cls = RegexFilth

    def iter_filth(self, text):
        """Yield one ``filth_cls`` instance per regex match in ``text``.

        Yields nothing when ``filth_cls.regex`` is ``None``. Because this is
        a generator, ``exceptions.UnexpectedFilth`` (raised when ``filth_cls``
        is not a ``RegexFilth`` subclass) surfaces on first iteration rather
        than when the method is called.
        """
        if not issubclass(self.filth_cls, RegexFilth):
            raise exceptions.UnexpectedFilth(
                'RegexFilth required for RegexDetector'
            )
        # End the generator with a bare return. Raising StopIteration inside
        # a generator is turned into RuntimeError by PEP 479 (Python 3.7+).
        if self.filth_cls.regex is None:
            return
        for match in self.filth_cls.regex.finditer(text):
            yield self.filth_cls(match)
9 changes: 9 additions & 0 deletions scrubadub/detectors/credential.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

from .base import RegexDetector
from ..filth import CredentialFilth


class CredentialDetector(RegexDetector):
    """Detect username/password combinations in dirty dirty ``text``.
    """
    filth_cls = CredentialFilth
12 changes: 12 additions & 0 deletions scrubadub/detectors/email.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import re

from .base import RegexDetector
from ..filth import EmailFilth


class EmailDetector(RegexDetector):
    """Use regular expression magic to find email addresses in dirty
    dirty ``text``. This detector also catches email addresses like ``john at
    gmail.com``.
    """
    filth_cls = EmailFilth
44 changes: 44 additions & 0 deletions scrubadub/detectors/name.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import re

import textblob

from .base import RegexDetector
from ..filth import NameFilth
from ..utils import CanonicalStringSet


class NameDetector(RegexDetector):
    """Use part of speech tagging to find proper nouns in the dirty dirty
    ``text``. Disallow particular nouns by adding them to the
    ``NameDetector.disallowed_nouns`` set.
    """
    filth_cls = NameFilth

    disallowed_nouns = CanonicalStringSet(["skype"])

    def iter_filth(self, text):
        """Return an iterator over the name filth found in ``text``.

        Words tagged by textblob as proper nouns (``NNP``/``NNPS``) and not
        listed in ``disallowed_nouns`` are compiled into a regex that is
        stored on ``filth_cls.regex``; matching is then delegated to
        ``RegexDetector.iter_filth``.

        Raises ``TypeError`` if ``disallowed_nouns`` is not a
        ``CanonicalStringSet``.
        """
        if not isinstance(self.disallowed_nouns, CanonicalStringSet):
            raise TypeError(
                'NameDetector.disallowed_nouns must be CanonicalStringSet'
            )

        # find the set of proper nouns using textblob.
        proper_nouns = set()
        blob = textblob.TextBlob(text)
        for word, part_of_speech in blob.tags:
            is_proper_noun = part_of_speech in ("NNP", "NNPS")
            if is_proper_noun and word.lower() not in self.disallowed_nouns:
                proper_nouns.add(word)

        # use a regex to locate the proper nouns by first escaping any
        # lingering punctuation in the regex
        # http://stackoverflow.com/a/4202559/564709
        if proper_nouns:
            # Set iteration order is arbitrary, and the regex engine takes
            # the first alternative that matches. Sort longest-first (ties
            # broken alphabetically) so the result is reproducible and the
            # longest proper noun wins when alternatives overlap.
            ordered_nouns = sorted(
                proper_nouns, key=lambda noun: (-len(noun), noun)
            )
            pattern = '|'.join(
                r'\b' + re.escape(noun) + r'\b' for noun in ordered_nouns
            )
            self.filth_cls.regex = re.compile(pattern)
        else:
            self.filth_cls.regex = None

        # NOTE(review): the regex lives on the shared filth class and the
        # parent returns a lazy generator that reads it at iteration time, so
        # a second iter_filth call before this iterator is consumed will
        # replace the pattern it uses.
        return super(NameDetector, self).iter_filth(text)
26 changes: 26 additions & 0 deletions scrubadub/detectors/phone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import phonenumbers

from .base import Detector
from ..filth import PhoneFilth


class PhoneDetector(Detector):
    """Find phone numbers in dirty dirty ``text`` with `python-phonenumbers
    <https://github.com/daviddrysdale/python-phonenumbers>`_, a port of a
    Google project to correctly format phone numbers in text.

    ``region`` is the best-guess region handed to the matcher (default:
    ``"US"``). Set it to ``None`` so that only numbers with a leading ``+``
    are considered.
    """
    region = 'US'

    def iter_filth(self, text):
        """Yield a ``PhoneFilth`` for each phone number matched in ``text``."""
        matcher = phonenumbers.PhoneNumberMatcher(text, self.region)
        for number_match in matcher:
            # carry over the match span and the exact substring that matched
            yield PhoneFilth(
                beg=number_match.start,
                end=number_match.end,
                text=number_match.raw_string,
            )
Loading

0 comments on commit c6a6e48

Please sign in to comment.