-
Notifications
You must be signed in to change notification settings - Fork 95
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' of github.com:datascopeanalytics/scrubadub
- Loading branch information
Showing
35 changed files
with
920 additions
and
235 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
"""This is the basic usage of the scrubadub module. It exposes three different | ||
methods for obfuscating personally identifiable information and uses high | ||
recall methods for identifying filth. Precision can be improved by further | ||
customization. | ||
""" | ||
|
||
import scrubadub | ||
|
||
# this should have very smart defaults, with high recall and relatively low | ||
# precision. the placeholder method is default and uses {{}} notation to | ||
# signify when text has been obfuscated | ||
clean_text = scrubadub.clean(text) | ||
clean_text = scrubadub.clean(text, replace_with="placeholder") | ||
|
||
# the surrogate replacement method makes it easy to replace phone numbers with | ||
# fake phone numbers, for example. this makes it easy to read the content | ||
clean_text = scrubadub.clean(text, replace_with="surrogate") | ||
|
||
# the identifier replacement method replaces the personal information | ||
# associated with each person in lookup with the same unique id to make it easy | ||
# to detect the same person across document records. | ||
clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
"""scrubadub has some very conservative defaults (high recall) for identifying | ||
filth. One of the key ways in which scrubadub can be customized is in improving | ||
the precision of filth detection. | ||
For example, if a user knows that the word 'iPhone' is not a person's name, but | ||
a product, then a user should be able to easily adapt how scrubadub identifies | ||
names. | ||
""" | ||
|
||
import scrubadub | ||
|
||
# fine-tune how scrubadub detects names and omit product names | ||
# https://github.com/deanmalmgren/scrubadub/issues/6 | ||
class MyNameDetector(scrubadub.detectors.NameDetector):
    """Name detector that does not report the product name 'iPhone'."""

    def iter_filth(self, text):
        """Yield the filth found by ``NameDetector.iter_filth`` in ``text``,
        skipping any whose matched text is exactly ``"iPhone"``.
        """
        for filth in super(MyNameDetector, self).iter_filth(text):
            # compare the matched string, not the filth object itself.
            # NOTE(review): assumes filth objects expose the matched string
            # as ``.text`` (PhoneFilth is built with ``text=``) -- confirm
            # against scrubadub.filth.Filth
            if filth.text != "iPhone":
                yield filth
|
||
# instantiate a scrubber and change the name detector to use our custom class | ||
scrubber = scrubadub.Scrubber() | ||
scrubber.detectors['name'] = MyNameDetector() | ||
|
||
# these methods on a Scrubber object should have identical behavior to the | ||
# scrubadub.clean convenience function | ||
clean_text = scrubber.clean(text) | ||
clean_text = scrubber.clean(text, replace_with="placeholder") | ||
clean_text = scrubber.clean(text, replace_with="surrogate") | ||
clean_text = scrubber.clean(text, replace_with="identifier", lookup=lookup) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
"""scrubadub uses {{}} notation by default to identify filth, but a user may | ||
prefer to fine-tune how the filth is removed. | ||
For example, if the input text is html, then a user may want the filth to be | ||
included in a <span> tag that has a particular class on it to make it easy to | ||
style these things. | ||
Another example is a situation when a user wants to retain the domain name on a | ||
URL but not the path. | ||
""" | ||
|
||
import scrubadub | ||
|
||
# fine tune the prefix and suffix for all scrubadub objects. because this is | ||
# changing a class attribute on the base class, this should propagate to all | ||
# filth | ||
scrubadub.filth.Filth.prefix = '<span class="scrubadub filth">' | ||
scrubadub.filth.Filth.suffix = '</span>' | ||
|
||
# these methods should now all have that prefix and suffix | ||
clean_text = scrubadub.clean(text) | ||
clean_text = scrubadub.clean(text, replace_with="placeholder") | ||
clean_text = scrubadub.clean(text, replace_with="surrogate") | ||
clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup) | ||
|
||
# and so should these | ||
scrubber = scrubadub.Scrubber() | ||
clean_text = scrubber.clean(text) | ||
clean_text = scrubber.clean(text, replace_with="placeholder") | ||
clean_text = scrubber.clean(text, replace_with="surrogate") | ||
clean_text = scrubber.clean(text, replace_with="identifier", lookup=lookup) | ||
|
||
|
||
# reconfigure back to the old prefix and suffix combination and now keep the | ||
# domain on UrlFilth | ||
scrubadub.filth.Filth.prefix = '{{' | ||
scrubadub.filth.Filth.suffix = '}}' | ||
scrubadub.filth.UrlFilth.keep_domain = True | ||
|
||
# these methods should now use the {{}} notation again and keep URL domains | ||
clean_text = scrubadub.clean(text) | ||
clean_text = scrubadub.clean(text, replace_with="placeholder") | ||
clean_text = scrubadub.clean(text, replace_with="surrogate") | ||
clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
"""scrubadub currently removes personally identifiable information with some | ||
regular expression and natural language processing techniques. These techniques | ||
work very well in a wide range of circumstances, but they also tend to make | ||
mistakes. | ||
For example, the first sentence should obfuscate the name 'April' and | ||
the second sentence should not obfuscate the month 'April'. | ||
April is a good friend of mine. I hope to see her in April. | ||
To make this possible, scrubadub needs to be able to incorporate some | ||
techniques for training a classifier to identify filth. The training interface | ||
is important and probably not something that is best done in a terminal, but it | ||
is important that the technical infrastructure is there for it to work. | ||
""" | ||
|
||
import scrubadub | ||
|
||
# a TrainedScrubber can be taught what is dirty about a particular document. | ||
scrubber = scrubadub.TrainedScrubber() | ||
for document in training_documents: | ||
|
||
# TrainedScrubber.detect_filth just returns a list of filth objects that | ||
# are returned by Scrubber.iter_filth. This is used to help make | ||
# classification easy for end users. | ||
filth_list = scrubber.detect_filth(document) | ||
|
||
# The filth_list is then refined by human input. It is very difficult to | ||
# imagine doing this in a terminal in an effective way (although `git add | ||
# -i` might be a decent example). I imagine that person_identifies_filth is | ||
# a web interface where users can easily brush text to improve recall and | ||
# adjust the preliminary filth_list to improve precision. | ||
filth_list = person_identifies_filth(document, filth_list) | ||
|
||
# The TrainedScrubber.train method should incorporate the filth_list into | ||
# its classifier and further return a cleaned document with the filth | ||
# removed in an appropriate way. | ||
cleaned_document = scrubber.train(document, filth_list) | ||
|
||
# the TrainedScrubber.predict (or maybe just TrainedScrubber.clean?) method | ||
# then applies the classifier to selectively clean filth based on the human | ||
# input. This way, you might only have to train on ~1000 documents to do a | ||
# good job of scrubbing the rest (imagine doing this for 1 million documents) | ||
for document in test_documents: | ||
clean_document = scrubber.predict(document) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,3 +13,6 @@ nose | |
# for documentation | ||
sphinx | ||
sphinx_rtd_theme | ||
|
||
# for convenience | ||
ipdb |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,16 @@ | ||
|
||
# convenient imports | ||
from .scrubbers import Scrubber | ||
|
||
from . import filth | ||
from . import detectors | ||
|
||
__version__ = VERSION = "0.1.0" | ||
|
||
|
||
def clean(text, cls=None, **kwargs):
    """Public facing function to clean ``text`` using the scrubber ``cls``.

    ``cls`` is instantiated with no arguments and defaults to ``Scrubber``.
    Any extra keyword arguments (e.g. ``replace_with``) are forwarded
    unchanged to the scrubber's ``clean`` method, whose return value is
    returned as-is.
    """
    scrubber = (cls or Scrubber)()
    return scrubber.clean(text, **kwargs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from .name import NameDetector | ||
from .email import EmailDetector | ||
from .url import UrlDetector | ||
from .phone import PhoneDetector | ||
from .credential import CredentialDetector | ||
from .skype import SkypeDetector | ||
|
||
|
||
# convenience object for instantiating all of the detectors at once: maps
# each detector's short name to its (uninstantiated) detector class
types = {
    "name": NameDetector,
    "email": EmailDetector,
    "url": UrlDetector,
    "phone": PhoneDetector,
    "credential": CredentialDetector,
    "skype": SkypeDetector,
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import re | ||
|
||
from .. import exceptions | ||
from ..filth import RegexFilth | ||
|
||
|
||
class Detector(object):
    """Base class for detectors; subclasses implement ``iter_filth``."""

    def iter_filth(self, text):
        """Iterate over the filth found in ``text``.

        Always raises ``NotImplementedError`` here; every concrete detector
        must override this method.
        """
        raise NotImplementedError('must be overridden by subclasses')
|
||
|
||
class RegexDetector(Detector):
    """Detector that finds filth with the regular expression stored on its
    ``filth_cls``.
    """
    # filth class instantiated for every match; its ``regex`` attribute is the
    # compiled pattern that is searched for (``None`` means nothing to find)
    filth_cls = RegexFilth

    def iter_filth(self, text):
        """Yield one ``filth_cls`` instance per match of ``filth_cls.regex``
        in ``text``.

        Because this is a generator, the checks below run on first
        iteration. Raises ``exceptions.UnexpectedFilth`` if ``filth_cls`` is
        not a subclass of ``RegexFilth``; yields nothing when the regex is
        ``None``.
        """
        if not issubclass(self.filth_cls, RegexFilth):
            raise exceptions.UnexpectedFilth(
                'RegexFilth required for RegexDetector'
            )
        # finish the generator with a plain return: raising StopIteration
        # inside a generator is turned into a RuntimeError by PEP 479
        if self.filth_cls.regex is None:
            return
        for match in self.filth_cls.regex.finditer(text):
            yield self.filth_cls(match)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
|
||
from .base import RegexDetector | ||
from ..filth import CredentialFilth | ||
|
||
|
||
class CredentialDetector(RegexDetector):
    """Remove username/password combinations from dirty dirty ``text``.
    """
    # filth class whose ``regex`` the inherited RegexDetector.iter_filth uses
    filth_cls = CredentialFilth
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import re | ||
|
||
from .base import RegexDetector | ||
from ..filth import EmailFilth | ||
|
||
|
||
class EmailDetector(RegexDetector):
    """Use regular expression magic to remove email addresses from dirty
    dirty ``text``. This detector also catches email addresses like ``john at
    gmail.com``.
    """
    # filth class whose ``regex`` the inherited RegexDetector.iter_filth uses
    filth_cls = EmailFilth
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import re | ||
|
||
import textblob | ||
|
||
from .base import RegexDetector | ||
from ..filth import NameFilth | ||
from ..utils import CanonicalStringSet | ||
|
||
|
||
class NameDetector(RegexDetector):
    """Detect proper nouns in the dirty dirty ``text`` with part of speech
    tagging. Nouns added to the ``NameDetector.disallowed_nouns`` set are
    never treated as names.
    """
    filth_cls = NameFilth

    disallowed_nouns = CanonicalStringSet(["skype"])

    def iter_filth(self, text):
        """Return an iterator over the name filth found in ``text``.

        Raises ``TypeError`` when ``disallowed_nouns`` is not a
        ``CanonicalStringSet``. As a side effect the regex on ``filth_cls``
        is replaced by one matching the proper nouns of ``text`` (or by
        ``None`` when there are none).
        """
        blocked = self.disallowed_nouns
        if not isinstance(blocked, CanonicalStringSet):
            raise TypeError(
                'NameDetector.disallowed_nouns must be CanonicalStringSet'
            )

        # gather every word that textblob tags as a proper noun and that is
        # not explicitly disallowed
        names = set()
        for token, tag in textblob.TextBlob(text).tags:
            if tag not in ("NNP", "NNPS"):
                continue
            if token.lower() in blocked:
                continue
            names.add(token)

        # build one alternation of whole-word patterns, escaping any lingering
        # punctuation in the nouns first
        # http://stackoverflow.com/a/4202559/564709
        pattern = None
        if names:
            pattern = re.compile(
                '|'.join(r'\b' + re.escape(name) + r'\b' for name in names)
            )
        self.filth_cls.regex = pattern
        return super(NameDetector, self).iter_filth(text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import phonenumbers | ||
|
||
from .base import Detector | ||
from ..filth import PhoneFilth | ||
|
||
|
||
class PhoneDetector(Detector):
    """Remove phone numbers from dirty dirty ``text`` using
    `python-phonenumbers
    <https://github.com/daviddrysdale/python-phonenumbers>`_, a port of a
    Google project to correctly format phone numbers in text.

    ``region`` is the best guess region to start with (default: ``"US"``).
    Set it to ``None`` to only consider numbers with a leading ``+``.
    """
    region = 'US'

    def iter_filth(self, text):
        """Yield a ``PhoneFilth`` for every phone number matched in ``text``,
        carrying the match's start, end and raw string.
        """
        matcher = phonenumbers.PhoneNumberMatcher(text, self.region)
        for found in matcher:
            yield PhoneFilth(
                beg=found.start,
                end=found.end,
                text=found.raw_string,
            )
Oops, something went wrong.