Skip to content

Commit

Permalink
Analyzer container size reduction + faster builds (#252)
Browse files Browse the repository at this point in the history
* updated spacy model version + demo text

* removed lazy loading of spacy models, ignoring deprecation warnings on tests

* Changed base image from Alpine to Slim Buster to support faster pip installations; updated spaCy model version to 2.2.5

* Removed PIP cache to reduce image size

* Updated base image to use specific venv

* refined logger, fixed unit tests, addressed linting issues

* Printing model version

* verbose asserts for demo text

* Update recognizers_store_api.py, which currently throws an exception in the logger
  • Loading branch information
omri374 committed Jan 1, 2020
1 parent 80fea8f commit c89bf32
Show file tree
Hide file tree
Showing 40 changed files with 859 additions and 698 deletions.
21 changes: 13 additions & 8 deletions Dockerfile.python.deps
Original file line number Diff line number Diff line change
@@ -1,26 +1,31 @@
FROM python:3.7.1-alpine3.8
FROM python:3.7-slim

ARG re2_version="2018-12-01"
ARG NAME=presidio-analyzer

ENV PIP_NO_CACHE_DIR true

COPY ./${NAME}/Pipfile /usr/bin/${NAME}/Pipfile
COPY ./${NAME}/Pipfile.lock /usr/bin/${NAME}/Pipfile.lock

WORKDIR /usr/bin/${NAME}

RUN apk --update add --no-cache g++ && \
apk --update add --no-cache --virtual build_deps make tar wget clang && \
RUN apt-get update -qq \
&& DEBIAN_FRONTEND=noninteractive apt-get -y install --no-install-recommends \
wget build-essential && \
wget -O re2.tar.gz https://github.com/google/re2/archive/${re2_version}.tar.gz && \
mkdir re2 && tar --extract --file "re2.tar.gz" --directory "re2" --strip-components 1 && \
cd re2 && make install && cd .. && rm -rf re2 && rm re2.tar.gz && \
apk add --virtual build_deps make automake gcc g++ subversion python3-dev
apt-get clean autoclean && apt-get autoremove --yes && rm -rf /var/lib/{apt,dpkg,cache,log}/


# Making sure we have pipenv
RUN pip3 install pipenv
RUN pip install pipenv
# Updating setuptools
RUN pip3 install --upgrade setuptools
RUN pip install --upgrade setuptools
# Installing specified packages from Pipfile.lock
RUN pipenv sync
RUN bash -c 'PIPENV_VENV_IN_PROJECT=1 pipenv sync'

# Print to screen the installed packages for easy debugging
RUN pipenv run pip freeze

RUN apk del build_deps
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ $(BINS): vendor

.PHONY: docker-build-deps
docker-build-deps:
-docker pull $(DOCKER_REGISTRY)/$(GOLANG_DEPS):$(PRESIDIO_DEPS_LABEL) ||:
-docker pull $(DOCKER_REGISTRY)/$(PYTHON_DEPS):$(PRESIDIO_DEPS_LABEL) ||:
-docker pull $(DOCKER_REGISTRY)/$(GOLANG_DEPS):$(PRESIDIO_DEPS_LABEL) || echo "\nCould not pull base Go image from registry, building locally. If you planned to build locally, the previous error message could be ignored\n"
-docker pull $(DOCKER_REGISTRY)/$(PYTHON_DEPS):$(PRESIDIO_DEPS_LABEL) || echo "\nCould not pull base Python image from registry, building locally (If you planned to build images locally, the previous error message could be ignored\n"
docker build -t $(DOCKER_REGISTRY)/$(GOLANG_DEPS):$(PRESIDIO_DEPS_LABEL) -f Dockerfile.golang.deps .
docker build -t $(DOCKER_REGISTRY)/$(PYTHON_DEPS):$(PRESIDIO_DEPS_LABEL) -f Dockerfile.python.deps .

Expand Down Expand Up @@ -200,4 +200,4 @@ ifndef HAS_GOMETALINTER
endif

.PHONY: bootstrap
bootstrap: vendor
bootstrap: vendor
17 changes: 11 additions & 6 deletions presidio-analyzer/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,23 +1,28 @@
ARG REGISTRY=presidio.azurecr.io
ARG PRESIDIO_DEPS_LABEL=latest
ARG PRESIDIO_DEPS_LABEL=latest

FROM ${REGISTRY}/presidio-python-deps:${PRESIDIO_DEPS_LABEL}
FROM ${REGISTRY}/presidio-python-deps:${PRESIDIO_DEPS_LABEL}

ARG NAME=presidio-analyzer
WORKDIR /usr/bin/${NAME}
ADD ./${NAME} /usr/bin/${NAME}

RUN pipenv install --dev --sequential && \
pipenv run pylint analyzer && \
# Print venv information
RUN pipenv --venv
RUN pipenv run pip freeze

RUN pipenv install pylint==2.3.1 flake8 pytest --skip-lock

RUN pipenv run pylint analyzer && \
pipenv run flake8 analyzer --exclude "*pb2*.py" && \
pipenv run pytest --log-cli-level=0

#----------------------------

FROM ${REGISTRY}/presidio-python-deps:${PRESIDIO_DEPS_LABEL}
FROM ${REGISTRY}/presidio-python-deps:${PRESIDIO_DEPS_LABEL}

ARG NAME=presidio-analyzer
ADD ./${NAME}/analyzer /usr/bin/${NAME}/analyzer
WORKDIR /usr/bin/${NAME}/analyzer

CMD pipenv run python __main__.py serve --env-grpc-port
CMD pipenv run python __main__.py serve --env-grpc-port
4 changes: 2 additions & 2 deletions presidio-analyzer/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ name = "pypi"

[packages]
cython = "*"
spacy = "*"
en_core_web_lg = {file = "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz"}
spacy = "==2.2.3"
en_core_web_lg = {file = "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz"}
regex = "*"
pyre2 = {file = "https://github.com/torosent/pyre2/archive/release/0.2.23.zip"}
grpcio = "*"
Expand Down
800 changes: 438 additions & 362 deletions presidio-analyzer/Pipfile.lock

Large diffs are not rendered by default.

34 changes: 19 additions & 15 deletions presidio-analyzer/analyzer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
import os
import sys
import os # noqa
import sys # noqa

# pylint: disable=unused-import,wrong-import-position
# bug #602: Fix imports issue in python
sys.path.append(os.path.dirname(os.path.dirname(
os.path.abspath(__file__))) + "/analyzer")
os.path.abspath(__file__))) + "/analyzer") # noqa

from analyzer.analysis_explanation import AnalysisExplanation # noqa
from analyzer.pattern import Pattern # noqa: F401
from analyzer.entity_recognizer import EntityRecognizer # noqa: F401
from analyzer.local_recognizer import LocalRecognizer # noqa: F401
from analyzer.recognizer_result import RecognizerResult # noqa: F401
from analyzer.pattern_recognizer import PatternRecognizer # noqa: F401
from analyzer.remote_recognizer import RemoteRecognizer # noqa: F401
from analyzer.recognizer_registry.recognizer_registry import ( # noqa: F401
RecognizerRegistry
)
from analyzer.analyzer_engine import AnalyzerEngine # noqa
from analyzer.presidio_logger import PresidioLogger
from analyzer.analysis_explanation import AnalysisExplanation
from analyzer.pattern import Pattern
from analyzer.entity_recognizer import EntityRecognizer
from analyzer.local_recognizer import LocalRecognizer
from analyzer.recognizer_result import RecognizerResult
from analyzer.pattern_recognizer import PatternRecognizer
from analyzer.remote_recognizer import RemoteRecognizer
from analyzer.recognizer_registry.recognizer_registry import RecognizerRegistry
from analyzer.analyzer_engine import AnalyzerEngine


__all__ = ['PresidioLogger', 'AnalysisExplanation', 'Pattern',
'EntityRecognizer', 'LocalRecognizer', 'RecognizerResult',
'PatternRecognizer', 'RemoteRecognizer', 'RecognizerRegistry',
'AnalyzerEngine']
30 changes: 21 additions & 9 deletions presidio-analyzer/analyzer/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,24 @@
import analyze_pb2
import analyze_pb2_grpc
from concurrent import futures
import time
from os import sys, path
import os
import sys
import time
from google.protobuf.json_format import MessageToJson
from knack import CLI
from knack.arguments import ArgumentsContext
from knack.commands import CLICommandsLoader, CommandGroup
from knack.help import CLIHelp
from knack.help_files import helps

# bug #602: Fix imports issue in python
sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from analyzer_engine import AnalyzerEngine # noqa
from recognizer_registry.recognizer_registry import RecognizerRegistry # noqa
from nlp_engine.spacy_nlp_engine import SpacyNlpEngine # noqa
from presidio_logger import PresidioLogger # noqa

logging.getLogger().setLevel("INFO")

WELCOME_MESSAGE = r"""
Expand Down Expand Up @@ -47,9 +49,7 @@
license is AC432223" --fields "PERSON" "US_DRIVER_LICENSE"
"""

loglevel = os.environ.get("LOG_LEVEL", "INFO")
logging.basicConfig(
format='%(asctime)s:%(levelname)s:%(message)s', level=loglevel)
logger = PresidioLogger()


class PresidioCLIHelp(CLIHelp):
Expand All @@ -63,24 +63,36 @@ def __init__(self, cli_ctx=None):
def serve_command_handler(enable_trace_pii,
env_grpc_port=False,
grpc_port=3000):

logger.info("Starting GRPC server")
server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
logger.info("GRPC started")

logger.info("Creating RecognizerRegistry")
registry = RecognizerRegistry()
logger.info("RecognizerRegistry created")
logger.info("Creating SpacyNlpEngine")
nlp_engine = SpacyNlpEngine()
logger.info("SpacyNlpEngine created")

analyze_pb2_grpc.add_AnalyzeServiceServicer_to_server(
AnalyzerEngine(registry=registry,
nlp_engine=nlp_engine,
enable_trace_pii=enable_trace_pii),
server)

logger.info("Added AnalyzeServiceServicer to server")

if env_grpc_port:
logger.info("Getting port {}".format(env_grpc_port))
port = os.environ.get('GRPC_PORT')
if port is not None or port != '':
grpc_port = int(port)
else:
logger.info("env_grpc_port not provided. "
"Using grpc_port {}".format(grpc_port))

server.add_insecure_port('[::]:' + str(grpc_port))
logging.info("Starting GRPC listener at port %d", grpc_port)
logger.info("Starting GRPC listener at port {}".format(grpc_port))
server.start()
try:
while True:
Expand Down
16 changes: 10 additions & 6 deletions presidio-analyzer/analyzer/analyzer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
import analyze_pb2_grpc
import common_pb2

from analyzer.logger import Logger
from analyzer import PresidioLogger
from analyzer.app_tracer import AppTracer

DEFAULT_LANGUAGE = "en"
logger = Logger()
logger = PresidioLogger("presidio")


class AnalyzerEngine(analyze_pb2_grpc.AnalyzeServiceServicer):
Expand All @@ -31,9 +31,13 @@ def __init__(self, registry=None, nlp_engine=None,
for detected entities to be returned
"""
if not nlp_engine:
logger.info("nlp_engine not provided. Creating new "
"SpacyNlpEngine instance")
from analyzer.nlp_engine import SpacyNlpEngine
nlp_engine = SpacyNlpEngine()
if not registry:
logger.info("Recognizer registry not provided. "
"Creating default RecognizerRegistry instance")
from analyzer import RecognizerRegistry
registry = RecognizerRegistry()
if not app_tracer:
Expand Down Expand Up @@ -98,7 +102,7 @@ def Apply(self, request, context):
def __remove_duplicates(results):
"""
Removes each result which has a span contained in a
result's span with ahigher score
result's span with a higher score
:param results: List[RecognizerResult]
:return: List[RecognizerResult]
"""
Expand All @@ -117,9 +121,9 @@ def __remove_duplicates(results):
for filtered in filtered_results:
# If result is equal to or substring of
# one of the other results
if result.start >= filtered.start \
and result.end <= filtered.end \
and result.entity_type == filtered.entity_type:

if result.contained_in(filtered) and \
result.entity_type == filtered.entity_type:
valid_result = False
break

Expand Down
4 changes: 2 additions & 2 deletions presidio-analyzer/analyzer/app_tracer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from analyzer.logger import Logger
from analyzer import PresidioLogger


class AppTracer:
Expand All @@ -8,7 +8,7 @@ class AppTracer:
This can be useful for analyzing the detection accuracy of the system."""
def __init__(self, enabled=True):

self.logger = Logger('Interpretability')
self.logger = PresidioLogger('Interpretability')
self.logger.set_level("INFO")
self.enabled = enabled

Expand Down
6 changes: 3 additions & 3 deletions presidio-analyzer/analyzer/entity_recognizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from abc import abstractmethod
import copy
from abc import abstractmethod

from analyzer.logger import Logger
from analyzer import PresidioLogger


class EntityRecognizer:
Expand Down Expand Up @@ -36,7 +36,7 @@ def __init__(self, supported_entities, name=None, supported_language="en",
self.version = version
self.is_loaded = False

self.logger = Logger()
self.logger = PresidioLogger()
self.load()
self.logger.info("Loaded recognizer: %s", self.name)
self.is_loaded = True
Expand Down
13 changes: 7 additions & 6 deletions presidio-analyzer/analyzer/nlp_engine/spacy_nlp_engine.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import spacy
from spacy.cli import download

from analyzer.logger import Logger
from analyzer import PresidioLogger
from analyzer.nlp_engine import NlpArtifacts, NlpEngine
logger = Logger()

logger = PresidioLogger()


class SpacyNlpEngine(NlpEngine):
Expand All @@ -14,13 +14,14 @@ class SpacyNlpEngine(NlpEngine):
"""

def __init__(self):
logger.info("Loading NLP model...")
logger.info("Loading NLP model: spaCy en_core_web_lg")

# Download model lazily if it wasn't previously installed
download('en_core_web_lg')
self.nlp = {"en": spacy.load("en_core_web_lg",
disable=['parser', 'tagger'])}

logger.info("Printing spaCy model and package details:"
"\n\n {}\n\n".format(spacy.info("en_core_web_lg")))

def process_text(self, text, language):
""" Execute the SpaCy NLP pipeline on the given text
and language
Expand Down
44 changes: 29 additions & 15 deletions presidio-analyzer/analyzer/predefined_recognizers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,29 @@
# pylint: disable=unused-import
from .credit_card_recognizer import CreditCardRecognizer # noqa: F401
from .spacy_recognizer import SpacyRecognizer # noqa: F401
from .crypto_recognizer import CryptoRecognizer # noqa: F401
from .domain_recognizer import DomainRecognizer # noqa: F401
from .email_recognizer import EmailRecognizer # noqa: F401
from .iban_recognizer import IbanRecognizer # noqa: F401
from .ip_recognizer import IpRecognizer # noqa: F401
from .uk_nhs_recognizer import NhsRecognizer # noqa: F401
from .us_bank_recognizer import UsBankRecognizer # noqa: F401
from .us_driver_license_recognizer import UsLicenseRecognizer # noqa: F401
from .us_itin_recognizer import UsItinRecognizer # noqa: F401
from .us_passport_recognizer import UsPassportRecognizer # noqa: F401
from .us_phone_recognizer import UsPhoneRecognizer # noqa: F401
from .us_ssn_recognizer import UsSsnRecognizer # noqa: F401
from .credit_card_recognizer import CreditCardRecognizer
from .crypto_recognizer import CryptoRecognizer
from .domain_recognizer import DomainRecognizer
from .email_recognizer import EmailRecognizer
from .iban_recognizer import IbanRecognizer
from .ip_recognizer import IpRecognizer
from .spacy_recognizer import SpacyRecognizer
from .uk_nhs_recognizer import NhsRecognizer
from .us_bank_recognizer import UsBankRecognizer
from .us_driver_license_recognizer import UsLicenseRecognizer
from .us_itin_recognizer import UsItinRecognizer
from .us_passport_recognizer import UsPassportRecognizer
from .us_phone_recognizer import UsPhoneRecognizer
from .us_ssn_recognizer import UsSsnRecognizer

__all__ = ["CreditCardRecognizer",
"CryptoRecognizer",
"DomainRecognizer",
"EmailRecognizer",
"IbanRecognizer",
"IpRecognizer",
"SpacyRecognizer",
"NhsRecognizer",
"UsBankRecognizer",
"UsLicenseRecognizer",
"UsItinRecognizer",
"UsPassportRecognizer",
"UsPhoneRecognizer",
"UsSsnRecognizer"]
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import tldextract

from analyzer import Pattern
from analyzer import PatternRecognizer
from analyzer import Pattern, PatternRecognizer

# pylint: disable=line-too-long
REGEX = r'\b(((([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,86}[a-zA-Z0-9]))\.(([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,73}[a-zA-Z0-9]))\.(([a-zA-Z0-9]{2,12}\.[a-zA-Z0-9]{2,12})|([a-zA-Z0-9]{2,25})))|((([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,162}[a-zA-Z0-9]))\.(([a-zA-Z0-9]{2,12}\.[a-zA-Z0-9]{2,12})|([a-zA-Z0-9]{2,25}))))\b' # noqa: E501' # noqa: E501
Expand Down
Loading

0 comments on commit c89bf32

Please sign in to comment.