-
Notifications
You must be signed in to change notification settings - Fork 559
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Analyzer container size reduction + faster builds (#252)
* updated spacy model version + demo text
* removed lazy loading of spacy models, ignoring deprecation warnings on tests
* Changed base image from Alpine to Slim Buster to support faster pip installations, updated Spacy model version to 2.2.5
* Removed PIP cache to reduce image size
* Updated base image to use specific venv
* refined logger, fixed unit tests, addressed linting issues
* Printing model version
* verbose asserts for demo text
* Update recognizers_store_api.py which currently throws an exception of type in logger
- Loading branch information
Showing 40 changed files with 859 additions and 698 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,26 +1,31 @@ | ||
FROM python:3.7.1-alpine3.8 | ||
FROM python:3.7-slim | ||
|
||
ARG re2_version="2018-12-01" | ||
ARG NAME=presidio-analyzer | ||
|
||
ENV PIP_NO_CACHE_DIR true | ||
|
||
COPY ./${NAME}/Pipfile /usr/bin/${NAME}/Pipfile | ||
COPY ./${NAME}/Pipfile.lock /usr/bin/${NAME}/Pipfile.lock | ||
|
||
WORKDIR /usr/bin/${NAME} | ||
|
||
RUN apk --update add --no-cache g++ && \ | ||
apk --update add --no-cache --virtual build_deps make tar wget clang && \ | ||
RUN apt-get update -qq \ | ||
&& DEBIAN_FRONTEND=noninteractive apt-get -y install --no-install-recommends \ | ||
wget build-essential && \ | ||
wget -O re2.tar.gz https://github.com/google/re2/archive/${re2_version}.tar.gz && \ | ||
mkdir re2 && tar --extract --file "re2.tar.gz" --directory "re2" --strip-components 1 && \ | ||
cd re2 && make install && cd .. && rm -rf re2 && rm re2.tar.gz && \ | ||
apk add --virtual build_deps make automake gcc g++ subversion python3-dev | ||
apt-get clean autoclean && apt-get autoremove --yes && rm -rf /var/lib/{apt,dpkg,cache,log}/ | ||
|
||
|
||
# Making sure we have pipenv | ||
RUN pip3 install pipenv | ||
RUN pip install pipenv | ||
# Updating setuptools | ||
RUN pip3 install --upgrade setuptools | ||
RUN pip install --upgrade setuptools | ||
# Installing specified packages from Pipfile.lock | ||
RUN pipenv sync | ||
RUN bash -c 'PIPENV_VENV_IN_PROJECT=1 pipenv sync' | ||
|
||
# Print to screen the installed packages for easy debugging | ||
RUN pipenv run pip freeze | ||
|
||
RUN apk del build_deps |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,23 +1,28 @@ | ||
ARG REGISTRY=presidio.azurecr.io | ||
ARG PRESIDIO_DEPS_LABEL=latest | ||
ARG PRESIDIO_DEPS_LABEL=latest | ||
|
||
FROM ${REGISTRY}/presidio-python-deps:${PRESIDIO_DEPS_LABEL} | ||
FROM ${REGISTRY}/presidio-python-deps:${PRESIDIO_DEPS_LABEL} | ||
|
||
ARG NAME=presidio-analyzer | ||
WORKDIR /usr/bin/${NAME} | ||
ADD ./${NAME} /usr/bin/${NAME} | ||
|
||
RUN pipenv install --dev --sequential && \ | ||
pipenv run pylint analyzer && \ | ||
# Print venv information | ||
RUN pipenv --venv | ||
RUN pipenv run pip freeze | ||
|
||
RUN pipenv install pylint==2.3.1 flake8 pytest --skip-lock | ||
|
||
RUN pipenv run pylint analyzer && \ | ||
pipenv run flake8 analyzer --exclude "*pb2*.py" && \ | ||
pipenv run pytest --log-cli-level=0 | ||
|
||
#---------------------------- | ||
|
||
FROM ${REGISTRY}/presidio-python-deps:${PRESIDIO_DEPS_LABEL} | ||
FROM ${REGISTRY}/presidio-python-deps:${PRESIDIO_DEPS_LABEL} | ||
|
||
ARG NAME=presidio-analyzer | ||
ADD ./${NAME}/analyzer /usr/bin/${NAME}/analyzer | ||
WORKDIR /usr/bin/${NAME}/analyzer | ||
|
||
CMD pipenv run python __main__.py serve --env-grpc-port | ||
CMD pipenv run python __main__.py serve --env-grpc-port |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,23 @@ | ||
import os | ||
import sys | ||
import os # noqa | ||
import sys # noqa | ||
|
||
# pylint: disable=unused-import,wrong-import-position | ||
# bug #602: Fix imports issue in python | ||
sys.path.append(os.path.dirname(os.path.dirname( | ||
os.path.abspath(__file__))) + "/analyzer") | ||
os.path.abspath(__file__))) + "/analyzer") # noqa | ||
|
||
from analyzer.analysis_explanation import AnalysisExplanation # noqa | ||
from analyzer.pattern import Pattern # noqa: F401 | ||
from analyzer.entity_recognizer import EntityRecognizer # noqa: F401 | ||
from analyzer.local_recognizer import LocalRecognizer # noqa: F401 | ||
from analyzer.recognizer_result import RecognizerResult # noqa: F401 | ||
from analyzer.pattern_recognizer import PatternRecognizer # noqa: F401 | ||
from analyzer.remote_recognizer import RemoteRecognizer # noqa: F401 | ||
from analyzer.recognizer_registry.recognizer_registry import ( # noqa: F401 | ||
RecognizerRegistry | ||
) | ||
from analyzer.analyzer_engine import AnalyzerEngine # noqa | ||
from analyzer.presidio_logger import PresidioLogger | ||
from analyzer.analysis_explanation import AnalysisExplanation | ||
from analyzer.pattern import Pattern | ||
from analyzer.entity_recognizer import EntityRecognizer | ||
from analyzer.local_recognizer import LocalRecognizer | ||
from analyzer.recognizer_result import RecognizerResult | ||
from analyzer.pattern_recognizer import PatternRecognizer | ||
from analyzer.remote_recognizer import RemoteRecognizer | ||
from analyzer.recognizer_registry.recognizer_registry import RecognizerRegistry | ||
from analyzer.analyzer_engine import AnalyzerEngine | ||
|
||
|
||
__all__ = ['PresidioLogger', 'AnalysisExplanation', 'Pattern', | ||
'EntityRecognizer', 'LocalRecognizer', 'RecognizerResult', | ||
'PatternRecognizer', 'RemoteRecognizer', 'RecognizerRegistry', | ||
'AnalyzerEngine'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
44 changes: 29 additions & 15 deletions
44
presidio-analyzer/analyzer/predefined_recognizers/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,29 @@ | ||
# pylint: disable=unused-import | ||
from .credit_card_recognizer import CreditCardRecognizer # noqa: F401 | ||
from .spacy_recognizer import SpacyRecognizer # noqa: F401 | ||
from .crypto_recognizer import CryptoRecognizer # noqa: F401 | ||
from .domain_recognizer import DomainRecognizer # noqa: F401 | ||
from .email_recognizer import EmailRecognizer # noqa: F401 | ||
from .iban_recognizer import IbanRecognizer # noqa: F401 | ||
from .ip_recognizer import IpRecognizer # noqa: F401 | ||
from .uk_nhs_recognizer import NhsRecognizer # noqa: F401 | ||
from .us_bank_recognizer import UsBankRecognizer # noqa: F401 | ||
from .us_driver_license_recognizer import UsLicenseRecognizer # noqa: F401 | ||
from .us_itin_recognizer import UsItinRecognizer # noqa: F401 | ||
from .us_passport_recognizer import UsPassportRecognizer # noqa: F401 | ||
from .us_phone_recognizer import UsPhoneRecognizer # noqa: F401 | ||
from .us_ssn_recognizer import UsSsnRecognizer # noqa: F401 | ||
from .credit_card_recognizer import CreditCardRecognizer | ||
from .crypto_recognizer import CryptoRecognizer | ||
from .domain_recognizer import DomainRecognizer | ||
from .email_recognizer import EmailRecognizer | ||
from .iban_recognizer import IbanRecognizer | ||
from .ip_recognizer import IpRecognizer | ||
from .spacy_recognizer import SpacyRecognizer | ||
from .uk_nhs_recognizer import NhsRecognizer | ||
from .us_bank_recognizer import UsBankRecognizer | ||
from .us_driver_license_recognizer import UsLicenseRecognizer | ||
from .us_itin_recognizer import UsItinRecognizer | ||
from .us_passport_recognizer import UsPassportRecognizer | ||
from .us_phone_recognizer import UsPhoneRecognizer | ||
from .us_ssn_recognizer import UsSsnRecognizer | ||
|
||
__all__ = ["CreditCardRecognizer", | ||
"CryptoRecognizer", | ||
"DomainRecognizer", | ||
"EmailRecognizer", | ||
"IbanRecognizer", | ||
"IpRecognizer", | ||
"SpacyRecognizer", | ||
"NhsRecognizer", | ||
"UsBankRecognizer", | ||
"UsLicenseRecognizer", | ||
"UsItinRecognizer", | ||
"UsPassportRecognizer", | ||
"UsPhoneRecognizer", | ||
"UsSsnRecognizer"] |
3 changes: 1 addition & 2 deletions
3
presidio-analyzer/analyzer/predefined_recognizers/domain_recognizer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.