Skip to content

Commit

Permalink
Loading analyzer engine & recognizer registry from configuration file (
Browse files Browse the repository at this point in the history
  • Loading branch information
roeybc authored May 1, 2024
1 parent 55bfb8f commit 2805c86
Show file tree
Hide file tree
Showing 28 changed files with 1,234 additions and 60 deletions.
20 changes: 18 additions & 2 deletions presidio-analyzer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,32 @@ FROM python:3.9-slim

ARG NAME
ARG NLP_CONF_FILE=presidio_analyzer/conf/default.yaml
ARG ANALYZER_CONF_FILE=presidio_analyzer/conf/default_analyzer.yaml
ARG RECOGNIZER_REGISTRY_CONF_FILE=presidio_analyzer/conf/default_recognizers.yaml
ENV PIPENV_VENV_IN_PROJECT=1
ENV PIP_NO_CACHE_DIR=1

ENV ANALYZER_CONF_FILE=${ANALYZER_CONF_FILE}
ENV RECOGNIZER_REGISTRY_CONF_FILE=${RECOGNIZER_REGISTRY_CONF_FILE}
ENV NLP_CONF_FILE=${NLP_CONF_FILE}

COPY ${ANALYZER_CONF_FILE} /usr/bin/${NAME}/${ANALYZER_CONF_FILE}
COPY ${RECOGNIZER_REGISTRY_CONF_FILE} /usr/bin/${NAME}/${RECOGNIZER_REGISTRY_CONF_FILE}
COPY ${NLP_CONF_FILE} /usr/bin/${NAME}/${NLP_CONF_FILE}

WORKDIR /usr/bin/${NAME}

COPY ./Pipfile* /usr/bin/${NAME}/

# Install essential build tools
RUN apt-get update \
&& apt-get install -y build-essential

RUN pip install pipenv \
&& pipenv install --deploy
# install nlp models specified in conf/default.yaml
# install nlp models specified in NLP_CONF_FILE
COPY ./install_nlp_models.py /usr/bin/${NAME}/
COPY ${NLP_CONF_FILE} /usr/bin/${NAME}/${NLP_CONF_FILE}

RUN pipenv run python install_nlp_models.py --conf_file ${NLP_CONF_FILE}

COPY . /usr/bin/${NAME}/
Expand Down
13 changes: 11 additions & 2 deletions presidio-analyzer/Dockerfile.transformers
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,27 @@ FROM python:3.9-slim

ARG NAME
ARG NLP_CONF_FILE=presidio_analyzer/conf/transformers.yaml
ARG ANALYZER_CONF_FILE=presidio_analyzer/conf/default_analyzer.yaml
ARG RECOGNIZER_REGISTRY_CONF_FILE=presidio_analyzer/conf/default_recognizers.yaml
ENV PIPENV_VENV_IN_PROJECT=1
ENV PIP_NO_CACHE_DIR=1
WORKDIR /usr/bin/${NAME}

ENV ANALYZER_CONF_FILE=${ANALYZER_CONF_FILE}
ENV RECOGNIZER_REGISTRY_CONF_FILE=${RECOGNIZER_REGISTRY_CONF_FILE}
ENV NLP_CONF_FILE=${NLP_CONF_FILE}

COPY ${ANALYZER_CONF_FILE} /usr/bin/${NAME}/${ANALYZER_CONF_FILE}
COPY ${RECOGNIZER_REGISTRY_CONF_FILE} /usr/bin/${NAME}/${RECOGNIZER_REGISTRY_CONF_FILE}
COPY ${NLP_CONF_FILE} /usr/bin/${NAME}/${NLP_CONF_FILE}

COPY ./Pipfile* /usr/bin/${NAME}/
RUN pip install pipenv \
&& pipenv install --deploy
RUN pipenv install torch transformers huggingface_hub --skip-lock

# install nlp models specified in conf/default.yaml
# install nlp models specified in NLP_CONF_FILE
COPY ./install_nlp_models.py /usr/bin/${NAME}/
COPY ${NLP_CONF_FILE} /usr/bin/${NAME}/${NLP_CONF_FILE}

RUN pipenv run python install_nlp_models.py --conf_file ${NLP_CONF_FILE}

Expand Down
12 changes: 11 additions & 1 deletion presidio-analyzer/Dockerfile.windows
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
FROM python:3.9-windowsservercore

ARG NLP_CONF_FILE=presidio_analyzer/conf/default.yaml
ARG ANALYZER_CONF_FILE=presidio_analyzer/conf/default_analyzer.yaml
ARG RECOGNIZER_REGISTRY_CONF_FILE=presidio_analyzer/conf/default_recognizers.yaml
ENV PIPENV_VENV_IN_PROJECT=1
ENV PIP_NO_CACHE_DIR=1
WORKDIR /app

ENV ANALYZER_CONF_FILE=${ANALYZER_CONF_FILE}
ENV RECOGNIZER_REGISTRY_CONF_FILE=${RECOGNIZER_REGISTRY_CONF_FILE}
ENV NLP_CONF_FILE=${NLP_CONF_FILE}

# Copy the configuration files to the same relative paths under WORKDIR (/app),
# so the relative paths exported through the ENV variables above resolve at
# runtime. NAME is not declared as a build ARG in this Dockerfile, so a
# /usr/bin/${NAME}/ destination would expand with an empty path segment and
# place the files outside the working directory.
COPY ${ANALYZER_CONF_FILE} ${ANALYZER_CONF_FILE}
COPY ${RECOGNIZER_REGISTRY_CONF_FILE} ${RECOGNIZER_REGISTRY_CONF_FILE}
COPY ${NLP_CONF_FILE} ${NLP_CONF_FILE}

ADD https://aka.ms/vs/16/release/vc_redist.x64.exe .
RUN ./vc_redist.x64.exe /quiet /install

COPY ./Pipfile* .
RUN pip install --upgrade pip
RUN pip install pipenv; pipenv install --deploy

# install nlp models specified in conf/default.yaml
# install nlp models specified in NLP_CONF_FILE
COPY ./install_nlp_models.py .
COPY ${NLP_CONF_FILE} ${NLP_CONF_FILE}
RUN pipenv run python install_nlp_models.py --conf_file $Env:NLP_CONF_FILE
Expand Down
14 changes: 11 additions & 3 deletions presidio-analyzer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
from flask import Flask, request, jsonify, Response
from werkzeug.exceptions import HTTPException

from presidio_analyzer.analyzer_engine import AnalyzerEngine
from presidio_analyzer.analyzer_request import AnalyzerRequest
from presidio_analyzer import AnalyzerEngine, AnalyzerEngineProvider, AnalyzerRequest

DEFAULT_PORT = "3000"

Expand All @@ -36,8 +35,17 @@ def __init__(self):
self.logger = logging.getLogger("presidio-analyzer")
self.logger.setLevel(os.environ.get("LOG_LEVEL", self.logger.level))
self.app = Flask(__name__)

analyzer_conf_file = os.environ.get("ANALYZER_CONF_FILE")
nlp_engine_conf_file = os.environ.get("NLP_CONF_FILE")
recognizer_registry_conf_file = os.environ.get("RECOGNIZER_REGISTRY_CONF_FILE")

self.logger.info("Starting analyzer engine")
self.engine = AnalyzerEngine()
self.engine: AnalyzerEngine = AnalyzerEngineProvider(
analyzer_engine_conf_file=analyzer_conf_file,
nlp_engine_conf_file=nlp_engine_conf_file,
recognizer_registry_conf_file=recognizer_registry_conf_file,
).create_engine()
self.logger.info(WELCOME_MESSAGE)

@self.app.route("/health")
Expand Down
2 changes: 2 additions & 0 deletions presidio-analyzer/presidio_analyzer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from presidio_analyzer.remote_recognizer import RemoteRecognizer
from presidio_analyzer.recognizer_registry import RecognizerRegistry
from presidio_analyzer.analyzer_engine import AnalyzerEngine
from presidio_analyzer.analyzer_engine_provider import AnalyzerEngineProvider
from presidio_analyzer.batch_analyzer_engine import BatchAnalyzerEngine
from presidio_analyzer.analyzer_request import AnalyzerRequest
from presidio_analyzer.context_aware_enhancers import ContextAwareEnhancer
Expand Down Expand Up @@ -45,6 +46,7 @@
"RemoteRecognizer",
"RecognizerRegistry",
"AnalyzerEngine",
"AnalyzerEngineProvider",
"AnalyzerRequest",
"ContextAwareEnhancer",
"LemmaContextAwareEnhancer",
Expand Down
21 changes: 15 additions & 6 deletions presidio-analyzer/presidio_analyzer/analyzer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,20 @@
import regex as re

from presidio_analyzer import (
RecognizerRegistry,
RecognizerResult,
EntityRecognizer,
)

from presidio_analyzer.app_tracer import AppTracer
from presidio_analyzer.context_aware_enhancers import (
ContextAwareEnhancer,
LemmaContextAwareEnhancer,
)
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider, NlpArtifacts
from presidio_analyzer.recognizer_registry import (
RecognizerRegistry,
RecognizerRegistryProvider,
)

logger = logging.getLogger("presidio-analyzer")

Expand Down Expand Up @@ -58,9 +62,6 @@ def __init__(
provider = NlpEngineProvider()
nlp_engine = provider.create_engine()

if not registry:
logger.info("registry not provided, creating default.")
registry = RecognizerRegistry()
if not app_tracer:
app_tracer = AppTracer()
self.app_tracer = app_tracer
Expand All @@ -71,14 +72,22 @@ def __init__(
if not self.nlp_engine.is_loaded():
self.nlp_engine.load()

self.registry = registry
if not registry:
logger.info("registry not provided, creating default.")
provider = RecognizerRegistryProvider(
registry_configuration={"supported_languages": self.supported_languages}
)
registry = provider.create_recognizer_registry()
registry.add_nlp_recognizer(nlp_engine=self.nlp_engine)

# load all recognizers
# added to support the previous interface
if not registry.recognizers:
registry.load_predefined_recognizers(
nlp_engine=self.nlp_engine, languages=self.supported_languages
)

self.registry = registry

self.log_decision_process = log_decision_process
self.default_score_threshold = default_score_threshold

Expand Down
144 changes: 144 additions & 0 deletions presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import yaml
import logging
from pathlib import Path
from typing import Optional, Union, List, Dict, Any

from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpEngine
from presidio_analyzer.recognizer_registry import RecognizerRegistryProvider

logger = logging.getLogger("presidio-analyzer")


class AnalyzerEngineProvider:
    """
    Utility class for loading a Presidio AnalyzerEngine from configuration.

    Use this class to build an analyzer engine from YAML files. The analyzer
    configuration is read once, at construction time; the NLP engine and the
    recognizer registry are built lazily when ``create_engine`` is called.

    :param analyzer_engine_conf_file: the path to the analyzer configuration file.
        When omitted, the packaged ``conf/default_analyzer.yaml`` is used.
    :param nlp_engine_conf_file: the path to the nlp engine configuration file.
        Takes precedence over an ``nlp_configuration`` section in the
        analyzer configuration.
    :param recognizer_registry_conf_file: the path to the recognizer
        registry configuration file. Takes precedence over a
        ``recognizer_registry`` section in the analyzer configuration.
    """

    def __init__(
        self,
        analyzer_engine_conf_file: Optional[Union[Path, str]] = None,
        nlp_engine_conf_file: Optional[Union[Path, str]] = None,
        recognizer_registry_conf_file: Optional[Union[Path, str]] = None,
    ):
        self.configuration = self._get_configuration(
            conf_file=analyzer_engine_conf_file
        )
        self.nlp_engine_conf_file = nlp_engine_conf_file
        self.recognizer_registry_conf_file = recognizer_registry_conf_file

    @staticmethod
    def _read_yaml(conf_file: Union[Path, str]) -> Dict[str, Any]:
        """Parse a YAML file, closing the file handle once it has been read."""
        with open(conf_file) as file:
            return yaml.safe_load(file)

    def _get_configuration(
        self, conf_file: Optional[Union[Path, str]]
    ) -> Dict[str, Any]:
        """
        Retrieve the analyzer engine configuration from the provided file.

        Falls back to the packaged default configuration when no file is
        given, when the file cannot be opened, or when it cannot be parsed.

        :param conf_file: path to the analyzer configuration file, or None
        :return: the parsed configuration
        """

        if not conf_file:
            default_conf_file = self._get_full_conf_path()
            configuration = self._read_yaml(default_conf_file)
            logger.info(f"Analyzer Engine configuration file "
                        f"not provided. Using {default_conf_file}.")
            return configuration

        try:
            logger.info(f"Reading analyzer configuration from {conf_file}")
            configuration = self._read_yaml(conf_file)
        except IOError:
            logger.warning(
                f"configuration file {conf_file} not found.  "
                f"Using default config."
            )
            configuration = self._read_yaml(self._get_full_conf_path())
        except Exception:
            # Deliberate best-effort fallback: any parsing problem results in
            # the default configuration. Report it through the logger rather
            # than stdout so it reaches the same sink as the other messages.
            logger.warning(
                f"Failed to parse file {conf_file}, resorting to default"
            )
            configuration = self._read_yaml(self._get_full_conf_path())

        return configuration

    def create_engine(self) -> AnalyzerEngine:
        """
        Load Presidio Analyzer from yaml configuration file.

        ``supported_languages`` defaults to ``["en"]`` and
        ``default_score_threshold`` defaults to ``0`` when the configuration
        does not define them.

        :return: analyzer engine initialized with yaml configuration
        """

        nlp_engine = self._load_nlp_engine()
        supported_languages = self.configuration.get("supported_languages", ["en"])
        default_score_threshold = self.configuration.get("default_score_threshold", 0)

        registry = self._load_recognizer_registry(
            supported_languages=supported_languages, nlp_engine=nlp_engine
        )

        return AnalyzerEngine(
            nlp_engine=nlp_engine,
            registry=registry,
            supported_languages=supported_languages,
            default_score_threshold=default_score_threshold,
        )

    def _load_recognizer_registry(
        self,
        supported_languages: List[str],
        nlp_engine: NlpEngine,
    ) -> RecognizerRegistry:
        """
        Build the recognizer registry.

        Resolution order: the dedicated registry configuration file, then the
        ``recognizer_registry`` section of the analyzer configuration, then a
        registry configuration holding only the supported languages.

        :param supported_languages: languages passed to the registry
            configuration when no dedicated registry file is used
        :param nlp_engine: when truthy, it is registered on the registry via
            ``add_nlp_recognizer``
        :return: the recognizer registry created by the provider
        """
        if self.recognizer_registry_conf_file:
            logger.info(f"Reading recognizer registry "
                        f"configuration from {self.recognizer_registry_conf_file}")
            provider = RecognizerRegistryProvider(
                conf_file=self.recognizer_registry_conf_file
            )
        elif "recognizer_registry" in self.configuration:
            registry_configuration = self.configuration["recognizer_registry"]
            provider = RecognizerRegistryProvider(
                registry_configuration={
                    **registry_configuration,
                    "supported_languages": supported_languages,
                }
            )
        else:
            logger.warning(
                "configuration file is missing for 'recognizer_registry'. "
                "Using default configuration for recognizer registry"
            )
            # The key is known to be absent in this branch, so the registry
            # configuration consists of the supported languages only.
            provider = RecognizerRegistryProvider(
                registry_configuration={"supported_languages": supported_languages}
            )
        registry = provider.create_recognizer_registry()
        if nlp_engine:
            registry.add_nlp_recognizer(nlp_engine)
        return registry

    def _load_nlp_engine(self) -> NlpEngine:
        """
        Build the NLP engine.

        Resolution order: the dedicated NLP configuration file, then the
        ``nlp_configuration`` section of the analyzer configuration, then the
        ``NlpEngineProvider`` defaults.

        :return: the engine created by the selected provider
        """
        if self.nlp_engine_conf_file:
            logger.info(f"Reading nlp configuration from {self.nlp_engine_conf_file}")
            provider = NlpEngineProvider(conf_file=self.nlp_engine_conf_file)
        elif "nlp_configuration" in self.configuration:
            nlp_configuration = self.configuration["nlp_configuration"]
            provider = NlpEngineProvider(nlp_configuration=nlp_configuration)
        else:
            logger.warning(
                "configuration file is missing for 'nlp_configuration'. "
                "Using default configuration for nlp engine"
            )
            provider = NlpEngineProvider()

        return provider.create_engine()

    @staticmethod
    def _get_full_conf_path(
        default_conf_file: Union[Path, str] = "default_analyzer.yaml"
    ) -> Path:
        """
        Return a Path to the default conf file.

        :param default_conf_file: file name inside the ``conf`` directory that
            sits next to this module
        """
        return Path(Path(__file__).parent, "conf", default_conf_file)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
supported_languages:
- en
default_score_threshold: 0
Loading

0 comments on commit 2805c86

Please sign in to comment.