
Commit c827828

Merge pull request #20 from baloise/main
PR for new image build
2 parents: cfeffac + 9e5a481 · commit c827828

File tree: 6 files changed, +303 −49 lines


Dockerfile

Lines changed: 12 additions & 17 deletions
@@ -1,32 +1,18 @@
-FROM alpine:latest
+FROM python:3.12-bookworm
 
 # set metadata
 LABEL maintainer="culmat, trichie, robbizbal" \
       org.opencontainers.image.description="Yo-Yo-Maskr application Docker image" \
       version="0.1.0"
 
 # set poetry environment variables
-ARG POETRY_FLAGS="--without dev"
+ARG POETRY_FLAGS="--only main"
 
 # set default environment variables
 ENV OLLAMA_BASE_URL=http://localhost:11434 \
     OLLAMA_MODEL=llama3.2:latest \
     HTTPX_CLIENT_VERIFY=
 
-# install Python and dependencies
-RUN apk add --no-cache --update \
-    python3 \
-    py3-pip \
-    gcc \
-    python3-dev \
-    make \
-    bash \
-    && rm -rf ~/.cache/* /usr/local/share/man /tmp/*
-
-RUN python3 -m pip install pipx --break-system-packages \
-    && python3 -m pipx ensurepath \
-    && python3 -m pipx completions
-
 # add app src
 COPY . /app/
 
@@ -37,11 +23,20 @@ WORKDIR /app
 RUN chmod +x entrypoint.sh setup.sh
 
 # create user
-RUN adduser -Ds /bin/bash anon && chown -R anon: /app
+#RUN useradd -ms /bin/bash anon && chown -R anon: /app
+RUN useradd -ms /bin/bash -G 0 anon
+
+# set permissions - for OpenShift
+RUN chgrp -R 0 /app && chmod -R g+rwX /app
 
 # switch to user
 USER anon
 
+RUN python3 -m pip install --upgrade pip \
+    && python3 -m pip install pipx \
+    && python3 -m pipx ensurepath \
+    && python3 -m pipx completions
+
 # run app setup script
 RUN "./setup.sh"

setup.sh

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
+#!/bin/sh
+export PATH="$HOME/.local/bin:$PATH"
 pipx install poetry
 pipx ensurepath
 . ~/.bashrc

src/utils/ano_spacy.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+import spacy
+import spacy.cli
+from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+#from presidio_anonymizer import AnonymizerEngine
+from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpEngineProvider
+from presidio_analyzer.predefined_recognizers import SpacyRecognizer
+from presidio_analyzer import PatternRecognizer
+
+class Anon_Spacy:
+    def __init__(self):
+        languages = ['en','de','fr','it']
+        size = "lg"
+        gernres = {lang: "web" if lang == 'en' else "news" for lang in languages}
+        self.models = {lang: f"{lang}_core_{gernres[lang]}_{size}" for lang in languages}
+        self.models_loaded = []
+
+    def analyze_text(self, text, language='de',entities=['PERSON']):
+        if not language in self.models:
+            print(f"WARN: language '{language}' not supported. Supported languages are {self.models.keys()}.")
+        return self.get_analyzer(language,entities).analyze(text=text, language=language, entities=["PERSON"])
+
+    def get_analyzer(self,language='de',entities=['PERSON']):
+        self.ensure_model_loaded(self.models[language])
+        nlp_engine = SpacyNlpEngine(models=[{"lang_code": language, "model_name": self.models[language]}])
+        analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=[language])
+        analyzer.registry.add_recognizer(SpacyRecognizer(supported_language=language, supported_entities=entities))
+        return analyzer
+
+    def ensure_model_loaded(self,model_name):
+        if model_name in self.models_loaded:
+            print(f"Model '{model_name}' already loaded.")
+            return
+        print(f"Loading model '{model_name}'.")
+        try:
+            # Try to load the model
+            return spacy.load(model_name)
+        except OSError:
+            # If the model is not found, download it
+            print(f"Model '{model_name}' not found. Downloading...")
+            spacy.cli.download(model_name)
+            print(f"Model '{model_name}' downloaded successfully.")
+            return spacy.load(model_name)
+        finally:
+            self.models_loaded.append(model_name)
+            print(f"Model '{model_name}' loaded.")
+
+# Add custom recognizers if needed
+# Example: Adding a custom recognizer for French phone numbers
+# fr_phone_recognizer = PatternRecognizer(supported_entity="FR_PHONE_NUMBER",
+#                                         patterns=[{"name": "FR_PHONE",
+#                                                    "regex": r"(\+33|0)[1-9]\d{8}",
+#                                                    "score": 0.9}])
+# analyzer.registry.add_recognizer(fr_phone_recognizer)
+
+
+# Initialize the anonymizer engine
+#anonymizer = AnonymizerEngine()
+
+
+# def anonymize_text(text, language):
+#     return anonymizer.anonymize(text=text, analyzer_results=analyze_text(text,language))
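
Not part of the commit: a minimal usage sketch for the new Anon_Spacy class, assuming a German sample sentence and that the de_core_news_lg model can be fetched on first use.

    # hypothetical usage sketch, not included in this commit
    from src.utils.ano_spacy import Anon_Spacy

    anon = Anon_Spacy()

    # analyze_text() builds an AnalyzerEngine for the requested language on demand;
    # ensure_model_loaded() downloads the spaCy model if it is missing locally
    results = anon.analyze_text("Hans Muster wohnt in Basel.", language='de')
    for res in results:
        # each result carries the entity type, character span and confidence score
        print(res.entity_type, res.start, res.end, res.score)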

src/utils/anonymize.py

Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
+import regex as re
+import dill
+import ahocorasick
+from multiprocessing import Pool
+
+
+class Anon:
+    def __init__(self, names):
+        self.pattern = re.compile(r'\b(' + '|'.join(map(re.escape, names)) + r')\b')
+        self.automaton = ahocorasick.Automaton()
+        for index, name in enumerate(names):
+            self.automaton.add_word(name, (index, name))
+        self.automaton.make_automaton()
+        self.name_set = set(names)
+        self._init_trie()
+
+    def find_regex(self, text):
+        return [(match.group(), match.start(), match.end()) for match in self.pattern.finditer(text)]
+
+    def find_ahocorasick(self,text):
+        # occurrences = []
+        # for end_index, (idx, name) in self.automaton.iter(text):
+        #     start_index = end_index - len(name) + 1
+        #     occurrences.append((name, start_index, end_index))
+        # return occurrences
+        occurrences = []
+        for end_index, (idx, name) in self.automaton.iter(text):
+            start_index = end_index - len(name) + 1
+            # Check if the match is an entire word using word boundaries
+            if (start_index == 0 or not text[start_index - 1].isalnum()) and \
+               (end_index == len(text) - 1 or not text[end_index + 1].isalnum()):
+                occurrences.append((name, start_index, end_index))
+        return occurrences
+
+    def find_trie(self,text):
+        firstnames = list(re.finditer(self.first_trie_regex, text, overlapped=True))
+        lastnames = list(re.finditer(self.last_trie_regex, text, overlapped=True))
+        return [(match.group(), match.start(), match.end()) for match in firstnames + lastnames]
+
+    def _init_trie(self):
+        from src.utils.ano_regex import create_names_regex
+        from src.utils.trie import Trie
+
+        with open('./data/first_names_trie_regex.pkl', 'rb') as f:
+            self.first_trie_regex = dill.load(f)
+        with open('./data/last_names_trie_regex.pkl', 'rb') as f:
+            self.last_trie_regex = dill.load(f)
+
+
+    def find_set(self,text):
+        occurrences = []
+        for match in re.finditer(r'\b\w+\b', text):
+            word = match.group()
+            if word in self.name_set:
+                occurrences.append((word, match.start(), match.end()))
+        return occurrences
+
+    def run_parallel(self, method, text, num_workers=4):
+        from multiprocessing import Pool
+
+        # Split text into lines
+        lines = text.splitlines(keepends=True)
+        total_lines = len(lines)
+        chunk_size = total_lines // num_workers
+
+        # Create chunks ensuring each line is entirely in one block
+        chunks = []
+        for i in range(num_workers):
+            start_index = i * chunk_size
+            end_index = (i + 1) * chunk_size if i != num_workers - 1 else total_lines
+            chunk = ''.join(lines[start_index:end_index])
+            chunks.append(chunk)
+
+        with Pool(num_workers) as pool:
+            results = pool.map(method, chunks)
+        return [item for sublist in results for item in sublist]
+
+if __name__ == "__main__":
+    with open('data/_all_orig.txt', 'r') as file:
+        text = file.read()
+
+    with open('data/first_names.txt', 'r') as names_file:
+        names = {line.strip() for line in names_file}
+
+    with open('data/last_names.txt', 'r') as names_file:
+        lnames = {line.strip() for line in names_file}
+
+    names.update(lnames)
+    if '' in names:
+        names.remove('')
+
+    anon = Anon(names)
+
+    def write_matches(matches, file):
+        with open(f"tmp/{file}.txt", 'w') as file:
+            file.write('\n'.join(repr(match) for match in matches))
+
+
+    matches_trie = anon.find_trie(text)
+    print(len(matches_trie))
+    write_matches(matches_trie, 'matches_trie')
+
+
+    matches_set = anon.find_set(text)
+    print(len(matches_set))
+    write_matches(matches_set, 'matches_set')
+
+
+    matches_regex = anon.find_regex(text)
+    print(len(matches_regex))
+    write_matches(matches_regex, 'matches_regex')
+
+
+    matches_aho = anon.find_ahocorasick(text)
+    print(len(matches_aho))
+    write_matches(matches_aho, 'matches_aho')
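
The __main__ block above exercises the four single-process matchers but not run_parallel. As a rough sketch of how it might be driven (file names, the name list, and the worker count are illustrative, not from the commit); note that match offsets come back relative to each chunk rather than to the full text:

    # hypothetical usage sketch, not included in this commit
    from src.utils.anonymize import Anon

    with open('data/first_names.txt') as f:        # assumed name list
        names = {line.strip() for line in f} - {''}

    anon = Anon(names)

    with open('data/_all_orig.txt') as f:          # assumed input corpus
        text = f.read()

    # run_parallel splits the text on line boundaries into num_workers chunks
    # and maps the chosen matcher over them in a multiprocessing Pool
    matches = anon.run_parallel(anon.find_set, text, num_workers=4)
    print(len(matches))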

src/utils/anonymize_spacy.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+import spacy
+import spacy.cli
+from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+#from presidio_anonymizer import AnonymizerEngine
+from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpEngineProvider
+from presidio_analyzer.predefined_recognizers import SpacyRecognizer
+from presidio_analyzer import PatternRecognizer
+
+class Anon_Spacy:
+    def __init__(self):
+        languages = ['en','de','fr','it']
+        size = "lg"
+        gernres = {lang: "web" if lang == 'en' else "news" for lang in languages}
+        self.models = {lang: f"{lang}_core_{gernres[lang]}_{size}" for lang in languages}
+        self.models_loaded = []
+
+    def analyze_text(self, text, language='de',entities=['PERSON']):
+        if not language in self.models:
+            print(f"WARN: language '{language}' not supported. Supported languages are {self.models.keys()}.")
+        return self.get_analyzer(language,entities).analyze(text=text, language=language, entities=["PERSON"])
+
+    def get_analyzer(self,language='de',entities=['PERSON']):
+        self.ensure_model_loaded(self.models[language])
+        nlp_engine = SpacyNlpEngine(models=[{"lang_code": language, "model_name": self.models[language]}])
+        analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=[language])
+        analyzer.registry.add_recognizer(SpacyRecognizer(supported_language=language, supported_entities=entities))
+        return analyzer
+
+    def ensure_model_loaded(self,model_name):
+        if model_name in self.models_loaded:
+            print(f"Model '{model_name}' already loaded.")
+            return
+        print(f"Loading model '{model_name}'.")
+        try:
+            # Try to load the model
+            return spacy.load(model_name)
+        except OSError:
+            # If the model is not found, download it
+            print(f"Model '{model_name}' not found. Downloading...")
+            spacy.cli.download(model_name)
+            print(f"Model '{model_name}' downloaded successfully.")
+            return spacy.load(model_name)
+        finally:
+            self.models_loaded.append(model_name)
+            print(f"Model '{model_name}' loaded.")
+
+# Add custom recognizers if needed
+# Example: Adding a custom recognizer for French phone numbers
+# fr_phone_recognizer = PatternRecognizer(supported_entity="FR_PHONE_NUMBER",
+#                                         patterns=[{"name": "FR_PHONE",
+#                                                    "regex": r"(\+33|0)[1-9]\d{8}",
+#                                                    "score": 0.9}])
+# analyzer.registry.add_recognizer(fr_phone_recognizer)
+
+
+# Initialize the anonymizer engine
+#anonymizer = AnonymizerEngine()
+
+
+# def anonymize_text(text, language):
+#     return anonymizer.anonymize(text=text, analyzer_results=analyze_text(text,language))
