Skip to content

Commit

Permalink
Merge pull request #17 from baloise/main
Browse files Browse the repository at this point in the history
PR for new build
  • Loading branch information
robbizbal authored Oct 30, 2024
2 parents b1ff3b8 + a05c40e commit a423286
Show file tree
Hide file tree
Showing 14 changed files with 187 additions and 72 deletions.
3 changes: 3 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ WORKDIR /app
# set script permissions
RUN chmod +x entrypoint.sh setup.sh

# create user
RUN adduser -Ds /bin/bash anon && chown -R anon: /app

# switch to user
USER anon

# run app setup script
Expand Down
Binary file added data/first_names_regex.pkl
Binary file not shown.
Binary file added data/first_names_trie_regex.pkl
Binary file not shown.
Binary file added data/last_names_regex.pkl
Binary file not shown.
Binary file added data/last_names_trie_regex.pkl
Binary file not shown.
7 changes: 6 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,17 @@ uvicorn = "^0"
gunicorn = "^0"
python-dateutil = "^2.9.0.post0"
tqdm = "^4.66.6"
dill = "^0.3.9"

[tool.poetry.dev-dependencies]
[tool.poetry.group.dev.dependencies]
pytest = "^6.0"
ruff = "^0.7"
ipykernel = "^6.29.5"
jupyter = "^1.1.1"
pytest-benchmark = "^5.0.1"
pyahocorasick = "^2.1.0"
polars = "^1.12.0"
flashtext2 = "^1.1.0"

[tool.pytest.ini_options]
minversion = "6.0"
Expand Down
1 change: 1 addition & 0 deletions src/static/public/github-mark.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
File renamed without changes
18 changes: 18 additions & 0 deletions src/static/scripts/downloader.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Download the contents of the #responseFieldEntities textarea as a JSON file
// whenever the #downloadBtn button is clicked.
document.getElementById('downloadBtn').addEventListener('click', function() {
    // Get the content of the textarea
    const foundEntities = document.getElementById('responseFieldEntities').value;

    // Create a Blob from the textarea content and a temporary URL pointing at it
    const blob = new Blob([foundEntities], { type: 'application/json' });
    const objectUrl = URL.createObjectURL(blob);

    // Create a hidden link element that downloads the blob
    const link = document.createElement('a');
    link.href = objectUrl;
    link.download = 'yoyo-entities.json'; // Specify the file name
    link.style.display = 'none';

    // Attach the link before clicking it: a programmatic click on a detached
    // anchor is not honoured consistently across browsers.
    document.body.appendChild(link);
    link.click();
    link.remove();

    // Revoke the object URL on a later task instead of synchronously, so the
    // browser has had a chance to start the download before the URL goes away.
    setTimeout(function() {
        URL.revokeObjectURL(objectUrl);
    }, 0);
});
5 changes: 3 additions & 2 deletions src/static/scripts/script.js → src/static/scripts/form.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@ document.getElementById('inputForm').addEventListener('submit', function(event)
})
.then(text => {
// Display the response in the textarea
document.getElementById('responseField').value = JSON.stringify(text, null, 2); // Format the JSON response
document.getElementById('responseFieldText').value = JSON.stringify(text.original_text, null, 2); // Format the JSON response
document.getElementById('responseFieldEntities').value = JSON.stringify(text.llm_entities, null, 2); // Format the JSON response
})
.catch((error) => {
console.error('Error:', error); // Handle any errors
// Optionally display the error in the textarea
document.getElementById('responseField').value = 'Error: ' + error.message;
document.getElementById('responseFieldText').value = 'Error: ' + error.message;
});
});
3 changes: 3 additions & 0 deletions src/static/styles/colors.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/* Central color palette. style.css must not contain color codes; it reads
   these custom properties through var(--name) instead. */
:root {
/* Background used by the main, header and footer rules in style.css. */
--background-body: rgb(240, 238, 238);
}
38 changes: 38 additions & 0 deletions src/static/styles/style.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/* don't add color codes here. Use colors.css and add var(--var-name) here */

/* Page skeleton: a column flex container at least one viewport high, so that
   main (flex: 1) absorbs whatever height header and footer leave over. */
body {
    display: flex;
    flex-direction: column;
    min-height: 100vh;
    margin: 0;
}

/* Content area; grows to fill the remaining height of the body column. */
main {
    flex: 1;
    background-color: var(--background-body);
}

header {
    min-height: 50px;
    background: var(--background-body);
    text-align: center;
}

footer {
    min-height: 50px;
    background: var(--background-body);
    text-align: center;
    /* NOTE(review): flex-direction only applies to flex containers, and footer
       does not set display: flex in this file, so this declaration currently
       has no effect. Kept as-is; confirm whether display: flex was intended. */
    flex-direction: row;
}

/* Fixed-size box for the application logo. */
.logo {
    width: 300px;
    height: 300px;
    text-align: center;
}

/* Horizontally centers a block and the inline content inside it. */
.center-block {
    display: block;
    margin-left: auto;
    margin-right: auto;
    text-align: center;
}
47 changes: 38 additions & 9 deletions src/templates/html/form.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,49 @@
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>YoYo MaskR</title>
<title>YoYo MaskЯ</title>
<link rel="icon" type="image/svg+xml" href="/static/public/favicon.svg">
<link rel="stylesheet" href="/static/styles/colors.css">
<link rel="stylesheet" href="/static/styles/style.css">
</head>
<body>
<h3>Input:</h3>
<form id="inputForm">
<input type="text" id="inputData" placeholder="Enter text to anonymize" required>
<input type="submit" value="Submit">
</form>
<header>
<div>
<h1>YoYo MaskЯ</h1>
<img class="logo" id="AppLogo" src="/static/public/yo-yo-maskr-logo.svg" alt="YoYo MaskR Logo" width="200" height="200">
</div>
</header>

<main>
<div class="center-block">
<h3>Input:</h3>
<form id="inputForm">
<input type="text" id="inputData" placeholder="Enter text to anonymize" required>
<input type="submit" value="Submit">
</form>
</div>

<h3>Response:</h3>
<textarea id="responseField" rows="10" cols="50" readonly></textarea>
<div class="center-block">
<h3>Initial Text:</h3>
<textarea id="responseFieldText" rows="10" cols="50" readonly></textarea>
</div>

<div class="center-block">
<h3>Entities found:</h3>
<textarea id="responseFieldEntities" rows="10" cols="50" readonly></textarea><br><br>
<button id="downloadBtn">Download</button>
</div>
</main>

<footer>
<div class="footer">
<a href="https://github.com/baloise/yo-yo-maskr">
<img src="/static/public/github-mark.svg" alt="GitHub" width="30" height="30">
</a>
</div>
</footer>
<!-- Link to the external JavaScript file -->
<script src="/static/scripts/script.js"></script>
<script src="/static/scripts/form.js"></script>
<script src="/static/scripts/downloader.js"></script>
</body>
</html>
137 changes: 77 additions & 60 deletions yoyomaskr.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,77 +10,94 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "909d787a51ea4b14b27a17a354aeb534",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/2194 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'text': 'ggsdgg\\n', 'replace_dict': {}}\n",
"{'text': 'rien à ajouter\\n', 'replace_dict': {}}\n",
"{'text': 'Alles bestens abgelaufen. Sehr freundlicher Kundenkontakt.\\n', 'replace_dict': {}}\n",
"{'text': 'Chez la bâloise on Présume coupable et fraudeur\\n', 'replace_dict': {}}\n",
"{'text': 'Schnelle unkomplizierte Erledigung\\n', 'replace_dict': {}}\n",
"{'text': 'Ich wurde in Mallorca bestohlen. Habe alles reibungslos mit dem Schadenexperten am #DATE_1# besprochen. Er war wirklich sehr kompetent und freundlich. Auch die Auszahlung war nach meiner eingereichter Aufstellung super schnell und grosszügig. Besten Dank.\\n', 'replace_dict': {'14.Mai': '#DATE_1#'}}\n",
"{'text': 'Unkomplizierte und zeitgemässe Abhandlung des Schadenfalls.\\n', 'replace_dict': {}}\n",
"{'text': 'très satisfaite de traitement de sinistre, rapide et efficace\\n', 'replace_dict': {}}\n",
"{'text': 'Kompetent, unkompliziert und immer freundlich! #NAME_2# #NAME_1# ist immer sehr hilfsbereit und man merkt, dass ihm seine Kunden sehr wichtig sind!!\\n', 'replace_dict': {'Dominik Mächler': '#NAME_1#', 'Herr': '#NAME_2#'}}\n",
"{'text': 'schnell und einfach\\n', 'replace_dict': {}}\n",
"{'text': 'toller rascher Service...\\n', 'replace_dict': {}}\n",
"{'text': '#NAME_1# société polaire et fiable\\n', 'replace_dict': {'Grande': '#NAME_1#'}}\n",
"{'text': '#NAME_1# ich mit der Baloise einfach gut finden .-)\\n', 'replace_dict': {'Weil': '#NAME_1#'}}\n",
"{'text': 'Schadenmeldung unkompliziert übers Web abgewickelt. Schelle Kontaktaufnahme seitens Versicherung. etc...\\n', 'replace_dict': {}}\n",
"{'text': 'Sehr freundliche Kundenbetreuung in der Niederlassung Chur.\\n', 'replace_dict': {}}\n",
"{'text': 'Wir haben sehr gute Geschäftsbeziehung mit Herrn #NAME_1# und Herrn #NAME_2#.\\n', 'replace_dict': {'Martin Lusser': '#NAME_1#', 'Marco Gisin': '#NAME_2#'}}\n",
"{'text': 'interraction simple et efficace. Prise en main du cas et gestion de ce dernier de bout en bout.\\n', 'replace_dict': {}}\n",
"{'text': 'super Service, unerwartete Vergütung\\n', 'replace_dict': {}}\n",
"{'text': 'No good communication about the status with me (the customer)\\n', 'replace_dict': {}}\n",
"{'text': 'Sehr kompetente und freundliche #NAME_1# der Mitarbeiter. Abwicklung hat bestens funktionniert.\\n', 'replace_dict': {'Art': '#NAME_1#'}}\n",
"{'text': 'Da die Brille profisorisch geflickt werden muste und sie, dann noch Bilder von der defekten Brille wollten. das danach fast unmöglich war. Die Brille wurde ja wieder Gebraucht, ohne Brille ist nicht sehr gut.\\n', 'replace_dict': {}}\n",
"{'text': 'erste Abwicklung gut gelaufen, jedoch relativ hohe Kosten\\n', 'replace_dict': {}}\n",
"{'text': 'einfache, zielgerichtete Abwicklung des Schadenfalls. merci\\n', 'replace_dict': {}}\n",
"{'text': 'Schadenabwicklung funktionierte tadellos mit Ersatzwagen\\n', 'replace_dict': {}}\n",
"{'text': 'Simple et rapide\\n', 'replace_dict': {}}\n",
"{'text': 'Sofortiges reagieren auf Schadenmeldung und anschliessende Erledigung\\n', 'replace_dict': {}}\n",
"{'text': 'Sehr freundlicher und effektiver Kontakt am Telefon. Reibungslose Schadensabwicklung. Wir verlassen die Baloise nur wegen der starken Kostenerhöhung. Der Service war wirklich super.\\n', 'replace_dict': {}}\n",
"{'text': 'Einfach und unkompliziert. Rasche und kompetente Abwicklung\\n', 'replace_dict': {}}\n",
"{'text': 'Kompetente und schnelle Abwicklung der Schadenfälle\\n', 'replace_dict': {}}\n",
"{'text': 'Rapport qualité prix\\n', 'replace_dict': {}}\n"
]
}
],
"outputs": [],
"source": [
"from tqdm.auto import tqdm\n",
"from src.utils.regex import anonymize_dates, anonymize_entities, create_names_regex\n",
"import dill\n",
"import regex as re\n",
"from src.utils.regex import create_names_regex\n",
"from src.utils.trie import Trie\n",
"\n",
"NAMES_FROM_PICKLE = False\n",
"USE_FULL = False\n",
"\n",
"with open('./data/first_names.txt') as f:\n",
" first_names = f.read().split('\\n')\n",
"if NAMES_FROM_PICKLE:\n",
"    # Reload the compiled patterns that the else-branch dumps with dill.\n",
"    # dill/pickle files can execute arbitrary code when loaded: only load\n",
"    # files that were generated locally by this notebook.\n",
"    # NOTE(review): in this branch first_names/last_names hold the unpickled\n",
"    # pattern objects rather than lists of names, and first_trie/last_trie are\n",
"    # never defined - confirm that the later cells can cope with that.\n",
"    with open('./data/first_names_regex.pkl', 'rb') as f:\n",
"        first_names = dill.load(f)\n",
"    # Fixed: this used to read last_names_trie_regex.pkl, i.e. the trie pattern\n",
"    # that is loaded again below, instead of the plain last-names pattern.\n",
"    with open('./data/last_names_regex.pkl', 'rb') as f:\n",
"        last_names = dill.load(f)\n",
"    with open('./data/first_names_trie_regex.pkl', 'rb') as f:\n",
"        first_names_trie = dill.load(f)\n",
"    with open('./data/last_names_trie_regex.pkl', 'rb') as f:\n",
"        last_names_trie = dill.load(f)\n",
"else:\n",
" with open(f'./data/first_names{\"_full\" if USE_FULL else \"\"}.txt') as f:\n",
" first_names = [l.strip() for l in f.read().split('\\n') if len(l.strip()) > 0]\n",
" first_names_regex = create_names_regex(first_names)\n",
" with open('./data/first_names_regex.pkl', 'wb') as f:\n",
" dill.dump(first_names_regex, f)\n",
" first_trie = Trie()\n",
" for name in sorted([f for f in first_names if len(f) > 0], key=len, reverse=True):\n",
" first_trie.add(name)\n",
" first_trie_regex = re.compile(r'\\b' + first_trie.pattern() + r'\\b')\n",
" with open('./data/first_names_trie_regex.pkl', 'wb') as f:\n",
" dill.dump(first_trie_regex, f)\n",
"\n",
"with open('./data/last_names.txt') as f:\n",
" last_names = f.read().split('\\n')\n",
" with open(f'./data/last_names{\"_full\" if USE_FULL else \"\"}.txt') as f:\n",
" last_names = [l.strip() for l in f.read().split('\\n') if len(l.strip()) > 0]\n",
" last_names_regex = create_names_regex(last_names)\n",
" with open('./data/last_names_regex.pkl', 'wb') as f:\n",
" dill.dump(last_names_regex, f)\n",
" last_trie = Trie()\n",
" for name in sorted([l for l in last_names if len(l) > 0], key=len, reverse=True):\n",
" last_trie.add(name)\n",
" last_trie_regex = re.compile(r'\\b' + last_trie.pattern() + r'\\b')\n",
" with open('./data/last_names_trie_regex.pkl', 'wb') as f:\n",
" dill.dump(last_trie_regex, f)\n",
"\n",
"with open('./sample_texts.txt') as f:\n",
" sample_texts = f.readlines()\n",
" sample_texts = f.read().split('\\n')"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "import regex as re\n",
  "from tqdm.auto import tqdm\n",
  "from src.utils.regex import anonymize_entities\n",
  "\n",
  "# The trie regexes are compiled by the first cell. They are deliberately not\n",
  "# recompiled here: this cell never uses them, and first_trie/last_trie do not\n",
  "# exist when NAMES_FROM_PICKLE is True, so recompiling broke Run All.\n",
  "\n",
  "# Run anonymize_entities over every sample text (tqdm shows progress).\n",
  "result = [\n",
  "    anonymize_entities(text, by_names='NAME', first_names=first_names, last_names=last_names)\n",
  "    for text in tqdm(sample_texts)\n",
  "]\n",
  "\n",
  "# Print only the first 30 results to keep the cell output short.\n",
  "for anonymized in result[:30]:\n",
  "    print(anonymized)"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "from flashtext2 import KeywordProcessor\n",
  "from tqdm.auto import tqdm\n",
  "\n",
  "# One case-sensitive keyword processor per name list:\n",
  "# `ln` is fed the last names, `fn` the first names.\n",
  "ln, fn = KeywordProcessor(case_sensitive=True), KeywordProcessor(case_sensitive=True)\n",
  "for surname in last_names:\n",
  "    ln.add_keyword(surname)\n",
  "for forename in first_names:\n",
  "    fn.add_keyword(forename)\n",
  "\n",
  "# For every sample text, keep the text itself together with what each\n",
  "# processor's extract_keywords_with_span returns for it.\n",
  "result = [\n",
  "    {\n",
  "        'text': text,\n",
  "        'first_names': fn.extract_keywords_with_span(text),\n",
  "        'last_names': ln.extract_keywords_with_span(text),\n",
  "    }\n",
  "    for text in tqdm(sample_texts)\n",
  "]"
 ]
}
],
"metadata": {
Expand Down

0 comments on commit a423286

Please sign in to comment.