diff --git a/Dockerfile b/Dockerfile index 72cb7aa..879b172 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,7 +33,10 @@ WORKDIR /app # set script permissions RUN chmod +x entrypoint.sh setup.sh +# create user RUN adduser -Ds /bin/bash anon && chown -R anon: /app + +# switch to user USER anon # run app setup script diff --git a/data/first_names_regex.pkl b/data/first_names_regex.pkl new file mode 100644 index 0000000..b78f861 Binary files /dev/null and b/data/first_names_regex.pkl differ diff --git a/data/first_names_trie_regex.pkl b/data/first_names_trie_regex.pkl new file mode 100644 index 0000000..10fa76e Binary files /dev/null and b/data/first_names_trie_regex.pkl differ diff --git a/data/last_names_regex.pkl b/data/last_names_regex.pkl new file mode 100644 index 0000000..5ada401 Binary files /dev/null and b/data/last_names_regex.pkl differ diff --git a/data/last_names_trie_regex.pkl b/data/last_names_trie_regex.pkl new file mode 100644 index 0000000..39ca83a Binary files /dev/null and b/data/last_names_trie_regex.pkl differ diff --git a/pyproject.toml b/pyproject.toml index d633bb9..219323f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,12 +25,17 @@ uvicorn = "^0" gunicorn = "^0" python-dateutil = "^2.9.0.post0" tqdm = "^4.66.6" +dill = "^0.3.9" -[tool.poetry.dev-dependencies] +[tool.poetry.group.dev.dependencies] pytest = "^6.0" ruff = "^0.7" ipykernel = "^6.29.5" jupyter = "^1.1.1" +pytest-benchmark = "^5.0.1" +pyahocorasick = "^2.1.0" +polars = "^1.12.0" +flashtext2 = "^1.1.0" [tool.pytest.ini_options] minversion = "6.0" diff --git a/src/static/public/github-mark.svg b/src/static/public/github-mark.svg new file mode 100644 index 0000000..37fa923 --- /dev/null +++ b/src/static/public/github-mark.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/yo-yo-maskr-logo.svg b/src/static/public/yo-yo-maskr-logo.svg similarity index 100% rename from yo-yo-maskr-logo.svg rename to src/static/public/yo-yo-maskr-logo.svg diff --git a/src/static/scripts/downloader.js b/src/static/scripts/downloader.js new file mode 100644 index 0000000..3c84ccb --- /dev/null +++ b/src/static/scripts/downloader.js @@ -0,0 +1,18 @@ +document.getElementById('downloadBtn').addEventListener('click', function() { + // Get the content of the textarea + const foundEntities = document.getElementById('responseFieldEntities').value; + + // Create a Blob from the textarea content + const blob = new Blob([foundEntities], { type: 'application/json' }); + + // Create a link element + const link = document.createElement('a'); + link.href = URL.createObjectURL(blob); + link.download = 'yoyo-entities.json'; // Specify the file name + + // Programmatically click the link to trigger the download + link.click(); + + // Clean up and revoke the object URL + URL.revokeObjectURL(link.href); +}); diff --git a/src/static/scripts/script.js b/src/static/scripts/form.js similarity index 73% rename from src/static/scripts/script.js rename to src/static/scripts/form.js index b42b82e..be17906 100644 --- a/src/static/scripts/script.js +++ b/src/static/scripts/form.js @@ -22,11 +22,12 @@ document.getElementById('inputForm').addEventListener('submit', function(event) }) .then(text => { // Display the response in the textarea - document.getElementById('responseField').value = JSON.stringify(text, null, 2); // Format the JSON response + document.getElementById('responseFieldText').value = JSON.stringify(text.original_text, null, 2); // Format the JSON response + document.getElementById('responseFieldEntities').value = JSON.stringify(text.llm_entities, null, 2); // Format the JSON response }) .catch((error) => { console.error('Error:', error); // Handle any errors // Optionally display the error in the textarea - document.getElementById('responseField').value = 'Error: ' + error.message; + document.getElementById('responseFieldText').value = 'Error: ' + error.message; }); }); \ No newline at end of file diff --git a/src/static/styles/colors.css b/src/static/styles/colors.css new file mode 100644 index 0000000..064fb15 --- /dev/null +++ b/src/static/styles/colors.css @@ -0,0 +1,3 @@ +:root { + --background-body: rgb(240, 238, 238); +} \ No newline at end of file diff --git a/src/static/styles/style.css b/src/static/styles/style.css new file mode 100644 index 0000000..71c1aa9 --- /dev/null +++ b/src/static/styles/style.css @@ -0,0 +1,38 @@ +/* don't add color codes here. Use colors.css and add var(--var-name) here */ +body { + display: flex; + flex-direction: column; + min-height: 100vh; + margin: 0; +} + +main { + flex: 1; + background-color: var(--background-body) +} + +header { + min-height: 50px; + background: var(--background-body); + text-align: center; +} + +footer { + min-height: 50px; + background: var(--background-body); + text-align: center; + flex-direction: row; +} + +.logo { + width: 300px; + height: 300px; + text-align: center; +} + +.center-block { + display: block; + margin-left: auto; + margin-right: auto; + text-align: center; +} \ No newline at end of file diff --git a/src/templates/html/form.html b/src/templates/html/form.html index 57075be..0806476 100644 --- a/src/templates/html/form.html +++ b/src/templates/html/form.html @@ -3,20 +3,49 @@ - YoYo MaskR + YoYo MaskЯ + + -

Input:

-
- - -
+
+
+

YoYo MaskЯ

+ +
+
+ +
+
+

Input:

+
+ + +
+
-

Response:

- +
+

Initial Text:

+ +
+
+

Entities found:

+

+ +
+
+ + - + + diff --git a/yoyomaskr.ipynb b/yoyomaskr.ipynb index eb73d44..6a316f8 100644 --- a/yoyomaskr.ipynb +++ b/yoyomaskr.ipynb @@ -10,77 +10,94 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "909d787a51ea4b14b27a17a354aeb534", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/2194 [00:00 0]\n", + " first_names_regex = create_names_regex(first_names)\n", + " with open('./data/first_names_regex.pkl', 'wb') as f:\n", + " dill.dump(first_names_regex, f)\n", + " first_trie = Trie()\n", + " for name in sorted([f for f in first_names if len(f) > 0], key=len, reverse=True):\n", + " first_trie.add(name)\n", + " first_trie_regex = re.compile(r'\\b' + first_trie.pattern() + r'\\b')\n", + " with open('./data/first_names_trie_regex.pkl', 'wb') as f:\n", + " dill.dump(first_trie_regex, f)\n", "\n", - "with open('./data/last_names.txt') as f:\n", - " last_names = f.read().split('\\n')\n", + " with open(f'./data/last_names{\"_full\" if USE_FULL else \"\"}.txt') as f:\n", + " last_names = [l.strip() for l in f.read().split('\\n') if len(l.strip()) > 0]\n", + " last_names_regex = create_names_regex(last_names)\n", + " with open('./data/last_names_regex.pkl', 'wb') as f:\n", + " dill.dump(last_names_regex, f)\n", + " last_trie = Trie()\n", + " for name in sorted([l for l in last_names if len(l) > 0], key=len, reverse=True):\n", + " last_trie.add(name)\n", + " last_trie_regex = re.compile(r'\\b' + last_trie.pattern() + r'\\b')\n", + " with open('./data/last_names_trie_regex.pkl', 'wb') as f:\n", + " dill.dump(last_trie_regex, f)\n", "\n", "with open('./sample_texts.txt') as f:\n", - " sample_texts = f.readlines()\n", + " sample_texts = f.read().split('\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import regex as re\n", + "from tqdm.auto import tqdm\n", + "from src.utils.regex import anonymize_entities\n", + "\n", + "last_trie_regex = re.compile(r'\\b' + last_trie.pattern() + r'\\b')\n", + "first_trie_regex = re.compile(r'\\b' + first_trie.pattern() + r'\\b')\n", "\n", "result = [anonymize_entities(text, by_names='NAME', first_names=first_names, last_names=last_names) for text in tqdm(sample_texts[:])]\n", "for i in range(len(result[:30])):\n", " print(result[i])" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from flashtext2 import KeywordProcessor\n", + "from tqdm.auto import tqdm\n", + "\n", + "ln = KeywordProcessor(case_sensitive=True)\n", + "for name in last_names:\n", + " ln.add_keyword(name)\n", + "\n", + "fn = KeywordProcessor(case_sensitive=True)\n", + "for name in first_names:\n", + " fn.add_keyword(name)\n", + "\n", + "result = [{'text': text, 'first_names': fn.extract_keywords_with_span(text),\n", + " 'last_names': ln.extract_keywords_with_span(text)} for text in tqdm(sample_texts)]" + ] } ], "metadata": {