From bb36561728d5dad5294e5fa4001c55f2e04f8fad Mon Sep 17 00:00:00 2001 From: qubixes <44498096+qubixes@users.noreply.github.com> Date: Tue, 10 Oct 2023 13:15:46 +0200 Subject: [PATCH] Add cheatsheet (#9) * Add Cheatsheet Fix bug parsing escaped "\-" in character class. Fix bug failing to parse "(a|)". --- examples/Cheatsheet.ipynb | 474 ++++++++++++++++++++++++++++++++++++ examples/regex.html | 4 +- examples/tutorial.ipynb | 24 +- regexmodel/datastructure.py | 12 +- regexmodel/regexclass.py | 2 +- 5 files changed, 496 insertions(+), 20 deletions(-) create mode 100644 examples/Cheatsheet.ipynb diff --git a/examples/Cheatsheet.ipynb b/examples/Cheatsheet.ipynb new file mode 100644 index 0000000..cdb06cf --- /dev/null +++ b/examples/Cheatsheet.ipynb @@ -0,0 +1,474 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "aeb50a7b", + "metadata": {}, + "source": [ + "# All available regex constructions" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d8fc8322", + "metadata": {}, + "outputs": [], + "source": [ + "from regexmodel import RegexModel" + ] + }, + { + "cell_type": "markdown", + "id": "4c530177", + "metadata": {}, + "source": [ + "### Digits" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "482b5ff1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Single digits: ['8', '2', '4', '2']\n" + ] + } + ], + "source": [ + "print(\"Single digits:\", [RegexModel(\"[0-9]\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "804dcf0c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Single digits with \\d: ['6', '6', '2', '5']\n" + ] + } + ], + "source": [ + "print(\"Single digits with \\\\d:\", [RegexModel(\"\\d\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "052ca6c7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sub range for digits (0-3): ['1', '3', '1', '3']\n" + ] + } + ], + "source": [ + "print(\"Sub range for digits (0-3): \", [RegexModel(\"[0-3]\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "markdown", + "id": "0f5c6adf", + "metadata": {}, + "source": [ + "### Letters" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8a228ff1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "lower case ['c', 't', 'z', 'e']\n" + ] + } + ], + "source": [ + "print(\"lower case\", [RegexModel(\"[a-z]\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d31647b1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "upper case ['J', 'W', 'O', 'J']\n" + ] + } + ], + "source": [ + "print(\"upper case\", [RegexModel(\"[A-Z]\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7f429ccd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Lower and upper case: ['T', 'd', 'Q', 'K']\n" + ] + } + ], + "source": [ + "print(\"Lower and upper case:\", [RegexModel(\"[a-zA-Z]\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1c00687a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sub ranges (d-f): ['e', 'f', 'd', 'f']\n" + ] + } + ], + "source": [ + "print(\"Sub ranges (d-f):\", [RegexModel(\"[d-f]\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "markdown", + "id": "a5f0124b", + "metadata": {}, + "source": [ + "### Single characters" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "80b3dfe1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Always a: ['a', 'a', 'a', 'a']\n" + ] + } + ], + "source": [ + "print(\"Always a: \", [RegexModel(\"a\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7816c11f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Also always a: ['a', 'a', 'a', 'a']\n" + ] + } + ], + "source": [ + "print(\"Also always a:\", [RegexModel(\"[a]\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d7b9e7c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a or b or c: ['a', 'a', 'b', 'a']\n" + ] + } + ], + "source": [ + "print(\"a or b or c:\", [RegexModel(\"[abc]\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "markdown", + "id": "ea43f2bf", + "metadata": {}, + "source": [ + "### Multiple choice" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3ce07810", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Within a charcter class (digit or a-e): ['8', '0', '4', '4']\n" + ] + } + ], + "source": [ + "print(\"Within a charcter class (digit or a-e):\", [RegexModel(\"[0-9a-e]\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9d559dad", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Outside a character class (aaa or bbb): ['aaa', 'aaa', 'aaa', 'bbb']\n" + ] + } + ], + "source": [ + "print(\"Outside a character class (aaa or bbb):\", [RegexModel(\"(aaa|bbb)\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d0a6acec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "More than two choices (a or b or c): ['a', 'c', 'a', 'a']\n" + ] + } + ], + "source": [ + "print(\"More than two choices (a or b or c):\", [RegexModel(\"(a|b|c)\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "eee80db7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Repeats change the probabilities: ['a', 'a', 'a', 'b']\n" + ] + } + ], + "source": [ + "print(\"Repeats change the probabilities:\", [RegexModel(\"(a|a|a|a|b)\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "markdown", + "id": "e697c821", + "metadata": {}, + "source": [ + "### Repeating character (classes)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "a0cabf11", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Repeat digit x3: ['451', '968', '921', '684']\n" + ] + } + ], + "source": [ + "print(\"Repeat digit x3:\", [RegexModel(\"\\d{3}\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f1061116", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Repeat digit between 2x and 6x: ['7715', '1900', '409441', '28522']\n" + ] + } + ], + "source": [ + "print(\"Repeat digit between 2x and 6x:\", [RegexModel(\"\\d{2,6}\").draw() for _ in range(4)])" + ] + }, + { + "cell_type": "markdown", + "id": "be9ee5f1", + "metadata": {}, + "source": [ + "## API" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "034703b8", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'failed': 6,\n", + " 'success': 0,\n", + " 'n_tot_char': 6,\n", + " 'n_char_success': 0,\n", + " 'n_parameters': 10,\n", + " 'avg_log_like_per_char': -6.907755278982137,\n", + " 'avg_log_like_pc_success': 0.0}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = RegexModel.fit(10*[\"a\"]+10*[\"b\"]) # Create a model from data\n", + "model.regex # Get regex\n", + "model.draw() # Draw a value\n", + "model = RegexModel(\"\\d(a||ab)[A-Z]{3,6}\") # Create a model from a regex\n", + "model_data = model.serialize() # Serialize the model so that it can be stored in a JSON (or any other) file\n", + "model = RegexModel.deserialize(model_data) # Create the model from the serialization\n", + "model.fit_statistics(3*[\"a\"]+3*[\"b\"]) # Get statistical information on goodness of fit and more" + ] + }, + { + "cell_type": "markdown", + "id": "cffa0ac5", + "metadata": {}, + "source": [ + "## Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "bbc4847e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n", + "regex.html\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from regexmodel.visualization import regex_model_to_pyvis\n", + "\n", + "net = regex_model_to_pyvis(model)\n", + "net.show(\"regex.html\", notebook=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "216500f5", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/regex.html b/examples/regex.html index 6935827..f0e6331 100644 --- a/examples/regex.html +++ b/examples/regex.html @@ -88,8 +88,8 @@

// parsing and collecting nodes and edges from the python - nodes = new vis.DataSet([{"group": 2, "id": 0, "label": "start", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 1, "label": "[a-z]{3,19}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 2, "label": 2, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 3, "label": 3, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 18, "label": "[0-9]{2,2}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 4, "label": "[0-9]{2,2}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 5, "label": "[@]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 6, "label": "[a-z]{3,9}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 7, "label": 7, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 8, "label": "[\\-]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 10, "label": "[\\.]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 9, "label": "[a-z]{4,9}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 11, "label": 11, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 12, "label": "[c]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 15, "label": "[b-o]{1,4}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 13, "label": "[o]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 14, "label": "[m]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 16, "label": 16, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 17, "label": "[zta-z]{1,2}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 19, "label": "[@]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 20, "label": "[a-z]{10,10}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 21, "label": "[\\.]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 22, "label": "[a-z]{3,3}", "shape": "dot", "size": 10}]); - edges = new vis.DataSet([{"arrows": "to", "from": 0, "to": 1, "width": 1}, {"arrows": "to", "from": 1, "to": 2, "width": 1}, {"arrows": "to", "from": 2, "to": 3, "width": 1}, {"arrows": "to", "from": 2, "to": 18, "width": 1}, {"arrows": "to", "from": 3, "to": 4, "width": 1}, {"arrows": "to", "from": 3, "to": 5, "width": 1}, {"arrows": "to", "from": 4, "to": 5, "width": 1}, {"arrows": "to", "from": 5, "to": 6, "width": 1}, {"arrows": "to", "from": 6, "to": 7, "width": 1}, {"arrows": "to", "from": 7, "to": 8, "width": 1}, {"arrows": "to", "from": 7, "to": 10, "width": 1}, {"arrows": "to", "from": 8, "to": 9, "width": 1}, {"arrows": "to", "from": 9, "to": 10, "width": 1}, {"arrows": "to", "from": 10, "to": 11, "width": 1}, {"arrows": "to", "from": 11, "to": 12, "width": 1}, {"arrows": "to", "from": 11, "to": 15, "width": 1}, {"arrows": "to", "from": 12, "to": 13, "width": 1}, {"arrows": "to", "from": 13, "to": 14, "width": 1}, {"arrows": "to", "from": 15, "to": 16, "width": 1}, {"arrows": "to", "from": 16, "to": 17, "width": 1}, {"arrows": "to", "from": 18, "to": 19, "width": 1}, {"arrows": "to", "from": 19, "to": 20, "width": 1}, {"arrows": "to", "from": 20, "to": 21, "width": 1}, {"arrows": "to", "from": 21, "to": 22, "width": 1}]); + nodes = new vis.DataSet([{"group": 2, "id": 0, "label": "start", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 1, "label": "[0-9]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 2, "label": 2, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 3, "label": "a", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 4, "label": "a", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 6, "label": "[A-Z]{3,6}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 5, "label": "b", "shape": "dot", "size": 10}]); + edges = new vis.DataSet([{"arrows": "to", "from": 0, "to": 1, "width": 1}, {"arrows": "to", "from": 1, "to": 2, "width": 1}, {"arrows": "to", "from": 2, "to": 3, "width": 1}, {"arrows": "to", "from": 2, "to": 4, "width": 1}, {"arrows": "to", "from": 2, "to": 6, "width": 1}, {"arrows": "to", "from": 3, "to": 6, "width": 1}, {"arrows": "to", "from": 4, "to": 5, "width": 1}, {"arrows": "to", "from": 5, "to": 6, "width": 1}]); nodeColors = {}; allNodes = nodes.get({ returnType: "Object" }); diff --git a/examples/tutorial.ipynb b/examples/tutorial.ipynb index c03383a..31196fe 100644 --- a/examples/tutorial.ipynb +++ b/examples/tutorial.ipynb @@ -116,16 +116,16 @@ { "data": { "text/plain": [ - "['xpfvzarhpg89@qttvo.com',\n", - " 'opeju@jdabppwq.com',\n", - " 'wsxlwwmgydyzyf75@maeumctg.com',\n", - " 'vziulronmr95@fxajlcyug-skhkgitz.com',\n", - " 'fbexmrrvrnqvrjp93@pewij.com',\n", - " 'itgutzjljypmae04@oiunryvrf.com',\n", - " 'okzr@anm-arsgej.hnmj',\n", - " 'rkx70@olrsqdox.com',\n", - " 'yhuwulpnduybivhsxh@xpo.com',\n", - " 'waf@eqg-slqjkoij.com']" + "['brvc61@ocojvbiar.com',\n", + " 'hszltapze22@rewzb.com',\n", + " 'hwotequvt@duy.com',\n", + " 'dkimkedyjmzanzto60@ecm.com',\n", + " 'bbohcirqvmivp16@ass.com',\n", + " 'jdidhaidejznuxwbsep81@nwpqhuniah.dng',\n", + " 'huweuxbhsev83@kfbhetug-oybvj.com',\n", + " 'kcqpgtysdw74@bvndjdf.com',\n", + " 'txxbe72@ffzdykfrf.com',\n", + " 'xakqql@ncsdfyflv.com']" ] }, "execution_count": 4, @@ -158,7 +158,7 @@ { "data": { "text/plain": [ - "'[a-z]{3,19}((|[0-9]{2,2})[@][a-z]{3,9}(|[\\\\-][a-z]{4,9})[\\\\.]([c][o][m]|[b-o]{1,4}(|[zta-z]{1,2}))|[0-9]{2,2}[@][a-z]{10,10}[\\\\.][a-z]{3,3})'" + "'[a-z]{3,19}((|[0-9]{2})@[a-z]{3,9}(|\\\\-[a-z]{4,9})\\\\.(com|[b-o]{1,4}(|[zta-z]{1,2}))|[0-9]{2}@[a-z]{10}\\\\.[a-z]{3})'" ] }, "execution_count": 5, @@ -268,7 +268,7 @@ " " ], "text/plain": [ - "" + "" ] }, "execution_count": 7, diff --git a/regexmodel/datastructure.py b/regexmodel/datastructure.py index b8cc14d..86cfaad 100644 --- a/regexmodel/datastructure.py +++ b/regexmodel/datastructure.py @@ -259,18 +259,20 @@ def from_string(cls, # pylint: disable=too-many-return-statements # Start of an OrNode construction. if regex_str[0] == "(": all_edges = [] - cur_regex_str = regex_str[1:] - while cur_regex_str[0] != ")": - new_edge, cur_regex_str = cls.from_string(cur_regex_str) - all_edges.append(new_edge) + cur_regex_str = regex_str + while True: + next_edge, cur_regex_str = cls.from_string(cur_regex_str[1:]) + all_edges.append(next_edge) if len(cur_regex_str) == 0: raise ValueError("Unterminated ')' in regex.") + if cur_regex_str[0] == ")": + break next_edge, next_str = cls.from_string(cur_regex_str[1:]) return cls(OrNode(all_edges, next_edge), 1), next_str # Continue with another branch of the OrRegex construction if regex_str[0] == "|": - return cls(None, 1), regex_str[1:] + return cls(None, 1), regex_str # End of the OrNode construction if regex_str[0] == ")": diff --git a/regexmodel/regexclass.py b/regexmodel/regexclass.py index 3f8b0b9..6981a20 100644 --- a/regexmodel/regexclass.py +++ b/regexmodel/regexclass.py @@ -418,7 +418,7 @@ def draw_once(self): @classmethod def from_string(cls, regex_str) -> Optional[tuple[BaseRegex, str]]: _special_chars = [".", "+", "*", "?", "^", "$", "(", ")", "[", "]", - "{", "}", "|", "\\"] + "{", "}", "|", "\\", "-"] if len(regex_str) > 1 and regex_str[0] == "\\" and regex_str[1] in _special_chars: return cls([_unescape(regex_str[1])]), regex_str[2:] if len(regex_str) >= 1 and regex_str[0] != "\\" and regex_str[0] not in _special_chars: