From bb36561728d5dad5294e5fa4001c55f2e04f8fad Mon Sep 17 00:00:00 2001
From: qubixes <44498096+qubixes@users.noreply.github.com>
Date: Tue, 10 Oct 2023 13:15:46 +0200
Subject: [PATCH] Add cheatsheet (#9)
* Add Cheatsheet
Fix bug in parsing an escaped "\-" inside a character class.
Fix failure to parse an empty alternative such as "(a|)".
---
examples/Cheatsheet.ipynb | 474 ++++++++++++++++++++++++++++++++++++
examples/regex.html | 4 +-
examples/tutorial.ipynb | 24 +-
regexmodel/datastructure.py | 12 +-
regexmodel/regexclass.py | 2 +-
5 files changed, 496 insertions(+), 20 deletions(-)
create mode 100644 examples/Cheatsheet.ipynb
diff --git a/examples/Cheatsheet.ipynb b/examples/Cheatsheet.ipynb
new file mode 100644
index 0000000..cdb06cf
--- /dev/null
+++ b/examples/Cheatsheet.ipynb
@@ -0,0 +1,474 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "aeb50a7b",
+ "metadata": {},
+ "source": [
+ "# All available regex constructions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "d8fc8322",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from regexmodel import RegexModel"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4c530177",
+ "metadata": {},
+ "source": [
+ "### Digits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "482b5ff1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Single digits: ['8', '2', '4', '2']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Single digits:\", [RegexModel(\"[0-9]\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "804dcf0c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Single digits with \\d: ['6', '6', '2', '5']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Single digits with \\\\d:\", [RegexModel(\"\\d\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "052ca6c7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sub range for digits (0-3): ['1', '3', '1', '3']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Sub range for digits (0-3): \", [RegexModel(\"[0-3]\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0f5c6adf",
+ "metadata": {},
+ "source": [
+ "### Letters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "8a228ff1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "lower case ['c', 't', 'z', 'e']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"lower case\", [RegexModel(\"[a-z]\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "d31647b1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "upper case ['J', 'W', 'O', 'J']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"upper case\", [RegexModel(\"[A-Z]\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "7f429ccd",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Lower and upper case: ['T', 'd', 'Q', 'K']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Lower and upper case:\", [RegexModel(\"[a-zA-Z]\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "1c00687a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sub ranges (d-f): ['e', 'f', 'd', 'f']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Sub ranges (d-f):\", [RegexModel(\"[d-f]\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a5f0124b",
+ "metadata": {},
+ "source": [
+ "### Single characters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "80b3dfe1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Always a: ['a', 'a', 'a', 'a']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Always a: \", [RegexModel(\"a\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "7816c11f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Also always a: ['a', 'a', 'a', 'a']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Also always a:\", [RegexModel(\"[a]\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "d7b9e7c2",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "a or b or c: ['a', 'a', 'b', 'a']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"a or b or c:\", [RegexModel(\"[abc]\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ea43f2bf",
+ "metadata": {},
+ "source": [
+ "### Multiple choice"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "3ce07810",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Within a character class (digit or a-e): ['8', '0', '4', '4']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Within a character class (digit or a-e):\", [RegexModel(\"[0-9a-e]\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "9d559dad",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Outside a character class (aaa or bbb): ['aaa', 'aaa', 'aaa', 'bbb']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Outside a character class (aaa or bbb):\", [RegexModel(\"(aaa|bbb)\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "d0a6acec",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "More than two choices (a or b or c): ['a', 'c', 'a', 'a']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"More than two choices (a or b or c):\", [RegexModel(\"(a|b|c)\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "eee80db7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Repeats change the probabilities: ['a', 'a', 'a', 'b']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Repeats change the probabilities:\", [RegexModel(\"(a|a|a|a|b)\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e697c821",
+ "metadata": {},
+ "source": [
+ "### Repeating character (classes)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "a0cabf11",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Repeat digit x3: ['451', '968', '921', '684']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Repeat digit x3:\", [RegexModel(\"\\d{3}\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "f1061116",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Repeat digit between 2x and 6x: ['7715', '1900', '409441', '28522']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Repeat digit between 2x and 6x:\", [RegexModel(\"\\d{2,6}\").draw() for _ in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "be9ee5f1",
+ "metadata": {},
+ "source": [
+ "## API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "034703b8",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'failed': 6,\n",
+ " 'success': 0,\n",
+ " 'n_tot_char': 6,\n",
+ " 'n_char_success': 0,\n",
+ " 'n_parameters': 10,\n",
+ " 'avg_log_like_per_char': -6.907755278982137,\n",
+ " 'avg_log_like_pc_success': 0.0}"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model = RegexModel.fit(10*[\"a\"]+10*[\"b\"]) # Create a model from data\n",
+ "model.regex # Get regex\n",
+ "model.draw() # Draw a value\n",
+ "model = RegexModel(\"\\d(a||ab)[A-Z]{3,6}\") # Create a model from a regex\n",
+ "model_data = model.serialize() # Serialize the model so that it can be stored in a JSON (or any other) file\n",
+ "model = RegexModel.deserialize(model_data) # Create the model from the serialization\n",
+ "model.fit_statistics(3*[\"a\"]+3*[\"b\"]) # Get statistical information on goodness of fit and more"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cffa0ac5",
+ "metadata": {},
+ "source": [
+ "## Visualization"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "bbc4847e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n",
+ "regex.html\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from regexmodel.visualization import regex_model_to_pyvis\n",
+ "\n",
+ "net = regex_model_to_pyvis(model)\n",
+ "net.show(\"regex.html\", notebook=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "216500f5",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/regex.html b/examples/regex.html
index 6935827..f0e6331 100644
--- a/examples/regex.html
+++ b/examples/regex.html
@@ -88,8 +88,8 @@
// parsing and collecting nodes and edges from the python
- nodes = new vis.DataSet([{"group": 2, "id": 0, "label": "start", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 1, "label": "[a-z]{3,19}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 2, "label": 2, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 3, "label": 3, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 18, "label": "[0-9]{2,2}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 4, "label": "[0-9]{2,2}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 5, "label": "[@]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 6, "label": "[a-z]{3,9}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 7, "label": 7, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 8, "label": "[\\-]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 10, "label": "[\\.]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 9, "label": "[a-z]{4,9}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 11, "label": 11, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 12, "label": "[c]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 15, "label": "[b-o]{1,4}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 13, "label": "[o]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 14, "label": "[m]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 16, "label": 16, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 17, "label": "[zta-z]{1,2}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 19, "label": "[@]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 20, "label": "[a-z]{10,10}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 21, "label": "[\\.]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 22, "label": "[a-z]{3,3}", "shape": "dot", "size": 10}]);
- edges = new vis.DataSet([{"arrows": "to", "from": 0, "to": 1, "width": 1}, {"arrows": "to", "from": 1, "to": 2, "width": 1}, {"arrows": "to", "from": 2, "to": 3, "width": 1}, {"arrows": "to", "from": 2, "to": 18, "width": 1}, {"arrows": "to", "from": 3, "to": 4, "width": 1}, {"arrows": "to", "from": 3, "to": 5, "width": 1}, {"arrows": "to", "from": 4, "to": 5, "width": 1}, {"arrows": "to", "from": 5, "to": 6, "width": 1}, {"arrows": "to", "from": 6, "to": 7, "width": 1}, {"arrows": "to", "from": 7, "to": 8, "width": 1}, {"arrows": "to", "from": 7, "to": 10, "width": 1}, {"arrows": "to", "from": 8, "to": 9, "width": 1}, {"arrows": "to", "from": 9, "to": 10, "width": 1}, {"arrows": "to", "from": 10, "to": 11, "width": 1}, {"arrows": "to", "from": 11, "to": 12, "width": 1}, {"arrows": "to", "from": 11, "to": 15, "width": 1}, {"arrows": "to", "from": 12, "to": 13, "width": 1}, {"arrows": "to", "from": 13, "to": 14, "width": 1}, {"arrows": "to", "from": 15, "to": 16, "width": 1}, {"arrows": "to", "from": 16, "to": 17, "width": 1}, {"arrows": "to", "from": 18, "to": 19, "width": 1}, {"arrows": "to", "from": 19, "to": 20, "width": 1}, {"arrows": "to", "from": 20, "to": 21, "width": 1}, {"arrows": "to", "from": 21, "to": 22, "width": 1}]);
+ nodes = new vis.DataSet([{"group": 2, "id": 0, "label": "start", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 1, "label": "[0-9]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 2, "label": 2, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 3, "label": "a", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 4, "label": "a", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 6, "label": "[A-Z]{3,6}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 5, "label": "b", "shape": "dot", "size": 10}]);
+ edges = new vis.DataSet([{"arrows": "to", "from": 0, "to": 1, "width": 1}, {"arrows": "to", "from": 1, "to": 2, "width": 1}, {"arrows": "to", "from": 2, "to": 3, "width": 1}, {"arrows": "to", "from": 2, "to": 4, "width": 1}, {"arrows": "to", "from": 2, "to": 6, "width": 1}, {"arrows": "to", "from": 3, "to": 6, "width": 1}, {"arrows": "to", "from": 4, "to": 5, "width": 1}, {"arrows": "to", "from": 5, "to": 6, "width": 1}]);
nodeColors = {};
allNodes = nodes.get({ returnType: "Object" });
diff --git a/examples/tutorial.ipynb b/examples/tutorial.ipynb
index c03383a..31196fe 100644
--- a/examples/tutorial.ipynb
+++ b/examples/tutorial.ipynb
@@ -116,16 +116,16 @@
{
"data": {
"text/plain": [
- "['xpfvzarhpg89@qttvo.com',\n",
- " 'opeju@jdabppwq.com',\n",
- " 'wsxlwwmgydyzyf75@maeumctg.com',\n",
- " 'vziulronmr95@fxajlcyug-skhkgitz.com',\n",
- " 'fbexmrrvrnqvrjp93@pewij.com',\n",
- " 'itgutzjljypmae04@oiunryvrf.com',\n",
- " 'okzr@anm-arsgej.hnmj',\n",
- " 'rkx70@olrsqdox.com',\n",
- " 'yhuwulpnduybivhsxh@xpo.com',\n",
- " 'waf@eqg-slqjkoij.com']"
+ "['brvc61@ocojvbiar.com',\n",
+ " 'hszltapze22@rewzb.com',\n",
+ " 'hwotequvt@duy.com',\n",
+ " 'dkimkedyjmzanzto60@ecm.com',\n",
+ " 'bbohcirqvmivp16@ass.com',\n",
+ " 'jdidhaidejznuxwbsep81@nwpqhuniah.dng',\n",
+ " 'huweuxbhsev83@kfbhetug-oybvj.com',\n",
+ " 'kcqpgtysdw74@bvndjdf.com',\n",
+ " 'txxbe72@ffzdykfrf.com',\n",
+ " 'xakqql@ncsdfyflv.com']"
]
},
"execution_count": 4,
@@ -158,7 +158,7 @@
{
"data": {
"text/plain": [
- "'[a-z]{3,19}((|[0-9]{2,2})[@][a-z]{3,9}(|[\\\\-][a-z]{4,9})[\\\\.]([c][o][m]|[b-o]{1,4}(|[zta-z]{1,2}))|[0-9]{2,2}[@][a-z]{10,10}[\\\\.][a-z]{3,3})'"
+ "'[a-z]{3,19}((|[0-9]{2})@[a-z]{3,9}(|\\\\-[a-z]{4,9})\\\\.(com|[b-o]{1,4}(|[zta-z]{1,2}))|[0-9]{2}@[a-z]{10}\\\\.[a-z]{3})'"
]
},
"execution_count": 5,
@@ -268,7 +268,7 @@
" "
],
"text/plain": [
- ""
+ ""
]
},
"execution_count": 7,
diff --git a/regexmodel/datastructure.py b/regexmodel/datastructure.py
index b8cc14d..86cfaad 100644
--- a/regexmodel/datastructure.py
+++ b/regexmodel/datastructure.py
@@ -259,18 +259,20 @@ def from_string(cls, # pylint: disable=too-many-return-statements
# Start of an OrNode construction.
if regex_str[0] == "(":
all_edges = []
- cur_regex_str = regex_str[1:]
- while cur_regex_str[0] != ")":
- new_edge, cur_regex_str = cls.from_string(cur_regex_str)
- all_edges.append(new_edge)
+ cur_regex_str = regex_str
+ while True:
+ next_edge, cur_regex_str = cls.from_string(cur_regex_str[1:])
+ all_edges.append(next_edge)
if len(cur_regex_str) == 0:
raise ValueError("Unterminated ')' in regex.")
+ if cur_regex_str[0] == ")":
+ break
next_edge, next_str = cls.from_string(cur_regex_str[1:])
return cls(OrNode(all_edges, next_edge), 1), next_str
# Continue with another branch of the OrRegex construction
if regex_str[0] == "|":
- return cls(None, 1), regex_str[1:]
+ return cls(None, 1), regex_str
# End of the OrNode construction
if regex_str[0] == ")":
diff --git a/regexmodel/regexclass.py b/regexmodel/regexclass.py
index 3f8b0b9..6981a20 100644
--- a/regexmodel/regexclass.py
+++ b/regexmodel/regexclass.py
@@ -418,7 +418,7 @@ def draw_once(self):
@classmethod
def from_string(cls, regex_str) -> Optional[tuple[BaseRegex, str]]:
_special_chars = [".", "+", "*", "?", "^", "$", "(", ")", "[", "]",
- "{", "}", "|", "\\"]
+ "{", "}", "|", "\\", "-"]
if len(regex_str) > 1 and regex_str[0] == "\\" and regex_str[1] in _special_chars:
return cls([_unescape(regex_str[1])]), regex_str[2:]
if len(regex_str) >= 1 and regex_str[0] != "\\" and regex_str[0] not in _special_chars: