From bb36561728d5dad5294e5fa4001c55f2e04f8fad Mon Sep 17 00:00:00 2001
From: qubixes <44498096+qubixes@users.noreply.github.com>
Date: Tue, 10 Oct 2023 13:15:46 +0200
Subject: [PATCH] Add cheatsheet (#9)

* Add Cheatsheet

Fix bug parsing escaped "\-" in character class.
Fix bug failing to parse "(a|)".
---
 examples/Cheatsheet.ipynb   | 474 ++++++++++++++++++++++++++++++++++++
 examples/regex.html         |   4 +-
 examples/tutorial.ipynb     |  24 +-
 regexmodel/datastructure.py |  12 +-
 regexmodel/regexclass.py    |   2 +-
 5 files changed, 496 insertions(+), 20 deletions(-)
 create mode 100644 examples/Cheatsheet.ipynb

diff --git a/examples/Cheatsheet.ipynb b/examples/Cheatsheet.ipynb
new file mode 100644
index 0000000..cdb06cf
--- /dev/null
+++ b/examples/Cheatsheet.ipynb
@@ -0,0 +1,474 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "aeb50a7b",
+   "metadata": {},
+   "source": [
+    "# All available regex constructions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "d8fc8322",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from regexmodel import RegexModel"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4c530177",
+   "metadata": {},
+   "source": [
+    "### Digits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "482b5ff1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Single digits: ['8', '2', '4', '2']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Single digits:\", [RegexModel(\"[0-9]\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "804dcf0c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Single digits with \\d: ['6', '6', '2', '5']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Single digits with \\\\d:\", [RegexModel(\"\\d\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "052ca6c7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sub range for digits (0-3):  ['1', '3', '1', '3']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Sub range for digits (0-3): \", [RegexModel(\"[0-3]\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0f5c6adf",
+   "metadata": {},
+   "source": [
+    "### Letters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "8a228ff1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "lower case ['c', 't', 'z', 'e']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"lower case\", [RegexModel(\"[a-z]\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "d31647b1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "upper case ['J', 'W', 'O', 'J']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"upper case\", [RegexModel(\"[A-Z]\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "7f429ccd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Lower and upper case: ['T', 'd', 'Q', 'K']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Lower and upper case:\", [RegexModel(\"[a-zA-Z]\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "1c00687a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sub ranges (d-f): ['e', 'f', 'd', 'f']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Sub ranges (d-f):\", [RegexModel(\"[d-f]\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a5f0124b",
+   "metadata": {},
+   "source": [
+    "### Single characters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "80b3dfe1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Always a:  ['a', 'a', 'a', 'a']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Always a: \",  [RegexModel(\"a\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "7816c11f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Also always a: ['a', 'a', 'a', 'a']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Also always a:\", [RegexModel(\"[a]\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "d7b9e7c2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "a or b or c: ['a', 'a', 'b', 'a']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"a or b or c:\", [RegexModel(\"[abc]\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ea43f2bf",
+   "metadata": {},
+   "source": [
+    "### Multiple choice"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "3ce07810",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Within a character class (digit or a-e): ['8', '0', '4', '4']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Within a character class (digit or a-e):\", [RegexModel(\"[0-9a-e]\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "9d559dad",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Outside a character class (aaa or bbb): ['aaa', 'aaa', 'aaa', 'bbb']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Outside a character class (aaa or bbb):\", [RegexModel(\"(aaa|bbb)\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "d0a6acec",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "More than two choices (a or b or c): ['a', 'c', 'a', 'a']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"More than two choices (a or b or c):\", [RegexModel(\"(a|b|c)\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "eee80db7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Repeats change the probabilities: ['a', 'a', 'a', 'b']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Repeats change the probabilities:\", [RegexModel(\"(a|a|a|a|b)\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e697c821",
+   "metadata": {},
+   "source": [
+    "### Repeating character (classes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "a0cabf11",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Repeat digit x3: ['451', '968', '921', '684']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Repeat digit x3:\", [RegexModel(\"\\d{3}\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "f1061116",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Repeat digit between 2x and 6x: ['7715', '1900', '409441', '28522']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Repeat digit between 2x and 6x:\", [RegexModel(\"\\d{2,6}\").draw() for _ in range(4)])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "be9ee5f1",
+   "metadata": {},
+   "source": [
+    "## API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "034703b8",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'failed': 6,\n",
+       " 'success': 0,\n",
+       " 'n_tot_char': 6,\n",
+       " 'n_char_success': 0,\n",
+       " 'n_parameters': 10,\n",
+       " 'avg_log_like_per_char': -6.907755278982137,\n",
+       " 'avg_log_like_pc_success': 0.0}"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model = RegexModel.fit(10*[\"a\"]+10*[\"b\"])  # Create a model from data\n",
+    "model.regex  # Get regex\n",
+    "model.draw()  # Draw a value\n",
+    "model = RegexModel(\"\\d(a||ab)[A-Z]{3,6}\")  # Create a model from a regex\n",
+    "model_data = model.serialize()  # Serialize the model so that it can be stored in a JSON (or any other) file\n",
+    "model = RegexModel.deserialize(model_data)  # Create the model from the serialization\n",
+    "model.fit_statistics(3*[\"a\"]+3*[\"b\"])  # Get statistical information on goodness of fit and more"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cffa0ac5",
+   "metadata": {},
+   "source": [
+    "## Visualization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "bbc4847e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning: When  cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.\n",
+      "regex.html\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "        <iframe\n",
+       "            width=\"1000px\"\n",
+       "            height=\"1000px\"\n",
+       "            src=\"regex.html\"\n",
+       "            frameborder=\"0\"\n",
+       "            allowfullscreen\n",
+       "            \n",
+       "        ></iframe>\n",
+       "        "
+      ],
+      "text/plain": [
+       "<IPython.lib.display.IFrame at 0x7ffe71904a90>"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from regexmodel.visualization import regex_model_to_pyvis\n",
+    "\n",
+    "net = regex_model_to_pyvis(model)\n",
+    "net.show(\"regex.html\", notebook=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "216500f5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/regex.html b/examples/regex.html
index 6935827..f0e6331 100644
--- a/examples/regex.html
+++ b/examples/regex.html
@@ -88,8 +88,8 @@ <h1></h1>
                   
 
                   // parsing and collecting nodes and edges from the python
-                  nodes = new vis.DataSet([{"group": 2, "id": 0, "label": "start", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 1, "label": "[a-z]{3,19}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 2, "label": 2, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 3, "label": 3, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 18, "label": "[0-9]{2,2}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 4, "label": "[0-9]{2,2}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 5, "label": "[@]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 6, "label": "[a-z]{3,9}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 7, "label": 7, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 8, "label": "[\\-]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 10, "label": "[\\.]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 9, "label": "[a-z]{4,9}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 11, "label": 11, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 12, "label": "[c]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 15, "label": "[b-o]{1,4}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 13, "label": "[o]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 14, "label": "[m]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 16, "label": 16, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 17, "label": "[zta-z]{1,2}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 19, "label": "[@]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 20, "label": "[a-z]{10,10}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 21, "label": "[\\.]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 22, "label": "[a-z]{3,3}", "shape": "dot", "size": 10}]);
-                  edges = new vis.DataSet([{"arrows": "to", "from": 0, "to": 1, "width": 1}, {"arrows": "to", "from": 1, "to": 2, "width": 1}, {"arrows": "to", "from": 2, "to": 3, "width": 1}, {"arrows": "to", "from": 2, "to": 18, "width": 1}, {"arrows": "to", "from": 3, "to": 4, "width": 1}, {"arrows": "to", "from": 3, "to": 5, "width": 1}, {"arrows": "to", "from": 4, "to": 5, "width": 1}, {"arrows": "to", "from": 5, "to": 6, "width": 1}, {"arrows": "to", "from": 6, "to": 7, "width": 1}, {"arrows": "to", "from": 7, "to": 8, "width": 1}, {"arrows": "to", "from": 7, "to": 10, "width": 1}, {"arrows": "to", "from": 8, "to": 9, "width": 1}, {"arrows": "to", "from": 9, "to": 10, "width": 1}, {"arrows": "to", "from": 10, "to": 11, "width": 1}, {"arrows": "to", "from": 11, "to": 12, "width": 1}, {"arrows": "to", "from": 11, "to": 15, "width": 1}, {"arrows": "to", "from": 12, "to": 13, "width": 1}, {"arrows": "to", "from": 13, "to": 14, "width": 1}, {"arrows": "to", "from": 15, "to": 16, "width": 1}, {"arrows": "to", "from": 16, "to": 17, "width": 1}, {"arrows": "to", "from": 18, "to": 19, "width": 1}, {"arrows": "to", "from": 19, "to": 20, "width": 1}, {"arrows": "to", "from": 20, "to": 21, "width": 1}, {"arrows": "to", "from": 21, "to": 22, "width": 1}]);
+                  nodes = new vis.DataSet([{"group": 2, "id": 0, "label": "start", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 1, "label": "[0-9]", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 2, "label": 2, "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 3, "label": "a", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 4, "label": "a", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 6, "label": "[A-Z]{3,6}", "shape": "dot", "size": 10}, {"color": "#97c2fc", "id": 5, "label": "b", "shape": "dot", "size": 10}]);
+                  edges = new vis.DataSet([{"arrows": "to", "from": 0, "to": 1, "width": 1}, {"arrows": "to", "from": 1, "to": 2, "width": 1}, {"arrows": "to", "from": 2, "to": 3, "width": 1}, {"arrows": "to", "from": 2, "to": 4, "width": 1}, {"arrows": "to", "from": 2, "to": 6, "width": 1}, {"arrows": "to", "from": 3, "to": 6, "width": 1}, {"arrows": "to", "from": 4, "to": 5, "width": 1}, {"arrows": "to", "from": 5, "to": 6, "width": 1}]);
 
                   nodeColors = {};
                   allNodes = nodes.get({ returnType: "Object" });
diff --git a/examples/tutorial.ipynb b/examples/tutorial.ipynb
index c03383a..31196fe 100644
--- a/examples/tutorial.ipynb
+++ b/examples/tutorial.ipynb
@@ -116,16 +116,16 @@
     {
      "data": {
       "text/plain": [
-       "['xpfvzarhpg89@qttvo.com',\n",
-       " 'opeju@jdabppwq.com',\n",
-       " 'wsxlwwmgydyzyf75@maeumctg.com',\n",
-       " 'vziulronmr95@fxajlcyug-skhkgitz.com',\n",
-       " 'fbexmrrvrnqvrjp93@pewij.com',\n",
-       " 'itgutzjljypmae04@oiunryvrf.com',\n",
-       " 'okzr@anm-arsgej.hnmj',\n",
-       " 'rkx70@olrsqdox.com',\n",
-       " 'yhuwulpnduybivhsxh@xpo.com',\n",
-       " 'waf@eqg-slqjkoij.com']"
+       "['brvc61@ocojvbiar.com',\n",
+       " 'hszltapze22@rewzb.com',\n",
+       " 'hwotequvt@duy.com',\n",
+       " 'dkimkedyjmzanzto60@ecm.com',\n",
+       " 'bbohcirqvmivp16@ass.com',\n",
+       " 'jdidhaidejznuxwbsep81@nwpqhuniah.dng',\n",
+       " 'huweuxbhsev83@kfbhetug-oybvj.com',\n",
+       " 'kcqpgtysdw74@bvndjdf.com',\n",
+       " 'txxbe72@ffzdykfrf.com',\n",
+       " 'xakqql@ncsdfyflv.com']"
       ]
      },
      "execution_count": 4,
@@ -158,7 +158,7 @@
     {
      "data": {
       "text/plain": [
-       "'[a-z]{3,19}((|[0-9]{2,2})[@][a-z]{3,9}(|[\\\\-][a-z]{4,9})[\\\\.]([c][o][m]|[b-o]{1,4}(|[zta-z]{1,2}))|[0-9]{2,2}[@][a-z]{10,10}[\\\\.][a-z]{3,3})'"
+       "'[a-z]{3,19}((|[0-9]{2})@[a-z]{3,9}(|\\\\-[a-z]{4,9})\\\\.(com|[b-o]{1,4}(|[zta-z]{1,2}))|[0-9]{2}@[a-z]{10}\\\\.[a-z]{3})'"
       ]
      },
      "execution_count": 5,
@@ -268,7 +268,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fcd1417d410>"
+       "<IPython.lib.display.IFrame at 0x7faadfdf17d0>"
       ]
      },
      "execution_count": 7,
diff --git a/regexmodel/datastructure.py b/regexmodel/datastructure.py
index b8cc14d..86cfaad 100644
--- a/regexmodel/datastructure.py
+++ b/regexmodel/datastructure.py
@@ -259,18 +259,20 @@ def from_string(cls,  # pylint: disable=too-many-return-statements
         # Start of an OrNode construction.
         if regex_str[0] == "(":
             all_edges = []
-            cur_regex_str = regex_str[1:]
-            while cur_regex_str[0] != ")":
-                new_edge, cur_regex_str = cls.from_string(cur_regex_str)
-                all_edges.append(new_edge)
+            cur_regex_str = regex_str
+            while True:
+                next_edge, cur_regex_str = cls.from_string(cur_regex_str[1:])
+                all_edges.append(next_edge)
                 if len(cur_regex_str) == 0:
                     raise ValueError("Unterminated ')' in regex.")
+                if cur_regex_str[0] == ")":
+                    break
             next_edge, next_str = cls.from_string(cur_regex_str[1:])
             return cls(OrNode(all_edges, next_edge), 1), next_str
 
         # Continue with another branch of the OrRegex construction
         if regex_str[0] == "|":
-            return cls(None, 1), regex_str[1:]
+            return cls(None, 1), regex_str
 
         # End of the OrNode construction
         if regex_str[0] == ")":
diff --git a/regexmodel/regexclass.py b/regexmodel/regexclass.py
index 3f8b0b9..6981a20 100644
--- a/regexmodel/regexclass.py
+++ b/regexmodel/regexclass.py
@@ -418,7 +418,7 @@ def draw_once(self):
     @classmethod
     def from_string(cls, regex_str) -> Optional[tuple[BaseRegex, str]]:
         _special_chars = [".", "+", "*", "?", "^", "$", "(", ")", "[", "]",
-                          "{", "}", "|", "\\"]
+                          "{", "}", "|", "\\", "-"]
         if len(regex_str) > 1 and regex_str[0] == "\\" and regex_str[1] in _special_chars:
             return cls([_unescape(regex_str[1])]), regex_str[2:]
         if len(regex_str) >= 1 and regex_str[0] != "\\" and regex_str[0] not in _special_chars: