
Commit 885bf59

add cli
committed
1 parent 76894a5 commit 885bf59

9 files changed (+1920, -74 lines changed)


cli/__init__.py

Whitespace-only changes.

Binary file (128 Bytes) not shown.
Binary file (555 Bytes) not shown.
Binary file (557 Bytes) not shown.

cli/typer.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
import typer
import subprocess
app = typer.Typer()

@app.command()
def run():
    # run FastAPI app
    subprocess.run(["uvicorn", "app.main:app", "--reload"])

@app.command()
def notebook():
    # run jupyter notebook
    subprocess.run(["jupyter", "notebook", "notebook/cadet_notebook.ipynb"])


if __name__ == "__main__":
    app()
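
The new cli/typer.py module registers two plain functions as subcommands on a single `typer.Typer()` app and shells out to uvicorn and Jupyter with `subprocess.run`; from the repository root it could be invoked with something like `python -m cli.typer run`. Below is a minimal sketch of extending the same pattern with one more subcommand; the `serve` command and its `--port` option are hypothetical and not part of this commit.

# Hypothetical sketch extending the CLI above; not part of this commit.
import subprocess

import typer

app = typer.Typer()


@app.command()
def serve(port: int = 8000):
    # Typer turns the annotated default into a --port option,
    # e.g. `python -m cli.typer serve --port 9000`.
    subprocess.run(["uvicorn", "app.main:app", "--reload", "--port", str(port)])


if __name__ == "__main__":
    app()
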
Lines changed: 379 additions & 0 deletions
@@ -0,0 +1,379 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wntnoij68pnW"
   },
   "source": [
    "To begin, let's import spaCy and the create_object script. This includes a `create_object()` function that will generate a generic language object in the folder `new_lang/{language_name}`. All of the object's files are contained there."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "cEbm-W2K8pnZ",
    "outputId": "c58bbe15-1e4e-451f-bc06-03cfea70393f"
   },
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'slugify'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-1-9a165742ba02>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mspacy\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate_object\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcreate_object\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0mspacy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/projects/cadet-the-notebook/util/create_object.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0msrsly\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpathlib\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mPath\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mslugify\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mslugify\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'slugify'"
     ]
    }
   ],
   "source": [
    "# Install needed util files if missing\n",
    "import spacy\n",
    "if 'google.colab' in str(get_ipython()):\n",
    "    !mkdir util\n",
    "    !wget -O /content/util/corpus.py https://raw.githubusercontent.com/New-Languages-for-NLP/cadet-the-notebook/main/util/corpus.py\n",
    "    !wget -O /content/util/create_object.py https://raw.githubusercontent.com/New-Languages-for-NLP/cadet-the-notebook/main/util/create_object.py\n",
    "    !wget -O /content/util/export.py https://raw.githubusercontent.com/New-Languages-for-NLP/cadet-the-notebook/main/util/export.py\n",
    "    !wget -O /content/util/tokenization.py https://raw.githubusercontent.com/New-Languages-for-NLP/cadet-the-notebook/main/util/tokenization.py\n",
    "    # colab currently uses spacy 2.2.4, need 3\n",
    "    if '3' not in spacy.__version__[:1]:\n",
    "        !pip install spacy --upgrade\n",
    "\n",
    "import spacy\n",
    "from util.create_object import create_object\n",
    "spacy.__version__\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9L-esYgz8pna",
    "outputId": "52bed5ba-1a25-40e6-9276-7ab32b2317ff"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'🍈 created language object for meow'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lang_name = 'Meow'\n",
    "lang_code = 'meow'\n",
    "direction = 'ltr'  # or 'rtl'\n",
    "has_case = True\n",
    "has_letters = True\n",
    "\n",
    "create_object(lang_name, lang_code, direction, has_case, has_letters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "p85uvVJL8pna",
    "outputId": "67338883-b4bb-42ef-f290-c829339a520b"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "base_config.cfg lex_attrs.py\t __pycache__\t texts\r\n",
      "corpus_json\t lookups\t setup.py\t tokenizer_exceptions.py\r\n",
      "examples.py\t meow.egg-info\t stop_words.py\r\n",
      "__init__.py\t project.yml\t syntax_iterators.py\r\n",
      "lemmatizer.py\t punctuation.py tag_map.py\r\n"
     ]
    }
   ],
   "source": [
    "!ls ./new_lang\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Kq4L_Zw58pnb"
   },
   "source": [
    "To assess how the tokenizer defaults will work with your language, add example sentences to the [`examples.py`](./new_lang/examples.py) file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "F7a2etRt8pnb",
    "outputId": "d6e158a5-efd4-4c0d-cf6b-ff3d5c085b4a"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><span style='border: 5px solid blue; margin:5px;'>I</span>&nbsp;<span style='border: 5px solid blue; margin:5px;'>can</span>&nbsp;<span style='border: 5px solid blue; margin:5px;'>haz</span>&nbsp;<span style='border: 5px solid blue; margin:5px;'>sentenz</span>&nbsp;<span style='border: 5px solid blue; margin:5px;'>.</span>&nbsp;</div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from IPython.core.display import HTML\n",
    "from util.tokenization import tokenization\n",
    "HTML(tokenization(lang_name))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "cgKwdYrv8pnb"
   },
   "source": [
    "To adjust the tokenizer, you can add unique exceptions or regular exceptions to the [tokenizer_exceptions.py](./new_lang/tokenizer_exceptions.py) file.\n",
    "\n",
    "- To join two tokens into one, add an exception such as `{'BIG YIKES': [{ORTH: 'BIG YIKES'}]}`\n",
    "- To split a token in two, use `{'Kummerspeck': [{ORTH: \"Kummer\"}, {ORTH: \"speck\"}]}`\n",
    "\n",
    "Note that in both cases we add a dictionary: the key is the string to match on, and the value is a list of the tokens it should produce. In the first case we get a single token where we would otherwise have two; in the second, two tokens where we would otherwise have one. You can find more details in the spaCy documentation and [here](https://new-languages-for-nlp.github.io/course-materials/w1/tokenization.html)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "9sXdiPJh8pnc"
   },
   "source": [
    "## Lookups\n",
    "\n",
    "The `create_object()` function creates a `new_lang/lookups` directory that contains three files. These are simple JSON lookups for unambiguous pos, lemma and features. You can add your data to these files and automatically update token values. Keep in mind that you'll need to find a balance between the convenience of automatically annotating tokens and the inconvenience of having to correct machine errors. Once you're done updating the files with your existing linguistic data, proceed to the next step."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "mtrtjnz98pnc"
   },
   "source": [
    "## Texts\n",
    "\n",
    "For us to identify frequent tokens for automatic annotation, you'll need to provide texts. Place your machine-readable UTF-8 text files in the `new_lang/texts` folder."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "TaFHNb5B8pnc",
    "outputId": "81d7d280-d0f5-4f83-941a-d044702ccb1f"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'texts': 1, 'tokens': 3912, 'unique_tokens': 761}\n"
     ]
    }
   ],
   "source": [
    "from util.corpus import make_corpus\n",
    "\n",
    "make_corpus(lang_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "lQXUIUpf8pnd"
   },
   "source": [
    "The output of `make_corpus` is a JSON file at [`new_lang/corpus_json/tokens.json`](./new_lang/corpus_json/tokens.json). For each token, you'll find a `text` key for the token's string as well as keys for `pos_`, `lemma_` and `ent_type_`. Keep in mind that this system is not able to process ambiguous lookups. Only enter data for tokens or spans with very little semantic variation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9KC5k_388pnd",
    "outputId": "b8db2d3b-5361-4adf-f819-45ec6a853263"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "🍉 To bulk annotate 33% of the corpus, add data to first 14 tokens\n",
      "🍅 To bulk annotate 50% of the corpus, add data to first 37 tokens\n",
      "🍒 To bulk annotate 66% of the corpus, add data to first 100 tokens\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import srsly\n",
    "from pathlib import Path\n",
    "\n",
    "def get_percentages():\n",
    "    thirds = []\n",
    "    halfs = []\n",
    "    two_thirds = []\n",
    "    tokens = srsly.read_json(Path.cwd() / 'new_lang' / 'corpus_json' / 'tokens.json')\n",
    "    tokens = srsly.json_loads(tokens)\n",
    "    for token in tokens:\n",
    "        if token['rank'] == 1:\n",
    "            total_tokens = token['count'] + token['remain']\n",
    "\n",
    "        percent_annotated = 1 - (token['remain'] / total_tokens)\n",
    "        percent_annotated = int(percent_annotated * 100)\n",
    "        if percent_annotated == 33:\n",
    "            thirds.append(token)\n",
    "        if percent_annotated == 50:\n",
    "            halfs.append(token)\n",
    "        if percent_annotated == 66:\n",
    "            two_thirds.append(token)\n",
    "    return thirds[0], halfs[0], two_thirds[0]\n",
    "\n",
    "# let percent_annotated = 1 - (token.remain / total_tokens);\n",
    "# let percent_annotated_str = (percent_annotated*100).toFixed(0);\n",
    "third, half, two_thirds = get_percentages()\n",
    "print(f\"\"\"\n",
    "🍉 To bulk annotate 33% of the corpus, add data to first {third['rank']} tokens\n",
    "🍅 To bulk annotate 50% of the corpus, add data to first {half['rank']} tokens\n",
    "🍒 To bulk annotate 66% of the corpus, add data to first {two_thirds['rank']} tokens\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "D5xc9rw58pne"
   },
   "source": [
    "Next we will export your texts and lookups as a TSV file in the CoreNLP format. This data can then be loaded into INCEpTION for annotation work."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "fRWHHE158pne",
    "outputId": "b8f04592-3202-4a83-8fd3-5bb26a7b2055"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'saved data to file /tmp/conllu_export.zip'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from util.export import download\n",
    "\n",
    "download(lang_name)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Kku6r3DX8pne"
   },
   "source": [
    "When you have completed all annotation work in INCEpTION, you're ready to begin model training. This final step will export your spaCy language object. From there you can follow the spaCy documentation on model training!\n",
    "\n",
    "1. package the object into a usable folder that can be moved and initialized using projects\n",
    "2. nlp.to_disk(\"/tmp/checkpoint\")?\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "QEOK3mAh8pne"
   },
   "outputs": [],
   "source": [
    "# Create a spaCy project file for your project.\n",
    "from util.project import make_project\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "B7mv_5zp8pne",
    "outputId": "78aef28a-b8af-424f-ed80-c034056737a1"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "created file /home/ajanco/projects/cadet-the-notebook/Meow.zip\n"
     ]
    }
   ],
   "source": [
    "import shutil\n",
    "from util.project import make_project\n",
    "\n",
    "new_lang = Path.cwd() / \"new_lang\"\n",
    "make_project(lang_name, lang_code)\n",
    "\n",
    "# make export directory\n",
    "export_path = Path.cwd() / lang_name\n",
    "\n",
    "\n",
    "# shutil.make_archive(\"zipped_sample_directory\", \"zip\", \"sample_directory\")\n",
    "shutil.make_archive(str(export_path), 'zip', str(new_lang))\n",
    "zip_file = Path.cwd() / (lang_name + '.zip')\n",
    "print(f'created file {zip_file}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Cadet Notebook",
   "language": "python",
   "name": "cadet-notebook"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  },
  "colab": {
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
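
The tokenizer-exceptions cell in the notebook above describes the `{match string: [token dicts]}` format without showing its effect. Below is a minimal sketch of the same idea applied directly in spaCy; the blank `de` pipeline and the `add_special_case` call are illustrative assumptions, not code from this commit or from Cadet's own utilities.

# Illustrative only: the {string: [{ORTH: ...}]} exception format from the
# notebook's tokenizer cell, applied to a blank spaCy pipeline.
import spacy
from spacy.attrs import ORTH

nlp = spacy.blank("de")

# Split one whitespace-delimited token into two, as in the Kummerspeck example.
nlp.tokenizer.add_special_case("Kummerspeck", [{ORTH: "Kummer"}, {ORTH: "speck"}])

doc = nlp("Ich habe Kummerspeck")
print([token.text for token in doc])
# expected: ['Ich', 'habe', 'Kummer', 'speck']
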
