
Commit 885bf59

add cli
committed
1 parent 76894a5 commit 885bf59

9 files changed (+1920, -74 lines changed)


cli/__init__.py

Whitespace-only changes.

Binary file (128 Bytes) not shown.
Binary file (555 Bytes) not shown.
Binary file (557 Bytes) not shown.

cli/typer.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
import typer
import subprocess
app = typer.Typer()

@app.command()
def run():
    # run FastAPI app
    subprocess.run(["uvicorn", "app.main:app", "--reload"])

@app.command()
def notebook():
    # run jupyter notebook
    subprocess.run(["jupyter", "notebook", "notebook/cadet_notebook.ipynb"])


if __name__ == "__main__":
    app()
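
The new cli/typer.py module registers two plain functions as subcommands on a single `typer.Typer()` app and shells out to uvicorn and Jupyter with `subprocess.run`; from the repository root it could be invoked with something like `python -m cli.typer run`. Below is a minimal sketch of extending the same pattern with one more subcommand; the `serve` command and its `--port` option are hypothetical and not part of this commit.

# Hypothetical sketch extending the CLI above; not part of this commit.
import subprocess

import typer

app = typer.Typer()


@app.command()
def serve(port: int = 8000):
    # Typer turns the annotated default into a --port option,
    # e.g. `python -m cli.typer serve --port 9000`.
    subprocess.run(["uvicorn", "app.main:app", "--reload", "--port", str(port)])


if __name__ == "__main__":
    app()
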
Lines changed: 379 additions & 0 deletions
@@ -0,0 +1,379 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "wntnoij68pnW"
   },
   "source": [
    "To begin, let's import spaCy and the create_object script. This includes a `create_object()` function that will generate a generic language object in the folder `new_lang/{language_name}`. All of the object's files are contained there."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "cEbm-W2K8pnZ",
    "outputId": "c58bbe15-1e4e-451f-bc06-03cfea70393f"
   },
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'slugify'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-1-9a165742ba02>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mspacy\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate_object\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcreate_object\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0mspacy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__version__\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/projects/cadet-the-notebook/util/create_object.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0msrsly\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpathlib\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mPath\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mslugify\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mslugify\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'slugify'"
     ]
    }
   ],
   "source": [
    "# Install needed util files if missing\n",
    "import spacy\n",
    "if 'google.colab' in str(get_ipython()):\n",
    "    !mkdir util\n",
    "    !wget -O /content/util/corpus.py https://raw.githubusercontent.com/New-Languages-for-NLP/cadet-the-notebook/main/util/corpus.py\n",
    "    !wget -O /content/util/create_object.py https://raw.githubusercontent.com/New-Languages-for-NLP/cadet-the-notebook/main/util/create_object.py\n",
    "    !wget -O /content/util/export.py https://raw.githubusercontent.com/New-Languages-for-NLP/cadet-the-notebook/main/util/export.py\n",
    "    !wget -O /content/util/tokenization.py https://raw.githubusercontent.com/New-Languages-for-NLP/cadet-the-notebook/main/util/tokenization.py\n",
    "    # colab currently uses spacy 2.2.4, need 3\n",
    "    if '3' not in spacy.__version__[:1]:\n",
    "        !pip install spacy --upgrade\n",
    "\n",
    "import spacy\n",
    "from util.create_object import create_object\n",
    "spacy.__version__\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9L-esYgz8pna",
    "outputId": "52bed5ba-1a25-40e6-9276-7ab32b2317ff"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'🍈 created language object for meow'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lang_name = 'Meow'\n",
    "lang_code = 'meow'\n",
    "direction = 'ltr'  # or 'rtl'\n",
    "has_case = True\n",
    "has_letters = True\n",
    "\n",
    "create_object(lang_name, lang_code, direction, has_case, has_letters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "p85uvVJL8pna",
    "outputId": "67338883-b4bb-42ef-f290-c829339a520b"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "base_config.cfg lex_attrs.py\t __pycache__\t texts\r\n",
      "corpus_json\t lookups\t setup.py\t tokenizer_exceptions.py\r\n",
      "examples.py\t meow.egg-info\t stop_words.py\r\n",
      "__init__.py\t project.yml\t syntax_iterators.py\r\n",
      "lemmatizer.py\t punctuation.py tag_map.py\r\n"
     ]
    }
   ],
   "source": [
    "!ls ./new_lang\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Kq4L_Zw58pnb"
   },
   "source": [
    "To assess how the tokenizer defaults will work with your language, add example sentences to the [`examples.py`](./new_lang/examples.py) file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "F7a2etRt8pnb",
    "outputId": "d6e158a5-efd4-4c0d-cf6b-ff3d5c085b4a"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><span style='border: 5px solid blue; margin:5px;'>I</span>&nbsp;<span style='border: 5px solid blue; margin:5px;'>can</span>&nbsp;<span style='border: 5px solid blue; margin:5px;'>haz</span>&nbsp;<span style='border: 5px solid blue; margin:5px;'>sentenz</span>&nbsp;<span style='border: 5px solid blue; margin:5px;'>.</span>&nbsp;</div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from IPython.core.display import HTML\n",
    "from util.tokenization import tokenization\n",
    "HTML(tokenization(lang_name))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "cgKwdYrv8pnb"
   },
   "source": [
    "To adjust the tokenizer, you can add unique exceptions or regular exceptions to the [tokenizer_exceptions.py](./new_lang/tokenizer_exceptions.py) file.\n",
    "\n",
    "- To join two tokens into one, add an exception such as `{'BIG YIKES': [{ORTH: 'BIG YIKES'}]}`\n",
    "- To split a token in two, use `{'Kummerspeck': [{ORTH: \"Kummer\"}, {ORTH: \"speck\"}]}`\n",
    "\n",
    "Note that in both cases we add a dictionary: the key is the string to match on, and the value is a list of the tokens it should produce. In the first case we get a single token where we would otherwise have two; in the second, two tokens where we would otherwise have one. You can find more details in the spaCy documentation and [here](https://new-languages-for-nlp.github.io/course-materials/w1/tokenization.html)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "9sXdiPJh8pnc"
   },
   "source": [
    "## Lookups\n",
    "\n",
    "The `create_object()` function creates a `new_lang/lookups` directory that contains three files. These are simple JSON lookups for unambiguous pos, lemma and features. You can add your data to these files and automatically update token values. Keep in mind that you'll need to find a balance between the convenience of automatically annotating tokens and the inconvenience of having to correct machine errors. Once you're done updating the files with your existing linguistic data, proceed to the next step."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "mtrtjnz98pnc"
   },
   "source": [
    "## Texts\n",
    "\n",
    "For us to identify frequent tokens for automatic annotation, you'll need to provide texts. Place your machine-readable UTF-8 text files in the `new_lang/texts` folder."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "TaFHNb5B8pnc",
    "outputId": "81d7d280-d0f5-4f83-941a-d044702ccb1f"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'texts': 1, 'tokens': 3912, 'unique_tokens': 761}\n"
     ]
    }
   ],
   "source": [
    "from util.corpus import make_corpus\n",
    "\n",
    "make_corpus(lang_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "lQXUIUpf8pnd"
   },
   "source": [
    "The output of `make_corpus` is a JSON file at [`new_lang/corpus_json/tokens.json`](./new_lang/corpus_json/tokens.json). For each token, you'll find a `text` key for the token's string as well as keys for `pos_`, `lemma_` and `ent_type_`. Keep in mind that this system is not able to process ambiguous lookups. Only enter data for tokens or spans with very little semantic variation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9KC5k_388pnd",
    "outputId": "b8db2d3b-5361-4adf-f819-45ec6a853263"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "🍉 To bulk annotate 33% of the corpus, add data to first 14 tokens\n",
      "🍅 To bulk annotate 50% of the corpus, add data to first 37 tokens\n",
      "🍒 To bulk annotate 66% of the corpus, add data to first 100 tokens\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import srsly\n",
    "from pathlib import Path\n",
    "\n",
    "def get_percentages():\n",
    "    thirds = []\n",
    "    halfs = []\n",
    "    two_thirds = []\n",
    "    tokens = srsly.read_json(Path.cwd() / 'new_lang' / 'corpus_json' / 'tokens.json')\n",
    "    tokens = srsly.json_loads(tokens)\n",
    "    for token in tokens:\n",
    "        if token['rank'] == 1:\n",
    "            total_tokens = token['count'] + token['remain']\n",
    "\n",
    "        percent_annotated = 1 - (token['remain'] / total_tokens)\n",
    "        percent_annotated = int(percent_annotated * 100)\n",
    "        if percent_annotated == 33:\n",
    "            thirds.append(token)\n",
    "        if percent_annotated == 50:\n",
    "            halfs.append(token)\n",
    "        if percent_annotated == 66:\n",
    "            two_thirds.append(token)\n",
    "    return thirds[0], halfs[0], two_thirds[0]\n",
    "\n",
    "# let percent_annotated = 1 - (token.remain / total_tokens);\n",
    "# let percent_annotated_str = (percent_annotated*100).toFixed(0);\n",
    "third, half, two_thirds = get_percentages()\n",
    "print(f\"\"\"\n",
    "🍉 To bulk annotate 33% of the corpus, add data to first {third['rank']} tokens\n",
    "🍅 To bulk annotate 50% of the corpus, add data to first {half['rank']} tokens\n",
    "🍒 To bulk annotate 66% of the corpus, add data to first {two_thirds['rank']} tokens\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "D5xc9rw58pne"
   },
   "source": [
    "Next we will export your texts and lookups as a TSV file in the CoreNLP format. This data can then be loaded into INCEpTION for annotation work."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "fRWHHE158pne",
    "outputId": "b8f04592-3202-4a83-8fd3-5bb26a7b2055"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'saved data to file /tmp/conllu_export.zip'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from util.export import download\n",
    "\n",
    "download(lang_name)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Kku6r3DX8pne"
   },
   "source": [
    "When you have completed all annotation work in INCEpTION, you're ready to begin model training. This final step will export your spaCy language object. From there you can follow the spaCy documentation on model training!\n",
    "\n",
    "1. package the object into a usable folder that can be moved and initialized using projects\n",
    "2. nlp.to_disk(\"/tmp/checkpoint\")?\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "QEOK3mAh8pne"
   },
   "outputs": [],
   "source": [
    "# Create a spaCy project file for your project.\n",
    "from util.project import make_project\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "B7mv_5zp8pne",
    "outputId": "78aef28a-b8af-424f-ed80-c034056737a1"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "created file /home/ajanco/projects/cadet-the-notebook/Meow.zip\n"
     ]
    }
   ],
   "source": [
    "import shutil\n",
    "from util.project import make_project\n",
    "\n",
    "new_lang = Path.cwd() / \"new_lang\"\n",
    "make_project(lang_name, lang_code)\n",
    "\n",
    "# make export directory\n",
    "export_path = Path.cwd() / lang_name\n",
    "\n",
    "\n",
    "# shutil.make_archive(\"zipped_sample_directory\", \"zip\", \"sample_directory\")\n",
    "shutil.make_archive(str(export_path), 'zip', str(new_lang))\n",
    "zip_file = Path.cwd() / (lang_name + '.zip')\n",
    "print(f'created file {zip_file}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Cadet Notebook",
   "language": "python",
   "name": "cadet-notebook"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  },
  "colab": {
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
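
The tokenizer-exceptions cell in the notebook above describes the `{match string: [token dicts]}` format without showing its effect. Below is a minimal sketch of the same idea applied directly in spaCy; the blank `de` pipeline and the `add_special_case` call are illustrative assumptions, not code from this commit or from Cadet's own utilities.

# Illustrative only: the {string: [{ORTH: ...}]} exception format from the
# notebook's tokenizer cell, applied to a blank spaCy pipeline.
import spacy
from spacy.attrs import ORTH

nlp = spacy.blank("de")

# Split one whitespace-delimited token into two, as in the Kummerspeck example.
nlp.tokenizer.add_special_case("Kummerspeck", [{ORTH: "Kummer"}, {ORTH: "speck"}])

doc = nlp("Ich habe Kummerspeck")
print([token.text for token in doc])
# expected: ['Ich', 'habe', 'Kummer', 'speck']
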
