Create thai-tokenizer.ipynb

PyThaiNLP · Apr 8, 2024 · a741d86 · a741d86
1 parent 5caebe1
commit a741d86
Showing 1 changed file with 226 additions and 0 deletions.
diff --git a/source/notebooks/thai-tokenizer.ipynb b/source/notebooks/thai-tokenizer.ipynb
@@ -0,0 +1,226 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ตัดคำภาษาไทยด้วย PyThaiNLP\n",
+    "\n",
+    "PyThaiNLP รองรับการตัดคำทั้งแบบใช้พจนานุกรม และ deep learning โดยค่าเริ่มต้นของ PyThaiNLP ทำงานโดยใช้พจนานุกรมตัดคำด้วยตัวตัดคำที่ชื่อว่า newmm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -q pythainlp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pythainlp.tokenize import word_tokenize"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = \"ก็จะรู้ความชั่วร้ายที่ทำไว้     และคงจะไม่ยอมให้ทำนาบนหลังคน \""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ค่าเริ่มต้น (newmm):\n",
+      "['ก็', 'จะ', 'รู้ความ', 'ชั่วร้าย', 'ที่', 'ทำ', 'ไว้', '     ', 'และ', 'คงจะ', 'ไม่', 'ยอมให้', 'ทำนาบนหลังคน', ' ']\n",
+      "\n",
+      "newmm และ keep_whitespace=False:\n",
+      "['ก็', 'จะ', 'รู้ความ', 'ชั่วร้าย', 'ที่', 'ทำ', 'ไว้', 'และ', 'คงจะ', 'ไม่', 'ยอมให้', 'ทำนาบนหลังคน']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"ค่าเริ่มต้น (newmm):\")\n",
+    "print(word_tokenize(text))\n",
+    "print(\"\\nnewmm และ keep_whitespace=False:\")\n",
+    "print(word_tokenize(text, keep_whitespace=False)) # ถ้าเป็น False จะไม่เก็บช่องว่างหลังการตัดคำไว้"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "นอกจากนั้นคุณสามารถเลือกตัวตัดคำอื่น ๆ ได้ เช่น longest เป็นต้น คุณสามารถอ่านตัวตัดคำที่รองรับได้จากเอกสาร API ของ PyThaiNLP ตามรุ่นของ PyThaiNLP ที่คุณเรียกใช้งาน ได้ที่ https://pythainlp.github.io/docs/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "newmm  : ['กฎหมายแรงงาน', 'ฉบับ', 'ปรับปรุง', 'ใหม่', 'ประกาศ', 'ใช้แล้ว']\n",
+      "longest: ['กฎหมายแรงงาน', 'ฉบับ', 'ปรับปรุง', 'ใหม่', 'ประกาศใช้', 'แล้ว']\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pythainlp.tokenize import word_tokenize\n",
+    "\n",
+    "text = \"กฎหมายแรงงานฉบับปรับปรุงใหม่ประกาศใช้แล้ว\"\n",
+    "\n",
+    "print(\"newmm  :\", word_tokenize(text))  # default engine is \"newmm\"\n",
+    "print(\"longest:\", word_tokenize(text, engine=\"longest\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ปรับแต่งพจนานุกรมตัดคำภาษาไทยใน PyThaiNLP\n",
+    "\n",
+    "หากคุณต้องการปรับแต่งตัวตัดคำให้เพิ่มคำศัพท์ตามที่คุณต้องการ แก้ไข หรือลบคำศัพท์ คุณสามารถทำได้ดังนี้"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "newmm (custom dictionary): ['กฎหมาย', 'แรงงาน', 'ฉบับปรับปรุงใหม่ประกาศใช้แล้ว', ' ', 'พายไทยก็พร้อมปรับปรุง']\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pythainlp.tokenize import Tokenizer\n",
+    "\n",
+    "text = \"กฎหมายแรงงานฉบับปรับปรุงใหม่ประกาศใช้แล้ว พายไทยก็พร้อมปรับปรุง\"\n",
+    "\n",
+    "words = [\"แรงงาน\"]  # รายการคำศัพท์ ให้มีเพียงคำเดียว\n",
+    "custom_tokenizer = Tokenizer(words)\n",
+    "print(\"newmm (custom dictionary):\", custom_tokenizer.word_tokenize(text))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### เรียกใช้พจนานุกรมของ PyThaiNLP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pythainlp.corpus import thai_words"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "newmm : ['กฎหมายแรงงาน', 'ฉบับ', 'ปรับปรุง', 'ใหม่', 'ประกาศ', 'ใช้แล้ว', ' ', 'พาย', 'ไทย', 'ก็', 'พร้อม', 'ปรับปรุง']\n"
+     ]
+    }
+   ],
+   "source": [
+    "list_words = list(thai_words())\n",
+    "custom_tokenizer = Tokenizer(list_words)\n",
+    "print(\"newmm :\", custom_tokenizer.word_tokenize(text))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "newmm  (custom dictionary): ['กฎหมายแรงงาน', 'ฉบับ', 'ปรับปรุง', 'ใหม่', 'ประกาศใช้', 'แล้ว', ' ', 'พายไทย', 'ก็', 'พร้อม', 'ปรับปรุง']\n"
+     ]
+    }
+   ],
+   "source": [
+    "list_words = list(thai_words())\n",
+    "list_words.append(\"พายไทย\")  # เพิ่ม \"พายไทย\"\n",
+    "list_words.remove(\"ใช้แล้ว\")  # ลบ \"ใช้แล้ว\"\n",
+    "custom_tokenizer = Tokenizer(list_words)\n",
+    "print(\"newmm  (custom dictionary):\", custom_tokenizer.word_tokenize(text))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# more\n",
+    "# WIP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.13 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "a1d6ff38954a1cdba4cf61ffa51e42f4658fc35985cd256cd89123cae8466a39"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}