diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3ac9f68 --- /dev/null +++ b/.gitignore @@ -0,0 +1,139 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +.vscode/ + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ diff --git a/README.md b/README.md index 85c7099..11d7b19 100644 --- a/README.md +++ b/README.md @@ -257,7 +257,7 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi from bnlp import POS bn_pos = POS() model_path = "model/bn_pos.pkl" - text = "আমি ভাত খাই।" + text = "আমি ভাত খাই।" # or you can pass ['আমি', 'ভাত', 'খাই', '।'] res = bn_pos.tag(model_path, text) print(res) # [('আমি', 'PPR'), ('ভাত', 'NC'), ('খাই', 'VM'), ('।', 'PU')] @@ -269,9 +269,11 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi from bnlp import POS bn_pos = POS() model_name = "pos_model.pkl" - tagged_sentences = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]] + train_data = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]] - bn_pos.train(model_name, tagged_sentences) + test_data = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]] + + bn_pos.train(model_name, train_data, test_data) ``` @@ -285,7 +287,7 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi from bnlp import NER bn_ner = NER() model_path = "model/bn_ner.pkl" - text = "সে ঢাকায় থাকে।" + text = "সে ঢাকায় থাকে।" # or you can pass ['সে', 'ঢাকায়', 'থাকে', '।'] result = bn_ner.tag(model_path, text) print(result) # [('সে', 'O'), ('ঢাকায়', 'S-LOC'), ('থাকে', 'O')] @@ -297,9 +299,11 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi from bnlp import NER bn_ner = NER() model_name = "ner_model.pkl" - tagged_sentences = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]] + train_data = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]] + + test_data = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]] - bn_ner.train(model_name, tagged_sentences) + bn_ner.train(model_name, train_data, test_data) ``` diff --git a/bnlp/__init__.py b/bnlp/__init__.py index 4bc55ec..b5fdc01 100644 --- a/bnlp/__init__.py +++ b/bnlp/__init__.py @@ -1,4 +1,4 @@ -__version__="3.1.1" +__version__="3.1.2" import os diff --git a/bnlp/ner.py b/bnlp/ner.py index 2016b7b..928a455 100644 --- a/bnlp/ner.py +++ b/bnlp/ner.py @@ -52,22 +52,21 @@ def tag(self, model_path, text): punctuations = string.punctuation+'।' with open(model_path, 'rb') as pkl_model: model = pickle.load(pkl_model) - basic_t = BasicTokenizer() - tokens = basic_t.tokenize(text) - tokens = [x for x in tokens if x not in punctuations] + if not isinstance(text, list): + basic_t = BasicTokenizer() + tokens = basic_t.tokenize(text) + tokens = [x for x in tokens if x not in punctuations] + else: + tokens = text sentence_features = [features(tokens, index) for index in range(len(tokens))] result = list(zip(tokens, model.predict([sentence_features])[0])) pkl_model.close() return result - def train(self, model_name, tagged_sentences): - # Split the dataset for training and testing - cutoff = int(.75 * len(tagged_sentences)) - training_sentences = tagged_sentences[:cutoff] - test_sentences = tagged_sentences[cutoff:] - - X_train, y_train = transform_to_dataset(training_sentences) - X_test, y_test = transform_to_dataset(test_sentences) + def train(self, model_name, train_data, test_data, average="micro"): + + X_train, y_train = transform_to_dataset(train_data) + X_test, y_test = transform_to_dataset(test_data) print(len(X_train)) print(len(X_test)) @@ -82,6 +81,8 @@ def train(self, model_name, tagged_sentences): y_pred = model.predict(X_test) print("Accuracy is: ") print(metrics.flat_accuracy_score(y_test, y_pred)) + print(f"F1 Score({average}) is: ") + print(metrics.flat_f1_score(y_test, y_pred, average=average)) pickle.dump(model, open(model_name, 'wb')) print("Model Saved!") diff --git a/bnlp/pos.py b/bnlp/pos.py index 5af4f83..08984d7 100644 --- a/bnlp/pos.py +++ b/bnlp/pos.py @@ -47,21 +47,20 @@ class POS: def tag(self, model_path, text): with open(model_path, 'rb') as pkl_model: model = pickle.load(pkl_model) - basic_t = BasicTokenizer() - tokens = basic_t.tokenize(text) + if not isinstance(text, list): + basic_t = BasicTokenizer() + tokens = basic_t.tokenize(text) + else: + tokens = text sentence_features = [features(tokens, index) for index in range(len(tokens))] result = list(zip(tokens, model.predict([sentence_features])[0])) pkl_model.close() return result - def train(self, model_name, tagged_sentences): - # Split the dataset for training and testing - cutoff = int(.75 * len(tagged_sentences)) - training_sentences = tagged_sentences[:cutoff] - test_sentences = tagged_sentences[cutoff:] - - X_train, y_train = transform_to_dataset(training_sentences) - X_test, y_test = transform_to_dataset(test_sentences) + def train(self, model_name, train_data, test_data, average="micro"): + + X_train, y_train = transform_to_dataset(train_data) + X_test, y_test = transform_to_dataset(test_data) print(len(X_train)) print(len(X_test)) @@ -76,6 +75,9 @@ def train(self, model_name, tagged_sentences): y_pred = model.predict(X_test) print("Accuracy is: ") print(metrics.flat_accuracy_score(y_test, y_pred)) + + print(f"F1 Score({average}) is: ") + print(metrics.flat_f1_score(y_test, y_pred, average=average)) pickle.dump(model, open(model_name, 'wb')) print("Model Saved!") diff --git a/docs/index.rst b/docs/index.rst index a27cfe2..804b938 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -289,7 +289,7 @@ Bengali POS Tagging from bnlp import POS bn_pos = POS() model_path = "model/bn_pos_model.pkl" - text = "আমি ভাত খাই।" + text = "আমি ভাত খাই।" # or you can pass token list res = bn_pos.tag(model_path, text) print(res) # [('আমি', 'PPR'), ('ভাত', 'NC'), ('খাই', 'VM'), ('।', 'PU')] @@ -302,9 +302,10 @@ Bengali POS Tagging from bnlp import POS bn_pos = POS() model_name = "pos_model.pkl" - tagged_sentences = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]] + train_data = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]] + test_data = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]] - bn_pos.train(model_name, tagged_sentences) + bn_pos.train(model_name, train_data, test_data) Bengali NER @@ -322,7 +323,7 @@ Bengali NER from bnlp import ner bn_ner = NER() model_path = "model/bn_pos_model.pkl" - text = "সে ঢাকায় থাকে।" + text = "সে ঢাকায় থাকে।" # or you can pass token list res = bn_ner.tag(model_path, text) print(res) # [('সে', 'O'), ('ঢাকায়', 'S-LOC'), ('থাকে', 'O')] @@ -335,9 +336,10 @@ Bengali NER from bnlp import NER bn_ner = NER() model_name = "ner_model.pkl" - tagged_sentences = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]] + train_data = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]] + test_data = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]] - bn_ner.train(model_name, tagged_sentences) + bn_ner.train(model_name, train_data, test_data) diff --git a/notebook/bnlp_colab_training.ipynb b/notebook/bnlp_colab_training.ipynb index 79684f4..af5ff14 100644 --- a/notebook/bnlp_colab_training.ipynb +++ b/notebook/bnlp_colab_training.ipynb @@ -1,6 +1,6 @@ { "nbformat": 4, - "nbformat_minor": 0, + "nbformat_minor": 2, "metadata": { "colab": { "name": "bnlp_colab_training.ipynb", @@ -18,53 +18,46 @@ "cells": [ { "cell_type": "markdown", + "source": [ + "\"Open" + ], "metadata": { "id": "view-in-github", "colab_type": "text" - }, - "source": [ - "\"Open" - ] + } }, { "cell_type": "markdown", - "metadata": { - "id": "0SQ0x9bh9QsL" - }, "source": [ "# BNLP\n", "\n", "BNLP is a natural language processing toolkit for Bengali Language. This tool will help you to tokenize Bengali text, Embedding Bengali words, Bengali POS Tagging, Construct Neural Model for Bengali NLP purposes.\n", "\n", "Here we are prodiving training approach of different model using **BNLP**" - ] + ], + "metadata": { + "id": "0SQ0x9bh9QsL" + } }, { "cell_type": "markdown", - "metadata": { - "id": "MuT4uyIf5-Gy" - }, "source": [ "## Installation" - ] + ], + "metadata": { + "id": "MuT4uyIf5-Gy" + } }, { "cell_type": "code", - "metadata": { - "id": "KJN642aj5nVc", - "outputId": "20f88496-2e42-47e1-b70d-e4dca8037351", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 462 - } - }, + "execution_count": 1, "source": [ "!pip install -U bnlp_toolkit" ], - "execution_count": 1, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ "Collecting bnlp_toolkit\n", " Downloading https://files.pythonhosted.org/packages/16/be/44d78b55ad8121cce1ca0bdbc7cf1db8d3f585006bacb08bd53ec8653957/bnlp_toolkit-3.0.0-py3-none-any.whl\n", @@ -91,30 +84,30 @@ "Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim->bnlp_toolkit) (1.24.3)\n", "Installing collected packages: python-crfsuite, sklearn-crfsuite, sentencepiece, bnlp-toolkit\n", "Successfully installed bnlp-toolkit-3.0.0 python-crfsuite-0.9.7 sentencepiece-0.1.91 sklearn-crfsuite-0.3.6\n" - ], - "name": "stdout" + ] } - ] + ], + "metadata": { + "id": "KJN642aj5nVc", + "outputId": "20f88496-2e42-47e1-b70d-e4dca8037351", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 462 + } + } }, { "cell_type": "markdown", - "metadata": { - "id": "IWy0qUdy6BY3" - }, "source": [ "## Downloading Bengali Processed Wikipedia Data " - ] + ], + "metadata": { + "id": "IWy0qUdy6BY3" + } }, { "cell_type": "code", - "metadata": { - "id": "AcwFE8le5yTF", - "outputId": "69cad5d1-3917-4376-bc81-ea1340cfd240", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 51 - } - }, + "execution_count": 2, "source": [ "#drive data download code\n", "!pip install -U -q PyDrive\n", @@ -132,36 +125,40 @@ "!unzip bn_wiki_data.txt.zip\n", "!rm -rf bn_wiki_data.txt.zip" ], - "execution_count": 2, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ "Archive: bn_wiki_data.txt.zip\n", " inflating: bn_wiki_data.txt \n" - ], - "name": "stdout" + ] + } + ], + "metadata": { + "id": "AcwFE8le5yTF", + "outputId": "69cad5d1-3917-4376-bc81-ea1340cfd240", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 } - ] + } }, { "cell_type": "markdown", - "metadata": { - "id": "350KPo4D6Z4o" - }, "source": [ "## Training\n", "\n", "Here we present `bengali sentencepiece`, `bengali word2vec`, `bengali fasttext` training on `bengali wikipedia data`\n", "\n", "Training time will depend on data size." - ] + ], + "metadata": { + "id": "350KPo4D6Z4o" + } }, { "cell_type": "markdown", - "metadata": { - "id": "I_wHJFOW6dlo" - }, "source": [ "### Training Bengali Sentencepice Model\n", "\n", @@ -169,18 +166,14 @@ "\n", "* `wiki_sp.model` \n", "* `wiki_sp.vecab`" - ] + ], + "metadata": { + "id": "I_wHJFOW6dlo" + } }, { "cell_type": "code", - "metadata": { - "id": "8l7DUWI66MD4", - "outputId": "d7710e45-6981-432e-96fb-9ec2b9c159ab", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 85 - } - }, + "execution_count": 3, "source": [ "from bnlp import SentencepieceTokenizer\n", "\n", @@ -190,25 +183,29 @@ "vocab_size = 30000\n", "bsp.train(data, model_prefix, vocab_size) " ], - "execution_count": 3, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ "punkt not found. downloading...\n", "[nltk_data] Downloading package punkt to /root/nltk_data...\n", "[nltk_data] Unzipping tokenizers/punkt.zip.\n", "wiki_sp.model and wiki_sp.vocab is saved on your current directory\n" - ], - "name": "stdout" + ] + } + ], + "metadata": { + "id": "8l7DUWI66MD4", + "outputId": "d7710e45-6981-432e-96fb-9ec2b9c159ab", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 85 } - ] + } }, { "cell_type": "markdown", - "metadata": { - "id": "k-k4Dszo61v2" - }, "source": [ "### Training Bengali Word2Vec Model\n", "\n", @@ -218,18 +215,14 @@ "* `wiki_word2vec.vector`\n", "* `wiki_word2vec.model.trainables.syn1neg.npy`\n", "* `wiki_word2vec..model.wv.vectors.npy`\n" - ] + ], + "metadata": { + "id": "k-k4Dszo61v2" + } }, { "cell_type": "code", - "metadata": { - "id": "OphHV5Yp60KW", - "outputId": "7ce2a259-6339-494d-e023-5ffbe787c774", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 88 - } - }, + "execution_count": 4, "source": [ "from bnlp import BengaliWord2Vec\n", "bwv = BengaliWord2Vec()\n", @@ -238,37 +231,42 @@ "vector_name = \"wiki_word2vec.vector\"\n", "bwv.train(data_file, model_name, vector_name)" ], - "execution_count": 4, "outputs": [ { "output_type": "stream", + "name": "stderr", "text": [ "/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:252: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n", " 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n" - ], - "name": "stderr" + ] }, { "output_type": "stream", + "name": "stdout", "text": [ "wiki_word2vec.model and wiki_word2vec.vector saved in your current directory.\n" - ], - "name": "stdout" + ] + } + ], + "metadata": { + "id": "OphHV5Yp60KW", + "outputId": "7ce2a259-6339-494d-e023-5ffbe787c774", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 88 } - ] + } }, { + "cell_type": "markdown", "source": [ "### Pre-training or resume Bengali word2vec training" ], - "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "from bnlp import BengaliWord2Vec\n", "bwv = BengaliWord2Vec()\n", @@ -278,38 +276,33 @@ "model_name = \"test_model.model\"\n", "vector_name = \"test_vector.vector\"\n", "bwv.pretrain(trained_model_path, data_file, model_name, vector_name, epochs=5)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": { - "id": "TAMgr4WT8x2a" - }, "source": [ "### Training Bengali Fasttext Model\n", "First of all install `fasttext` using `pip install fasttext` and restart runtime.\n", "\n", "After successfully training it will produce: \n", "* `wiki_fasttext.bin` " - ] + ], + "metadata": { + "id": "TAMgr4WT8x2a" + } }, { "cell_type": "code", - "metadata": { - "id": "JXptOhxg4s6r", - "outputId": "a9386ef0-032c-437e-c416-e34bce2b792e", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 258 - } - }, + "execution_count": 5, "source": [ "!pip install fasttext" ], - "execution_count": 5, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ "Collecting fasttext\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)\n", @@ -324,16 +317,21 @@ "Successfully built fasttext\n", "Installing collected packages: fasttext\n", "Successfully installed fasttext-0.9.2\n" - ], - "name": "stdout" + ] } - ] + ], + "metadata": { + "id": "JXptOhxg4s6r", + "outputId": "a9386ef0-032c-437e-c416-e34bce2b792e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 258 + } + } }, { "cell_type": "code", - "metadata": { - "id": "F67Yzdu08xBd" - }, + "execution_count": 1, "source": [ "from bnlp.embedding.fasttext import BengaliFasttext\n", "\n", @@ -343,44 +341,40 @@ "epoch = 1\n", "bft.train(data, model_name, epoch)" ], - "execution_count": 1, - "outputs": [] + "outputs": [], + "metadata": { + "id": "F67Yzdu08xBd" + } }, { "cell_type": "markdown", - "metadata": { - "id": "ZtsLVmOs9lgG" - }, "source": [ "### Training Bengali POS TAGGING CRF model\n", "\n", "After successfully training it will produce a trained model with accuracy on evaluation data: \n", "\n", "* `pos_model.pkl`" - ] + ], + "metadata": { + "id": "ZtsLVmOs9lgG" + } }, { "cell_type": "code", - "metadata": { - "id": "VUKhbkaBE-CV", - "outputId": "e3cd7857-9fec-42dc-d2c2-f037c3ab55f5", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 170 - } - }, + "execution_count": 2, "source": [ "from bnlp import POS\n", "bn_pos = POS()\n", "model_name = \"pos_model.pkl\"\n", - "tagged_sentences = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]]\n", + "train_data = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]]\n", + "test_data = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]]\n", "\n", - "bn_pos.train(model_name, tagged_sentences)" + "bn_pos.train(model_name, train_data, test_data)" ], - "execution_count": 2, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ "1\n", "1\n", @@ -391,45 +385,46 @@ "Accuracy is: \n", "0.1111111111111111\n", "Model Saved!\n" - ], - "name": "stdout" + ] } - ] + ], + "metadata": { + "id": "VUKhbkaBE-CV", + "outputId": "e3cd7857-9fec-42dc-d2c2-f037c3ab55f5", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 170 + } + } }, { "cell_type": "markdown", - "metadata": { - "id": "dPB7SBrKuSna" - }, "source": [ "## Training Bengali NER model\n", "After successfully training it will produce a trained model with accuracy on evaluation data:\n", "\n", "* `ner_model.pkl` " - ] + ], + "metadata": { + "id": "dPB7SBrKuSna" + } }, { "cell_type": "code", - "metadata": { - "id": "of_1lkdW917n", - "outputId": "b3d54074-cda1-46d9-b4f9-800c50e4ef18", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 170 - } - }, + "execution_count": 3, "source": [ "from bnlp import NER\n", "bn_ner = NER()\n", "model_name = \"ner_model.pkl\"\n", - "tagged_sentences = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]\n", + "train_data = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]\n", + "test_data = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]\n", "\n", - "bn_ner.train(model_name, tagged_sentences)" + "bn_ner.train(model_name, train_data, test_data)" ], - "execution_count": 3, "outputs": [ { "output_type": "stream", + "name": "stdout", "text": [ "2\n", "1\n", @@ -440,19 +435,26 @@ "Accuracy is: \n", "1.0\n", "Model Saved!\n" - ], - "name": "stdout" + ] } - ] + ], + "metadata": { + "id": "of_1lkdW917n", + "outputId": "b3d54074-cda1-46d9-b4f9-800c50e4ef18", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 170 + } + } }, { "cell_type": "code", + "execution_count": null, + "source": [], + "outputs": [], "metadata": { "id": "qVrYxT5DulwP" - }, - "source": [], - "execution_count": null, - "outputs": [] + } } ] } \ No newline at end of file diff --git a/setup.py b/setup.py index f8d17c5..c04b7f1 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setuptools.setup( name="bnlp_toolkit", - version="3.1.1", + version="3.1.2", author="Sagor Sarker", author_email="sagorhem3532@gmail.com", description="BNLP is a natural language processing toolkit for Bengali Language",