Merge branch 'update_ner_pos'

sagorbrur · Sep 11, 2021 · 38c38d9
2 parents 5ec8e4f + 4ce8199
commit 38c38d9
Show file tree

Hide file tree

Showing 8 changed files with 329 additions and 179 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,139 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+.vscode/
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
diff --git a/README.md b/README.md
@@ -257,7 +257,7 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
     from bnlp import POS
     bn_pos = POS()
     model_path = "model/bn_pos.pkl"
-    text = "আমি ভাত খাই।"
+    text = "আমি ভাত খাই।" # or you can pass ['আমি', 'ভাত', 'খাই', '।']
     res = bn_pos.tag(model_path, text)
     print(res)
     # [('আমি', 'PPR'), ('ভাত', 'NC'), ('খাই', 'VM'), ('।', 'PU')]
@@ -269,9 +269,11 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
     from bnlp import POS
     bn_pos = POS()
     model_name = "pos_model.pkl"
-    tagged_sentences = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]]
+    train_data = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]]
 
-    bn_pos.train(model_name, tagged_sentences)
+    test_data = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]]
+
+    bn_pos.train(model_name, train_data, test_data)
 
     ```
 
@@ -285,7 +287,7 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
     from bnlp import NER
     bn_ner = NER()
     model_path = "model/bn_ner.pkl"
-    text = "সে ঢাকায় থাকে।"
+    text = "সে ঢাকায় থাকে।" # or you can pass ['সে', 'ঢাকায়', 'থাকে', '।']
     result = bn_ner.tag(model_path, text)
     print(result)
     # [('সে', 'O'), ('ঢাকায়', 'S-LOC'), ('থাকে', 'O')]
@@ -297,9 +299,11 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
     from bnlp import NER
     bn_ner = NER()
     model_name = "ner_model.pkl"
-    tagged_sentences = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]
+    train_data = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]
+
+    test_data = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]
 
-    bn_ner.train(model_name, tagged_sentences)
+    bn_ner.train(model_name, train_data, test_data)
 
     ```
 

diff --git a/bnlp/__init__.py b/bnlp/__init__.py
@@ -1,4 +1,4 @@
-__version__="3.1.1"
+__version__="3.1.2"
 
 
 import os

diff --git a/bnlp/ner.py b/bnlp/ner.py
@@ -52,22 +52,21 @@ def tag(self, model_path, text):
         punctuations = string.punctuation+'।'
         with open(model_path, 'rb') as pkl_model:
             model = pickle.load(pkl_model)
-            basic_t = BasicTokenizer()
-            tokens = basic_t.tokenize(text)
-            tokens = [x for x in tokens if x not in punctuations]
+            if not isinstance(text, list):
+                basic_t = BasicTokenizer()
+                tokens = basic_t.tokenize(text)
+                tokens = [x for x in tokens if x not in punctuations]
+            else:
+                tokens = text
             sentence_features = [features(tokens, index) for index in range(len(tokens))]
             result = list(zip(tokens, model.predict([sentence_features])[0]))
             pkl_model.close()
             return result
 
-    def train(self, model_name, tagged_sentences):
-        # Split the dataset for training and testing
-        cutoff = int(.75 * len(tagged_sentences))
-        training_sentences = tagged_sentences[:cutoff]
-        test_sentences = tagged_sentences[cutoff:]
-
-        X_train, y_train = transform_to_dataset(training_sentences)
-        X_test, y_test = transform_to_dataset(test_sentences)
+    def train(self, model_name, train_data, test_data, average="micro"):
+
+        X_train, y_train = transform_to_dataset(train_data)
+        X_test, y_test = transform_to_dataset(test_data)
         print(len(X_train))
         print(len(X_test))
 
@@ -82,6 +81,8 @@ def train(self, model_name, tagged_sentences):
         y_pred = model.predict(X_test)
         print("Accuracy is: ")
         print(metrics.flat_accuracy_score(y_test, y_pred))
+        print(f"F1 Score({average}) is: ")
+        print(metrics.flat_f1_score(y_test, y_pred, average=average))
 
         pickle.dump(model, open(model_name, 'wb'))
         print("Model Saved!")
diff --git a/bnlp/pos.py b/bnlp/pos.py
@@ -47,21 +47,20 @@ class POS:
     def tag(self, model_path, text):
         with open(model_path, 'rb') as pkl_model:
             model = pickle.load(pkl_model)
-            basic_t = BasicTokenizer()
-            tokens = basic_t.tokenize(text)
+            if not isinstance(text, list):
+                basic_t = BasicTokenizer()
+                tokens = basic_t.tokenize(text)
+            else:
+                tokens = text
             sentence_features = [features(tokens, index) for index in range(len(tokens))]
             result = list(zip(tokens, model.predict([sentence_features])[0]))
             pkl_model.close()
             return result
 
-    def train(self, model_name, tagged_sentences):
-        # Split the dataset for training and testing
-        cutoff = int(.75 * len(tagged_sentences))
-        training_sentences = tagged_sentences[:cutoff]
-        test_sentences = tagged_sentences[cutoff:]
-
-        X_train, y_train = transform_to_dataset(training_sentences)
-        X_test, y_test = transform_to_dataset(test_sentences)
+    def train(self, model_name, train_data, test_data, average="micro"):
+
+        X_train, y_train = transform_to_dataset(train_data)
+        X_test, y_test = transform_to_dataset(test_data)
         print(len(X_train))
         print(len(X_test))
 
@@ -76,6 +75,9 @@ def train(self, model_name, tagged_sentences):
         y_pred = model.predict(X_test)
         print("Accuracy is: ")
         print(metrics.flat_accuracy_score(y_test, y_pred))
+
+        print(f"F1 Score({average}) is: ")
+        print(metrics.flat_f1_score(y_test, y_pred, average=average))
 
         pickle.dump(model, open(model_name, 'wb'))
         print("Model Saved!")
diff --git a/docs/index.rst b/docs/index.rst
@@ -289,7 +289,7 @@ Bengali POS Tagging
      from bnlp import POS
      bn_pos = POS()
      model_path = "model/bn_pos_model.pkl"
-     text = "আমি ভাত খাই।"
+     text = "আমি ভাত খাই।" # or you can pass token list
      res = bn_pos.tag(model_path, text)
      print(res)
      # [('আমি', 'PPR'), ('ভাত', 'NC'), ('খাই', 'VM'), ('।', 'PU')]
@@ -302,9 +302,10 @@ Bengali POS Tagging
      from bnlp import POS
      bn_pos = POS()
      model_name = "pos_model.pkl"
-     tagged_sentences = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]]
+     train_data = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]]
+     test_data = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]]
 
-     bn_pos.train(model_name, tagged_sentences)
+     bn_pos.train(model_name, train_data, test_data)
 
 
 Bengali NER
@@ -322,7 +323,7 @@ Bengali NER
      from bnlp import ner
      bn_ner = NER()
      model_path = "model/bn_pos_model.pkl"
-     text = "সে ঢাকায় থাকে।"
+     text = "সে ঢাকায় থাকে।" # or you can pass token list
      res = bn_ner.tag(model_path, text)
      print(res)
      # [('সে', 'O'), ('ঢাকায়', 'S-LOC'), ('থাকে', 'O')]
@@ -335,9 +336,10 @@ Bengali NER
      from bnlp import NER
      bn_ner = NER()
      model_name = "ner_model.pkl"
-     tagged_sentences = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]
+     train_data = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]
+     test_data = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]
 
-     bn_ner.train(model_name, tagged_sentences)
+     bn_ner.train(model_name, train_data, test_data)