Skip to content

Commit

Permalink
Merge branch 'update_ner_pos'
Browse files Browse the repository at this point in the history
  • Loading branch information
sagorbrur committed Sep 11, 2021
2 parents 5ec8e4f + 4ce8199 commit 38c38d9
Show file tree
Hide file tree
Showing 8 changed files with 329 additions and 179 deletions.
139 changes: 139 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
.vscode/

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
16 changes: 10 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
from bnlp import POS
bn_pos = POS()
model_path = "model/bn_pos.pkl"
text = "আমি ভাত খাই।"
text = "আমি ভাত খাই।" # or you can pass ['আমি', 'ভাত', 'খাই', '।']
res = bn_pos.tag(model_path, text)
print(res)
# [('আমি', 'PPR'), ('ভাত', 'NC'), ('খাই', 'VM'), ('।', 'PU')]
Expand All @@ -269,9 +269,11 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
from bnlp import POS
bn_pos = POS()
model_name = "pos_model.pkl"
tagged_sentences = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('', 'PU')]]
train_data = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('', 'PU')]]

bn_pos.train(model_name, tagged_sentences)
test_data = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('', 'PU')]]

bn_pos.train(model_name, train_data, test_data)

```

Expand All @@ -285,7 +287,7 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
from bnlp import NER
bn_ner = NER()
model_path = "model/bn_ner.pkl"
text = "সে ঢাকায় থাকে।"
text = "সে ঢাকায় থাকে।" # or you can pass ['সে', 'ঢাকায়', 'থাকে', '।']
result = bn_ner.tag(model_path, text)
print(result)
# [('সে', 'O'), ('ঢাকায়', 'S-LOC'), ('থাকে', 'O')]
Expand All @@ -297,9 +299,11 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
from bnlp import NER
bn_ner = NER()
model_name = "ner_model.pkl"
tagged_sentences = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]
train_data = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]

test_data = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]

bn_ner.train(model_name, tagged_sentences)
bn_ner.train(model_name, train_data, test_data)

```

Expand Down
2 changes: 1 addition & 1 deletion bnlp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__="3.1.1"
__version__="3.1.2"


import os
Expand Down
23 changes: 12 additions & 11 deletions bnlp/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,22 +52,21 @@ def tag(self, model_path, text):
punctuations = string.punctuation+'।'
with open(model_path, 'rb') as pkl_model:
model = pickle.load(pkl_model)
basic_t = BasicTokenizer()
tokens = basic_t.tokenize(text)
tokens = [x for x in tokens if x not in punctuations]
if not isinstance(text, list):
basic_t = BasicTokenizer()
tokens = basic_t.tokenize(text)
tokens = [x for x in tokens if x not in punctuations]
else:
tokens = text
sentence_features = [features(tokens, index) for index in range(len(tokens))]
result = list(zip(tokens, model.predict([sentence_features])[0]))
pkl_model.close()
return result

def train(self, model_name, tagged_sentences):
# Split the dataset for training and testing
cutoff = int(.75 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]

X_train, y_train = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(test_sentences)
def train(self, model_name, train_data, test_data, average="micro"):

X_train, y_train = transform_to_dataset(train_data)
X_test, y_test = transform_to_dataset(test_data)
print(len(X_train))
print(len(X_test))

Expand All @@ -82,6 +81,8 @@ def train(self, model_name, tagged_sentences):
y_pred = model.predict(X_test)
print("Accuracy is: ")
print(metrics.flat_accuracy_score(y_test, y_pred))
print(f"F1 Score({average}) is: ")
print(metrics.flat_f1_score(y_test, y_pred, average=average))

pickle.dump(model, open(model_name, 'wb'))
print("Model Saved!")
22 changes: 12 additions & 10 deletions bnlp/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,21 +47,20 @@ class POS:
def tag(self, model_path, text):
with open(model_path, 'rb') as pkl_model:
model = pickle.load(pkl_model)
basic_t = BasicTokenizer()
tokens = basic_t.tokenize(text)
if not isinstance(text, list):
basic_t = BasicTokenizer()
tokens = basic_t.tokenize(text)
else:
tokens = text
sentence_features = [features(tokens, index) for index in range(len(tokens))]
result = list(zip(tokens, model.predict([sentence_features])[0]))
pkl_model.close()
return result

def train(self, model_name, tagged_sentences):
# Split the dataset for training and testing
cutoff = int(.75 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]

X_train, y_train = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(test_sentences)
def train(self, model_name, train_data, test_data, average="micro"):

X_train, y_train = transform_to_dataset(train_data)
X_test, y_test = transform_to_dataset(test_data)
print(len(X_train))
print(len(X_test))

Expand All @@ -76,6 +75,9 @@ def train(self, model_name, tagged_sentences):
y_pred = model.predict(X_test)
print("Accuracy is: ")
print(metrics.flat_accuracy_score(y_test, y_pred))

print(f"F1 Score({average}) is: ")
print(metrics.flat_f1_score(y_test, y_pred, average=average))

pickle.dump(model, open(model_name, 'wb'))
print("Model Saved!")
14 changes: 8 additions & 6 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ Bengali POS Tagging
from bnlp import POS
bn_pos = POS()
model_path = "model/bn_pos_model.pkl"
text = "আমি ভাত খাই।"
text = "আমি ভাত খাই।" # or you can pass a list of tokens, e.g. ['আমি', 'ভাত', 'খাই', '।']
res = bn_pos.tag(model_path, text)
print(res)
# [('আমি', 'PPR'), ('ভাত', 'NC'), ('খাই', 'VM'), ('।', 'PU')]
Expand All @@ -302,9 +302,10 @@ Bengali POS Tagging
from bnlp import POS
bn_pos = POS()
model_name = "pos_model.pkl"
tagged_sentences = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('', 'PU')]]
train_data = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('', 'PU')]]
test_data = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('', 'PU')]]
bn_pos.train(model_name, tagged_sentences)
bn_pos.train(model_name, train_data, test_data)
Bengali NER
Expand All @@ -322,7 +323,7 @@ Bengali NER
from bnlp import NER
bn_ner = NER()
model_path = "model/bn_ner_model.pkl"
text = "সে ঢাকায় থাকে।"
text = "সে ঢাকায় থাকে।" # or you can pass a list of tokens, e.g. ['সে', 'ঢাকায়', 'থাকে', '।']
res = bn_ner.tag(model_path, text)
print(res)
# [('সে', 'O'), ('ঢাকায়', 'S-LOC'), ('থাকে', 'O')]
Expand All @@ -335,9 +336,10 @@ Bengali NER
from bnlp import NER
bn_ner = NER()
model_name = "ner_model.pkl"
tagged_sentences = [[('ত্রাণ', 'O'),('', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]
train_data = [[('ত্রাণ', 'O'),('', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]
test_data = [[('ত্রাণ', 'O'),('', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]
bn_ner.train(model_name, tagged_sentences)
bn_ner.train(model_name, train_data, test_data)
Expand Down
Loading

0 comments on commit 38c38d9

Please sign in to comment.