Merge pull request #15 from sagorbrur/dev

bnlp 3.0.0 final release
sagorbrur · Oct 20, 2020 · 242dee5 · 242dee5
2 parents 14cbf42 + a4f0946
commit 242dee5
Show file tree

Hide file tree

Showing 26 changed files with 236 additions and 298 deletions.
diff --git a/README.md b/README.md
@@ -13,42 +13,14 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
 
 **NB: Any researcher who refers to this tool in his/her paper, please let us know; we will include the paper link here**</br>
 
-**BNLP 3.0 dev build successfully**. Check it out [here](https://pypi.org/project/bnlp-toolkit/3.0.0.dev3/)
-
-# Contents
-- [Current Features](#current-features)
-- [Installation](#installation)
-- [Pretrained Model](#pretrained-model)
-- [Tokenization](#tokenization)
-- [Embedding](#word-embedding)
-- [POS Tagging](#bengali-pos-tagging)
-- [NER](#bengali-ner)
-- [Issue](#issue)
-- [Contributor Guide](#contributor-guide)
-- [Contributor List](#contributor-list)
-- [Documentation](https://bnlp.readthedocs.io/en/latest/)
-- [Notebook](https://github.com/sagorbrur/bnlp/tree/master/notebook)
-
-
-## Current Features
-* [Bengali Tokenization](#tokenization)
-  - SentencePiece Tokenizer
-  - Basic Tokenizer
-  - NLTK Tokenizer
-* [Bengali Word Embedding](#word-embedding)
-  - Bengali Word2Vec
-  - Bengali Fasttext
-  - Bengali GloVe
-
-* [Bengali POS Tagging](#bengali-pos-tagging)
-* [Bengali Name Entity Recognition](#bengali-ner)
-
 
 ## Installation
 
 ### PIP installer(python 3.5, 3.6, 3.7 tested okay)
 
-  ```pip install bnlp_toolkit```
+  ```
+  pip install bnlp_toolkit
+  ```
 
 ### Local Installer
   ```
@@ -83,44 +55,15 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
 
 ## Tokenization
 
-* **Bengali SentencePiece Tokenization**
-
-  - tokenization using trained model
-    ```py
-    from bnlp.sentencepiece_tokenizer import SP_Tokenizer
-
-    bsp = SP_Tokenizer()
-    model_path = "./model/bn_spm.model"
-    input_text = "আমি ভাত খাই। সে বাজারে যায়।"
-    tokens = bsp.tokenize(model_path, input_text)
-    print(tokens)
-    text2id = bsp.text2id(model_path, input_text)
-    print(text2id)
-    id2text = bsp.id2text(model_path, text2id)
-    print(id2text)
-
-    ```
-  - Training SentencePiece
-    ```py
-    from bnlp.sentencepiece_tokenizer import SP_Tokenizer
-
-    bsp = SP_Tokenizer()
-    data = "test.txt"
-    model_prefix = "test"
-    vocab_size = 5
-    bsp.train_bsp(data, model_prefix, vocab_size) 
-
-    ```
-
 * **Basic Tokenizer**
 
 
 
   ```py
-  from bnlp.basic_tokenizer import BasicTokenizer
-  basic_t = BasicTokenizer()
+  from bnlp import BasicTokenizer
+  basic_tokenizer = BasicTokenizer()
   raw_text = "আমি বাংলায় গান গাই।"
-  tokens = basic_t.tokenize(raw_text)
+  tokens = basic_tokenizer.tokenize(raw_text)
   print(tokens)
 
   # output: ["আমি", "বাংলায়", "গান", "গাই", "।"]
@@ -130,10 +73,10 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
 * **NLTK Tokenization**
 
   ```py
-  from bnlp.nltk_tokenizer import NLTK_Tokenizer
-
+  from bnlp import NLTKTokenizer
+
+  bnltk = NLTKTokenizer()
   text = "আমি ভাত খাই। সে বাজারে যায়। তিনি কি সত্যিই ভালো মানুষ?"
-  bnltk = NLTK_Tokenizer()
   word_tokens = bnltk.word_tokenize(text)
   sentence_tokens = bnltk.sentence_tokenize(text)
   print(word_tokens)
@@ -146,17 +89,48 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
   ```
 
 
+* **Bengali SentencePiece Tokenization**
+
+  - tokenization using trained model
+    ```py
+    from bnlp import SentencepieceTokenizer
+
+    bsp = SentencepieceTokenizer()
+    model_path = "./model/bn_spm.model"
+    input_text = "আমি ভাত খাই। সে বাজারে যায়।"
+    tokens = bsp.tokenize(model_path, input_text)
+    print(tokens)
+    text2id = bsp.text2id(model_path, input_text)
+    print(text2id)
+    id2text = bsp.id2text(model_path, text2id)
+    print(id2text)
+
+    ```
+  - Training SentencePiece
+    ```py
+    from bnlp import SentencepieceTokenizer
+
+    bsp = SentencepieceTokenizer()
+    data = "test.txt"
+    model_prefix = "test"
+    vocab_size = 5
+    bsp.train(data, model_prefix, vocab_size) 
+
+    ```
+
+
+
 ## Word Embedding
 
 * **Bengali Word2Vec**
 
   - Generate Vector using pretrain model
 
     ```py
-    from bnlp.bengali_word2vec import Bengali_Word2Vec
+    from bnlp import BengaliWord2Vec
 
-    bwv = Bengali_Word2Vec()
-    model_path = "model/bengali_word2vec.model"
+    bwv = BengaliWord2Vec()
+    model_path = "bengali_word2vec.model"
     word = 'আমার'
     vector = bwv.generate_word_vector(model_path, word)
     print(vector.shape)
@@ -167,40 +141,43 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
   - Find Most Similar Word Using Pretrained Model
 
     ```py
-    from bnlp.bengali_word2vec import Bengali_Word2Vec
+    from bnlp import BengaliWord2Vec
 
-    bwv = Bengali_Word2Vec()
-    model_path = "model/bengali_word2vec.model"
-    word = 'আমার'
+    bwv = BengaliWord2Vec()
+    model_path = "bengali_word2vec.model"
+    word = 'গ্রাম'
     similar = bwv.most_similar(model_path, word)
     print(similar)
 
     ```
   - Train Bengali Word2Vec with your own data
 
     ```py
-    from bnlp.bengali_word2vec import Bengali_Word2Vec
-    bwv = Bengali_Word2Vec(True)
-    data_file = "test.txt"
+    from bnlp import BengaliWord2Vec
+    bwv = BengaliWord2Vec()
+    data_file = "sample.txt"
     model_name = "test_model.model"
     vector_name = "test_vector.vector"
-    bwv.train_word2vec(data_file, model_name, vector_name)
+    bwv.train(data_file, model_name, vector_name)
 
 
     ```
 
  * **Bengali FastText**
 
+    To use `fasttext` you need to install fasttext manually by `pip install fasttext==0.9.2`
+
+    NB: `fasttext` may not work on `windows`; it will only work on `linux`
 
     - Generate Vector Using Pretrained Model
 
 
       ```py
-      from bnlp.bengali_fasttext import Bengali_Fasttext
+      from bnlp.embedding.fasttext import BengaliFasttext
 
-      bft = Bengali_Fasttext()
+      bft = BengaliFasttext()
       word = "গ্রাম"
-      model_path = "model/bengali_fasttext.bin"
+      model_path = "bengali_fasttext_wiki.bin"
       word_vector = bft.generate_word_vector(model_path, word)
       print(word_vector.shape)
       print(word_vector)
@@ -210,13 +187,13 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
     - Train Bengali FastText Model
 
       ```py
-      from bnlp.bengali_fasttext import Bengali_Fasttext
+      from bnlp.embedding.fasttext import BengaliFasttext
 
-      bft = Bengali_Fasttext()
-      data = "data.txt"
+      bft = BengaliFasttext()
+      data = "sample.txt"
       model_name = "saved_model.bin"
       epoch = 50
-      bft.train_fasttext(data, model_name, epoch)
+      bft.train(data, model_name, epoch)
       ```
 
 * **Bengali GloVe Word Vectors**
@@ -225,10 +202,10 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
   You can download and use it on your different machine learning purposes.
 
   ```py
-  from bnlp.glove_wordvector import BN_Glove
+  from bnlp import BengaliGlove
   glove_path = "bn_glove.39M.100d.txt"
   word = "গ্রাম"
-  bng = BN_Glove()
+  bng = BengaliGlove()
   res = bng.closest_word(glove_path, word)
   print(res)
   vec = bng.word2vec(glove_path, word)
@@ -243,7 +220,7 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
   - Find Pos Tag Using Pretrained Model
 
     ```py
-    from bnlp.pos import POS
+    from bnlp import POS
     bn_pos = POS()
     model_path = "model/bn_pos.pkl"
     text = "আমি ভাত খাই।"
@@ -255,7 +232,7 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
   - Train POS Tag Model
 
     ```py
-    from bnlp.pos import POS
+    from bnlp import POS
     bn_pos = POS()
     model_name = "pos_model.pkl"
     tagged_sentences = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]]
@@ -271,7 +248,7 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
   - Find NER Tag Using Pretrained Model
 
     ```py
-    from bnlp.ner import NER
+    from bnlp import NER
     bn_ner = NER()
     model_path = "model/bn_ner.pkl"
     text = "সে ঢাকায় থাকে।"
@@ -283,7 +260,7 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
   - Train NER Tag Model
 
     ```py
-    from bnlp.ner import NER
+    from bnlp import NER
     bn_ner = NER()
     model_name = "ner_model.pkl"
     tagged_sentences = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]
@@ -292,17 +269,30 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
 
     ```
 
+## Bengali Corpus Class
 
-## Issue
-* if `ModuleNotFoundError: No module named 'fasttext'` problem arise please do the next line
+* Stopwords and Punctuations
+  ```py
+  from bnlp.corpus import stopwords, punctuations
+
+  stopwords = stopwords() 
+  print(stopwords)
+  print(punctuations)
+
+  ```
 
-```pip install fasttext```
-* if `nltk` issue arise please do the following line before importing `bnlp`
+* Remove stopwords from Text
 
-```py
-import nltk
-nltk.download("punkt")
-```
+    ```py
+    from bnlp.corpus import stopwords
+    from bnlp.corpus.util import remove_stopwords
+
+    stopwords = stopwords()
+    raw_text = 'আমি ভাত খাই।' 
+    result = remove_stopwords(raw_text, stopwords)
+    print(result)
+    # ['ভাত', 'খাই', '।']
+    ```
 
 
 ## Contributor Guide

diff --git a/bnlp/__init__.py b/bnlp/__init__.py
@@ -1,15 +1,15 @@
-__version__="2.0.0"
+__version__="3.0.0"
 
 
 import os
-from bnlp.sentencepiece_tokenizer import SP_Tokenizer
-from bnlp.nltk_tokenizer import NLTK_Tokenizer
-from bnlp.basic_tokenizer import BasicTokenizer
-from bnlp.bengali_word2vec import Bengali_Word2Vec
-from bnlp.bengali_fasttext import Bengali_Fasttext
-from bnlp.glove_wordvector import BN_Glove
 from bnlp.pos import POS
 from bnlp.ner import NER
+from bnlp.tokenizer.nltk import NLTKTokenizer
+from bnlp.tokenizer.basic import BasicTokenizer
+from bnlp.tokenizer.sentencepiece import SentencepieceTokenizer
+from bnlp.embedding.word2vec import BengaliWord2Vec
+from bnlp.embedding.glove import BengaliGlove
+
 
 
 
diff --git a/bnlp/__pycache__/__init__.cpython-36.pyc b/bnlp/__pycache__/__init__.cpython-36.pyc
diff --git a/bnlp/__pycache__/basic_tokenizer.cpython-36.pyc b/bnlp/__pycache__/basic_tokenizer.cpython-36.pyc
diff --git a/bnlp/__pycache__/bengali_fasttext.cpython-36.pyc b/bnlp/__pycache__/bengali_fasttext.cpython-36.pyc
diff --git a/bnlp/__pycache__/bengali_pos.cpython-36.pyc b/bnlp/__pycache__/bengali_pos.cpython-36.pyc
diff --git a/bnlp/__pycache__/bengali_word2vec.cpython-36.pyc b/bnlp/__pycache__/bengali_word2vec.cpython-36.pyc
diff --git a/bnlp/__pycache__/glove_wordvector.cpython-36.pyc b/bnlp/__pycache__/glove_wordvector.cpython-36.pyc
diff --git a/bnlp/__pycache__/nltk_tokenizer.cpython-36.pyc b/bnlp/__pycache__/nltk_tokenizer.cpython-36.pyc
diff --git a/bnlp/__pycache__/sentencepiece_tokenizer.cpython-36.pyc b/bnlp/__pycache__/sentencepiece_tokenizer.cpython-36.pyc
diff --git a/bnlp/corpus/__init__.py b/bnlp/corpus/__init__.py
@@ -0,0 +1,20 @@
+# BNLP Corpus Reader
+# Author: Sagor Sarker
+
+"""
+The modules in this package provide functions that can be used to read corpus data.
+
+Available Corpus: 
+- Bengali Stopwords
+    Collected from: https://github.com/stopwords-iso/stopwords-bn
+
+
+"""
+
+from bnlp.corpus.util import stopwords
+# return list of bengali stopwords
+stopwords = stopwords
+
+# string of ASCII and Bengali punctuation characters
+punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~।ঃ'
+
diff --git a/bnlp/corpus/util.py b/bnlp/corpus/util.py
@@ -0,0 +1,23 @@
+from bnlp.tokenizer.basic import BasicTokenizer
+
+def stopwords():
+  word_list = ['অতএব', 'অথচ', 'অথবা', 'অনুযায়ী', 'অনেক', 'অনেকে', 'অনেকেই', 'অন্তত', 'অন্য', 'অবধি', 'অবশ্য', 'অর্থাত', 'আই', 'আগামী', 'আগে', 'আগেই', 'আছে', 'আজ', 'আদ্যভাগে', 'আপনার', 'আপনি', 'আবার', 'আমরা', 'আমাকে', 'আমাদের', 'আমার', 'আমি', 'আর', 'আরও', 'ই', 'ইত্যাদি', 'ইহা', 'উচিত', 'উত্তর', 'উনি', 'উপর', 'উপরে', 'এ', 'এঁদের', 'এঁরা', 'এই', 'একই', 'একটি', 'একবার', 'একে', 'এক্', 'এখন', 'এখনও', 'এখানে', 'এখানেই', 'এটা', 'এটাই', 'এটি', 'এত', 'এতটাই', 'এতে', 'এদের', 'এব', 'এবং', 'এবার', 'এমন', 'এমনকী', 'এমনি', 'এর', 'এরা', 'এল', 'এস', 'এসে', 'ঐ', 'ও', 'ওঁদের', 'ওঁর', 'ওঁরা', 'ওই', 'ওকে', 'ওখানে', 'ওদের', 'ওর', 'ওরা', 'কখনও', 'কত', 'কবে', 'কমনে', 'কয়েক', 'কয়েকটি', 'করছে', 'করছেন', 'করতে', 'করবে', 'করবেন', 'করলে', 'করলেন', 'করা', 'করাই', 'করায়', 'করার', 'করি', 'করিতে', 'করিয়া', 'করিয়ে', 'করে', 'করেই', 'করেছিলেন', 'করেছে', 'করেছেন', 'করেন', 'কাউকে', 'কাছ', 'কাছে', 'কাজ', 'কাজে', 'কারও', 'কারণ', 'কি', 'কিংবা', 'কিছু', 'কিছুই', 'কিন্তু', 'কী', 'কে', 'কেউ', 'কেউই', 'কেখা', 'কেন', 'কোটি', 'কোন', 'কোনও', 'কোনো', 'ক্ষেত্রে', 'কয়েক', 'খুব', 'গিয়ে', 'গিয়েছে', 'গিয়ে', 'গুলি', 'গেছে', 'গেল', 'গেলে', 'গোটা', 'চলে', 'চান', 'চায়', 'চার', 'চালু', 'চেয়ে', 'চেষ্টা', 'ছাড়া', 'ছাড়াও', 'ছিল', 'ছিলেন', 'জন', 'জনকে', 'জনের', 'জন্য', 'জন্যওজে', 'জানতে', 'জানা', 'জানানো', 'জানায়', 'জানিয়ে', 'জানিয়েছে', 'জে', 'জ্নজন', 'টি', 'ঠিক', 'তখন', 'তত', 'তথা', 'তবু', 'তবে', 'তা', 'তাঁকে', 'তাঁদের', 'তাঁর', 'তাঁরা', 'তাঁাহারা', 'তাই', 'তাও', 'তাকে', 'তাতে', 'তাদের', 'তার', 'তারপর', 'তারা', 'তারৈ', 'তাহলে', 'তাহা', 'তাহাতে', 'তাহার', 'তিনঐ', 'তিনি', 'তিনিও', 'তুমি', 'তুলে', 'তেমন', 'তো', 'তোমার', 'থাকবে', 'থাকবেন', 'থাকা', 'থাকায়', 'থাকে', 'থাকেন', 'থেকে', 'থেকেই', 'থেকেও', 'দিকে', 'দিতে', 'দিন', 'দিয়ে', 'দিয়েছে', 'দিয়েছেন', 'দিলেন', 'দু', 'দুই', 'দুটি', 'দুটো', 'দেওয়া', 'দেওয়ার', 'দেওয়া', 'দেখতে', 'দেখা', 'দেখে', 'দেন', 'দেয়', 'দ্বারা', 'ধরা', 'ধরে', 'ধামার', 'নতুন', 'নয়', 'না', 'নাই', 'নাকি', 'নাগাদ', 'নানা', 'নিজে', 'নিজেই', 'নিজেদের', 'নিজের', 'নিতে', 'নিয়ে', 'নিয়ে', 'নেই', 
'নেওয়া', 'নেওয়ার', 'নেওয়া', 'নয়', 'পক্ষে', 'পর', 'পরে', 'পরেই', 'পরেও', 'পর্যন্ত', 'পাওয়া', 'পাচ', 'পারি', 'পারে', 'পারেন', 'পি', 'পেয়ে', 'পেয়্র্', 'প্রতি', 'প্রথম', 'প্রভৃতি', 'প্রযন্ত', 'প্রাথমিক', 'প্রায়', 'প্রায়', 'ফলে', 'ফিরে', 'ফের', 'বক্তব্য', 'বদলে', 'বন', 'বরং', 'বলতে', 'বলল', 'বললেন', 'বলা', 'বলে', 'বলেছেন', 'বলেন', 'বসে', 'বহু', 'বা', 'বাদে', 'বার', 'বি', 'বিনা', 'বিভিন্ন', 'বিশেষ', 'বিষয়টি', 'বেশ', 'বেশি', 'ব্যবহার', 'ব্যাপারে', 'ভাবে', 'ভাবেই', 'মতো', 'মতোই', 'মধ্যভাগে', 'মধ্যে', 'মধ্যেই', 'মধ্যেও', 'মনে', 'মাত্র', 'মাধ্যমে', 'মোট', 'মোটেই', 'যখন', 'যত', 'যতটা', 'যথেষ্ট', 'যদি', 'যদিও', 'যা', 'যাঁর', 'যাঁরা', 'যাওয়া', 'যাওয়ার', 'যাওয়া', 'যাকে', 'যাচ্ছে', 'যাতে', 'যাদের', 'যান', 'যাবে', 'যায়', 'যার', 'যারা', 'যিনি', 'যে', 'যেখানে', 'যেতে', 'যেন', 'যেমন', 'র', 'রকম', 'রয়েছে', 'রাখা', 'রেখে', 'লক্ষ', 'শুধু', 'শুরু', 'সঙ্গে', 'সঙ্গেও', 'সব', 'সবার', 'সমস্ত', 'সম্প্রতি', 'সহ', 'সহিত', 'সাধারণ', 'সামনে', 'সি', 'সুতরাং', 'সে', 'সেই', 'সেখান', 'সেখানে', 'সেটা', 'সেটাই', 'সেটাও', 'সেটি', 'স্পষ্ট', 'স্বয়ং', 'হইতে', 'হইবে', 'হইয়া', 'হওয়া', 'হওয়ায়', 'হওয়ার', 'হচ্ছে', 'হত', 'হতে', 'হতেই', 'হন', 'হবে', 'হবেন', 'হয়', 'হয়তো', 'হয়নি', 'হয়ে', 'হয়েই', 'হয়েছিল', 'হয়েছে', 'হয়েছেন', 'হল', 'হলে', 'হলেই', 'হলেও', 'হলো', 'হাজার', 'হিসাবে', 'হৈলে', 'হোক', 'হয়']
+  return word_list
+
+def remove_stopwords(text, stopwords):
+  """
+  This function removes stopwords from text
+  parameters:
+    text: str
+    stopwords: list
+  return: tokens of word without stopwords
+
+  """
+  tokenizer = BasicTokenizer()
+  words = tokenizer.tokenize(text)
+  filtered_words = [w for w in words if not w in stopwords]
+  return filtered_words
+
+
+
+
diff --git a/bnlp/embedding/__init__.py b/bnlp/embedding/__init__.py