Skip to content

Commit 2c44d79

Browse files
Merge pull request #6 from ashwoolford/v0.7.7
V0.7.7
2 parents b7e80cb + 2ff1026 commit 2c44d79

38 files changed

+1953
-550
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
.DS_Store
2-
__pycache__
2+
__pycache__
3+
myenv

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
language: python
22

33
python:
4-
- 3.6
4+
- 3.10
55

66
script: pytest

README.md

Lines changed: 50 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
11
# BNLTK
22
[![Build Status](https://travis-ci.org/ashwoolford/bnltk.svg?branch=master)](https://travis-ci.org/ashwoolford/bnltk)
33
[![License: MIT](https://img.shields.io/badge/License-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)
4+
[![Downloads](https://static.pepy.tech/badge/bnltk)](https://pepy.tech/project/bnltk)
45

56

6-
BNLTK(Bangla Natural Language Processing Toolkit) is open-source python package for Bengali Natural Language Processing. It includes modules for Tokenization, Stemming, Parts of speech tagging. I'm looking forward to helping form contributors to make this look far better than this.
7+
8+
BNLTK (Bangla Natural Language Processing Toolkit) is an open-source Python package for Bengali Natural Language Processing. It includes modules for Tokenization, Stemming, and Parts of Speech tagging.
79

810
## Installation
911

12+
```
1013
pip install bnltk
14+
```
1115

1216
## Usage
1317

@@ -16,33 +20,68 @@ pip install bnltk
1620
```
1721
from bnltk.tokenize import Tokenizers
1822
t = Tokenizers()
19-
print(t.bn_word_tokenizer(' আমার সোনার বাংলা । '))
23+
print(t.bn_word_tokenizer('আজ আবহাওয়া খুব ভালো।'))
24+
# ["আজ", "আবহাওয়া", "খুব", "ভালো", "।"]
2025
```
2126

2227
### Stemmer
2328

2429
```
2530
from bnltk.stemmer import BanglaStemmer
2631
bn_stemmer = BanglaStemmer()
27-
print(bn_stemmer.stem('খেয়েছিলো'))
32+
print(bn_stemmer.stem('হেসেছিলেন'))
33+
# হাসা
2834
```
2935

3036
### Parts of Speech Tagger
3137

32-
For using the Parts of Tagger you need to download some data files as follows:
33-
38+
To use the Parts of Speech Tagger, please download the pretrained model's weights. Our trained model achieves an accuracy of 96%.
3439
```
3540
from bnltk.bnltk_downloads import DataFiles
3641
DataFiles().download()
3742
```
38-
After successfully downloading the files, then you can use this module.
43+
After successfully downloading the files, you can use this module as follows:
3944

4045
```
4146
from bnltk.pos_tagger import PosTagger
4247
43-
p_tagger = PosTagger()
44-
p_tagger.loader()
45-
sentences = 'দুশ্চিন্তার কোন কারণই নাই'
46-
print(p_tagger.tagger(sentences))
47-
48+
p_tagger = PosTagger()
49+
print(p_tagger.tagger('দুশ্চিন্তার কোন কারণই নাই'))
50+
# [('দুশ্চিন্তার', 'NC'), ('কোন', 'JQ'), ('কারণই', 'NC'), ('নাই', 'VM')]
4851
```
52+
53+
Description of the POS tag set
54+
55+
| Categories | Types |
56+
|-----------------------|-----------------------|
57+
| Noun (N) | Common (NC) |
58+
| | Proper (NP) |
59+
| | Verbal (NV) |
60+
| | Spatio-temporal (NST) |
61+
| Pronoun (P) | Pronominal (PPR) |
62+
| | Reflexive (PRF) |
63+
| | Reciprocal (PRC) |
64+
| | Relative (PRL) |
65+
| | Wh (PWH) |
66+
| | |
67+
| Nominal Modifier (J) | Adjectives (JJ) |
68+
| | Quantifiers (JQ) |
69+
| Demonstratives (D) | Absolutive (DAB) |
70+
| | Relative (DRL) |
71+
| | Wh (DWH) |
72+
| Adverb (A) | Manner (AMN) |
73+
| | Location (ALC) |
74+
| Participle (L) | Relative (LRL) |
75+
| | Verbal (LV) |
76+
| Postposition (PP) | |
77+
| Particles (C) | Coordinating (CCD) |
78+
| | Subordinating (CSB) |
79+
| | Classifier (CCL) |
80+
| | Interjection (CIN) |
81+
| | Others (CX) |
82+
| Punctuations (PU) | |
83+
| Residual (RD) | Foreign Word (RDF) |
84+
| | Symbol (RDS) |
85+
| | Other (RDX) |
86+
87+

bnltk_downloads/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
from .file_downloader import DataFiles
1+
from .file_downloader import DataFiles

bnltk_downloads/file_downloader.py

Lines changed: 52 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,71 +1,70 @@
11
# Bangla Natural Language Toolkit: DataFiles Downloader
22
#
3-
# Copyright (C) 2019 BNLTK Project
4-
# Author: Ashraf Hossain <asrafhossain197@gmail.com>
3+
# Copyright (C) 2019-2024 BNLTK Project
4+
# Author: Asraf Patoary <asrafhossain197@gmail.com>
55

6-
from requests import get # to make GET request
6+
from requests import get
77
import platform
88
import getpass
99
import os
1010
import sys
1111

1212

1313
class DataFiles:
14-
def __init__(self):
15-
pass
14+
def __init__(self):
15+
pass
1616

17-
def downloader(self, url, file_name, tag):
18-
if not os.path.exists(file_name):
19-
# open in binary mode
20-
with open(file_name, "wb") as file:
21-
# get request
22-
print("Downloading....../"+tag)
23-
response = get(url, stream=True)
24-
# write to file
25-
#file.write(response.content)
26-
27-
28-
total_length = response.headers.get('content-length')
17+
def downloader(self, url, file_name, tag):
18+
if not os.path.exists(file_name):
19+
with open(file_name, "wb") as file:
20+
print("Downloading....../" + tag)
21+
response = get(url, stream=True)
22+
total_length = response.headers.get("content-length")
2923

30-
if total_length is None: # no content length header
31-
file.write(response.content)
32-
else:
33-
dl = 0
34-
total_length = int(total_length)
35-
for data in response.iter_content(chunk_size=4096):
36-
dl += len(data)
37-
file.write(data)
38-
done = int(50 * dl / total_length)
39-
sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50-done)) )
40-
sys.stdout.flush()
41-
else:
42-
print(tag + 'is already exists!!')
24+
if total_length is None:
25+
file.write(response.content)
26+
else:
27+
dl = 0
28+
total_length = int(total_length)
29+
for data in response.iter_content(chunk_size=4096):
30+
dl += len(data)
31+
file.write(data)
32+
done = int(50 * dl / total_length)
33+
sys.stdout.write("\r[%s%s]" % ("=" * done, " " * (50 - done)))
34+
sys.stdout.flush()
35+
else:
36+
print(tag + "is already exists!!")
4337

44-
38+
def download(self):
39+
file_name = None
40+
tag1 = "bn_tagged_mod.txt"
41+
tag2 = "pos_tagger.weights.h5"
4542

46-
def download(self):
47-
file_name = None
48-
tag1 = 'bn_tagged_mod.txt'
49-
tag2 = 'keras_mlp_bangla.h5'
43+
print("platform.system() ", platform.system())
5044

51-
if platform.system() == 'Windows':
52-
file_name = "C:\\Users\\"+getpass.getuser()
53-
else:
54-
file_name = "/Users/"+getpass.getuser()
55-
#print(file_name)
56-
url = 'https://firebasestorage.googleapis.com/v0/b/diu-question.appspot.com/o/nlp_data%2Fbn_tagged_mod.txt?alt=media&token=00f383a3-f913-480b-85c1-971dd8fd6dd9'
57-
url2 = 'https://firebasestorage.googleapis.com/v0/b/diu-question.appspot.com/o/nlp_data%2Fkeras_mlp_bangla.h5?alt=media&token=4146c1b0-1e4d-4f9e-8b2f-7e3519106a40'
45+
if platform.system() == "Windows":
46+
file_name = "C:\\Users\\" + getpass.getuser()
47+
elif platform.system() == "Linux":
48+
file_name = "/home/" + getpass.getuser()
49+
elif platform.system() == "Darwin":
50+
file_name = "/Users/" + getpass.getuser()
51+
else:
52+
raise Exception("Unable to detect OS")
5853

54+
corpus_url = "https://firebasestorage.googleapis.com/v0/b/diu-question.appspot.com/o/nlp_data%2Fbn_tagged_mod.txt?alt=media&token=00f383a3-f913-480b-85c1-971dd8fd6dd9"
55+
saved_weights_url = "https://firebasestorage.googleapis.com/v0/b/diu-question.appspot.com/o/nlp_data%2Fpos_tagger.weights.h5?alt=media&token=2251eedd-dfaf-4572-9bce-b4d293cce980"
5956

60-
try:
61-
os.makedirs(file_name+'/bnltk_data/pos_data')
62-
except OSError:
63-
print ("Creation of the directory failed or exists")
64-
else:
65-
pass
66-
67-
self.downloader(url, file_name+'/bnltk_data/pos_data/bn_tagged_mod.txt', tag1)
68-
print()
69-
self.downloader(url2, file_name+'/bnltk_data/pos_data/keras_mlp_bangla.h5', tag2)
70-
print('Done!')
57+
try:
58+
os.makedirs(file_name + "/bnltk_data/pos_data")
59+
except OSError:
60+
print("Creation of the directory failed or exists")
7161

62+
self.downloader(
63+
corpus_url, file_name + "/bnltk_data/pos_data/bn_tagged_mod.txt", tag1
64+
)
65+
self.downloader(
66+
saved_weights_url,
67+
file_name + "/bnltk_data/pos_data/pos_tagger.weights.h5",
68+
tag2,
69+
)
70+
print("Done!")

pos_tagger/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
11
from .bn_pos_tagger import PosTagger
2-
# h

0 commit comments

Comments
 (0)