Merge pull request #20 from sagorbrur/dev

merging dev with master for bnlp v3.1.0
sagorbrur · Apr 24, 2021 · 5d34e0a · 5d34e0a
2 parents 70984dd + 24eeb63
commit 5d34e0a
Show file tree

Hide file tree

Showing 37 changed files with 430 additions and 82 deletions.
diff --git a/.github/stale.yml b/.github/stale.yml
@@ -0,0 +1,17 @@
+# Number of days of inactivity before an issue becomes stale
+daysUntilStale: 60
+# Number of days of inactivity before a stale issue is closed
+daysUntilClose: 7
+# Issues with these labels will never be considered stale
+exemptLabels:
+  - pinned
+  - security
+# Label to use when marking an issue as stale
+staleLabel: wontfix
+# Comment to post when marking an issue as stale. Set to `false` to disable
+markComment: >
+  This issue has been automatically marked as stale because it has not had
+  recent activity. It will be closed if no further activity occurs. Thank you
+  for your contributions.
+# Comment to post when closing a stale issue. Set to `false` to disable
+closeComment: false
diff --git a/.travis.yml b/.travis.yml
@@ -10,19 +10,20 @@ os:
 
 # Set the python version to 3.6, 3.7, 3.8
 python:
-  - "3.5"
   - "3.6"
   - "3.7"
   - "3.8"
 
 # Install the pip dependency
 install:
   - pip install sentencepiece
-  - pip install gensim
+  - pip install gensim==4.0.1
   - pip install nltk
   - pip install numpy
   - pip install scipy
   - pip install sklearn-crfsuite
+  - pip install wasabi
+  - pip install python-Levenshtein
 
 # Run the unit test
 script:

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "restructuredtext.confPath": ""
+}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -36,7 +36,7 @@ We hosted our code repositories in Github for better management of issues and de
 - Clone your forked repository locally
   (`git clone https://github.com/<your-github-username>/bnlp.git`);
 - Run `cd bnlp` to get to the root directory of the `bnlp` code base;
-- Install the dependencies (`pip install -r requirements.txt`);
+- Check out the `dev` branch (`git checkout dev`);
 - Download the pretrained models for running tests
   (you can find the pretrained model details [here](https://github.com/sagorbrur/bnlp) in Readme
 

diff --git a/README.md b/README.md
@@ -3,9 +3,10 @@
 # Bengali Natural Language Processing(BNLP)
 
 [![Build Status](https://travis-ci.org/sagorbrur/bnlp.svg?branch=master)](https://travis-ci.org/sagorbrur/bnlp)
+[![arXiv](https://img.shields.io/badge/arXiv-2102.00405-b31b1b)](https://arxiv.org/abs/2102.00405)
 [![PyPI version](https://img.shields.io/pypi/v/bnlp_toolkit)](https://pypi.org/project/bnlp-toolkit/)
 [![release version](https://img.shields.io/github/v/release/sagorbrur/bnlp)](https://github.com/sagorbrur/bnlp/releases/tag/2.0.0)
-[![Support Python Version](https://img.shields.io/badge/python-3.5%7C3.6%7C3.7%7C3.8-brightgreen)](https://pypi.org/project/bnlp-toolkit/)
+[![Support Python Version](https://img.shields.io/badge/python-3.6%7C3.7%7C3.8-brightgreen)](https://pypi.org/project/bnlp-toolkit/)
 [![Documentation Status](https://readthedocs.org/projects/bnlp/badge/?version=latest)](https://bnlp.readthedocs.io/en/latest/?badge=latest)
 [![Gitter](https://badges.gitter.im/bnlp_toolkit/community.svg)](https://gitter.im/bnlp_toolkit/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
 
@@ -15,7 +16,7 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
 
 ## Installation
 
-### PIP installer(Python: 3.5, 3.6, 3.7, 3.8 tested okay, OS: linux, windows tested okay )
+### PIP installer(Python: 3.6, 3.7, 3.8 tested okay, OS: linux, windows tested okay )
 
   ```
   pip install bnlp_toolkit
@@ -34,7 +35,7 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
 ### Download Link
 
 * [Bengali SentencePiece](https://github.com/sagorbrur/bnlp/tree/master/model)
-* [Bengali Word2Vec](https://drive.google.com/open?id=1DxR8Vw61zRxuUm17jzFnOX97j7QtNW7U)
+* [Bengali Word2Vec](https://drive.google.com/file/d/1cQ8AoSdiX5ATYOzcTjCqpLCV1efB9QzT/view?usp=sharing)
 * [Bengali FastText](https://drive.google.com/open?id=1CFA-SluRyz3s5gmGScsFUcs7AjLfscm2)
 * [Bengali GloVe Wordvectors](https://github.com/sagorbrur/GloVe-Bengali)
 * [Bengali POS Tag model](https://github.com/sagorbrur/bnlp/blob/master/model/bn_pos.pkl)
@@ -45,7 +46,7 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
   - [Bengali Wiki Dump](https://dumps.wikimedia.org/bnwiki/latest/)
 * SentencePiece Training Vocab Size=50000
 * Fasttext trained with total words = 20M, vocab size = 1171011, epoch=50, embedding dimension = 300 and the training loss = 0.318668,
-* Word2Vec word embedding dimension = 300
+* Word2Vec word embedding dimension = 100, min_count=5, window=5, epochs=10
 * To Know Bengali GloVe Wordvector and training process follow [this](https://github.com/sagorbrur/GloVe-Bengali) repository
 * Bengali CRF POS Tagging was trained with [nltr](https://github.com/abhishekgupta92/bangla_pos_tagger/tree/master/data) dataset with 80% accuracy. 
 * Bengali CRF NER Tagging was trained with [this](https://github.com/MISabic/NER-Bangla-Dataset) data with 90% accuracy.
@@ -129,7 +130,7 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
 
     bwv = BengaliWord2Vec()
     model_path = "bengali_word2vec.model"
-    word = 'আমার'
+    word = 'গ্রাম'
     vector = bwv.generate_word_vector(model_path, word)
     print(vector.shape)
     print(vector)
@@ -144,20 +145,43 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
     bwv = BengaliWord2Vec()
     model_path = "bengali_word2vec.model"
     word = 'গ্রাম'
-    similar = bwv.most_similar(model_path, word)
+    similar = bwv.most_similar(model_path, word, topn=10)
     print(similar)
 
     ```
   - Train Bengali Word2Vec with your own data
 
+    Train Bengali word2vec with your custom raw data or tokenized sentences.
+
+    custom tokenized sentence format example:
+    ```
+    sentences = [['আমি', 'ভাত', 'খাই', '।'], ['সে', 'বাজারে', 'যায়', '।']]
+    ```
+    Check [gensim word2vec api](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec) for details of the training parameters
+
     ```py
     from bnlp import BengaliWord2Vec
     bwv = BengaliWord2Vec()
-    data_file = "raw_text.txt"
+    data_file = "raw_text.txt" # or you can pass custom sentence tokens as a list of lists
     model_name = "test_model.model"
     vector_name = "test_vector.vector"
-    bwv.train(data_file, model_name, vector_name)
+    bwv.train(data_file, model_name, vector_name, epochs=5)
+
+
+    ```
+  - Pre-train or resume word2vec training with same or new corpus or tokenized sentences
+
+    Check [gensim word2vec api](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec) for details of the training parameters
+
+    ```py
+    from bnlp import BengaliWord2Vec
+    bwv = BengaliWord2Vec()
 
+    trained_model_path = "mytrained_model.model"
+    data_file = "raw_text.txt"
+    model_name = "test_model.model"
+    vector_name = "test_vector.vector"
+    bwv.pretrain(trained_model_path, data_file, model_name, vector_name, epochs=5)
 
     ```
 
@@ -184,6 +208,8 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
       ```
     - Train Bengali FastText Model
 
+      Check [fasttext documentation](https://fasttext.cc/docs/en/options.html) for details of the training parameters
+
       ```py
       from bnlp.embedding.fasttext import BengaliFasttext
 
@@ -194,6 +220,17 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
       bft.train(data, model_name, epoch)
       ```
 
+    - Generate Vector File from Fasttext Binary Model
+      ```py
+      from bnlp.embedding.fasttext import BengaliFasttext
+
+      bft = BengaliFasttext()
+
+      model_path = "mymodel.bin"
+      out_vector_name = "myvector.txt"
+      bft.bin2vec(model_path, out_vector_name)
+      ```
+
 * **Bengali GloVe Word Vectors**
 
   We trained glove model with bengali data(wiki+news articles) and published bengali glove word vectors</br>
@@ -267,15 +304,17 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
 
     ```
 
+
 ## Bengali Corpus Class
 
 * Stopwords and Punctuations
   ```py
-  from bnlp.corpus import stopwords, punctuations
+  from bnlp.corpus import stopwords, punctuations, letters, digits
 
-  stopwords = stopwords() 
   print(stopwords)
   print(punctuations)
+  print(letters)
+  print(digits)
 
   ```
 
@@ -285,7 +324,6 @@ BNLP is a natural language processing toolkit for Bengali Language. This tool wi
     from bnlp.corpus import stopwords
     from bnlp.corpus.util import remove_stopwords
 
-    stopwords = stopwords()
     raw_text = 'আমি ভাত খাই।' 
     result = remove_stopwords(raw_text, stopwords)
     print(result)

diff --git a/bnlp/__init__.py b/bnlp/__init__.py
@@ -1,4 +1,4 @@
-__version__="3.0.0"
+__version__="3.1.0"
 
 
 import os
@@ -13,3 +13,4 @@
 
 
 
+
diff --git a/bnlp/__pycache__/__init__.cpython-37.pyc b/bnlp/__pycache__/__init__.cpython-37.pyc
diff --git a/bnlp/__pycache__/__init__.cpython-38.pyc b/bnlp/__pycache__/__init__.cpython-38.pyc
diff --git a/bnlp/__pycache__/ner.cpython-37.pyc b/bnlp/__pycache__/ner.cpython-37.pyc
diff --git a/bnlp/__pycache__/ner.cpython-38.pyc b/bnlp/__pycache__/ner.cpython-38.pyc
diff --git a/bnlp/__pycache__/pos.cpython-37.pyc b/bnlp/__pycache__/pos.cpython-37.pyc
diff --git a/bnlp/__pycache__/pos.cpython-38.pyc b/bnlp/__pycache__/pos.cpython-38.pyc
diff --git a/bnlp/__pycache__/sentiment_analysis.cpython-38.pyc b/bnlp/__pycache__/sentiment_analysis.cpython-38.pyc
diff --git a/bnlp/corpus/__init__.py b/bnlp/corpus/__init__.py
@@ -8,13 +8,75 @@
 - Bengali Stopwords
     Collected from: https://github.com/stopwords-iso/stopwords-bn
 
+- Bengali letters and vowel mark 
+    collected from https://github.com/MinhasKamal/BengaliDictionary/blob/master/BengaliCharacterCombinations.txt
 
 """
 
-from bnlp.corpus.util import stopwords
+
 # return list of bengali stopwords
-stopwords = stopwords
+stopwords = [
+            'অতএব', 'অথচ', 'অথবা', 'অনুযায়ী', 'অনেক', 'অনেকে', 
+            'অনেকেই', 'অন্তত', 'অন্য', 'অবধি', 'অবশ্য', 'অর্থাত', 'আই', 
+            'আগামী', 'আগে', 'আগেই', 'আছে', 'আজ', 'আদ্যভাগে', 'আপনার', 
+            'আপনি', 'আবার', 'আমরা', 'আমাকে', 'আমাদের', 'আমার', 'আমি', 
+            'আর', 'আরও', 'ই', 'ইত্যাদি', 'ইহা', 'উচিত', 'উত্তর', 'উনি', 
+            'উপর', 'উপরে', 'এ', 'এঁদের', 'এঁরা', 'এই', 'একই', 'একটি', 
+            'একবার', 'একে', 'এক্', 'এখন', 'এখনও', 'এখানে', 'এখানেই', 
+            'এটা', 'এটাই', 'এটি', 'এত', 'এতটাই', 'এতে', 'এদের', 'এব', 
+            'এবং', 'এবার', 'এমন', 'এমনকী', 'এমনি', 'এর', 'এরা', 'এল', 
+            'এস', 'এসে', 'ঐ', 'ও', 'ওঁদের', 'ওঁর', 'ওঁরা', 'ওই', 'ওকে', 
+            'ওখানে', 'ওদের', 'ওর', 'ওরা', 'কখনও', 'কত', 'কবে', 'কমনে', 
+            'কয়েক', 'কয়েকটি', 'করছে', 'করছেন', 'করতে', 'করবে', 'করবেন', 
+            'করলে', 'করলেন', 'করা', 'করাই', 'করায়', 'করার', 'করি', 
+            'করিতে', 'করিয়া', 'করিয়ে', 'করে', 'করেই', 'করেছিলেন', 'করেছে', 
+            'করেছেন', 'করেন', 'কাউকে', 'কাছ', 'কাছে', 'কাজ', 'কাজে', 
+            'কারও', 'কারণ', 'কি', 'কিংবা', 'কিছু', 'কিছুই', 'কিন্তু', 'কী', 
+            'কে', 'কেউ', 'কেউই', 'কেখা', 'কেন', 'কোটি', 'কোন', 'কোনও', 
+            'কোনো', 'ক্ষেত্রে', 'কয়েক', 'খুব', 'গিয়ে', 'গিয়েছে', 'গিয়ে', 'গুলি', 'গেছে', 
+            'গেল', 'গেলে', 'গোটা', 'চলে', 'চান', 'চায়', 'চার', 'চালু', 'চেয়ে', 
+            'চেষ্টা', 'ছাড়া', 'ছাড়াও', 'ছিল', 'ছিলেন', 'জন', 'জনকে', 'জনের', 
+            'জন্য', 'জন্যওজে', 'জানতে', 'জানা', 'জানানো', 'জানায়', 'জানিয়ে', 
+            'জানিয়েছে', 'জে', 'জ্নজন', 'টি', 'ঠিক', 'তখন', 'তত', 'তথা', 'তবু', 
+            'তবে', 'তা', 'তাঁকে', 'তাঁদের', 'তাঁর', 'তাঁরা', 'তাঁাহারা', 'তাই', 'তাও', 
+            'তাকে', 'তাতে', 'তাদের', 'তার', 'তারপর', 'তারা', 'তারৈ', 'তাহলে', 
+            'তাহা', 'তাহাতে', 'তাহার', 'তিনঐ', 'তিনি', 'তিনিও', 'তুমি', 'তুলে', 
+            'তেমন', 'তো', 'তোমার', 'থাকবে', 'থাকবেন', 'থাকা', 'থাকায়', 'থাকে', 
+            'থাকেন', 'থেকে', 'থেকেই', 'থেকেও', 'দিকে', 'দিতে', 'দিন', 'দিয়ে', 
+            'দিয়েছে', 'দিয়েছেন', 'দিলেন', 'দু', 'দুই', 'দুটি', 'দুটো', 'দেওয়া', 'দেওয়ার', 
+            'দেওয়া', 'দেখতে', 'দেখা', 'দেখে', 'দেন', 'দেয়', 'দ্বারা', 'ধরা', 'ধরে', 
+            'ধামার', 'নতুন', 'নয়', 'না', 'নাই', 'নাকি', 'নাগাদ', 'নানা', 'নিজে', 
+            'নিজেই', 'নিজেদের', 'নিজের', 'নিতে', 'নিয়ে', 'নিয়ে', 'নেই', 'নেওয়া', 
+            'নেওয়ার', 'নেওয়া', 'নয়', 'পক্ষে', 'পর', 'পরে', 'পরেই', 'পরেও', 'পর্যন্ত', 
+            'পাওয়া', 'পাচ', 'পারি', 'পারে', 'পারেন', 'পি', 'পেয়ে', 'পেয়্র্', 'প্রতি', 
+            'প্রথম', 'প্রভৃতি', 'প্রযন্ত', 'প্রাথমিক', 'প্রায়', 'প্রায়', 'ফলে', 'ফিরে', 'ফের', 
+            'বক্তব্য', 'বদলে', 'বন', 'বরং', 'বলতে', 'বলল', 'বললেন', 'বলা', 'বলে', 
+            'বলেছেন', 'বলেন', 'বসে', 'বহু', 'বা', 'বাদে', 'বার', 'বি', 'বিনা', 'বিভিন্ন', 
+            'বিশেষ', 'বিষয়টি', 'বেশ', 'বেশি', 'ব্যবহার', 'ব্যাপারে', 'ভাবে', 'ভাবেই', 
+            'মতো', 'মতোই', 'মধ্যভাগে', 'মধ্যে', 'মধ্যেই', 'মধ্যেও', 'মনে', 'মাত্র', 
+            'মাধ্যমে', 'মোট', 'মোটেই', 'যখন', 'যত', 'যতটা', 'যথেষ্ট', 'যদি', 'যদিও', 
+            'যা', 'যাঁর', 'যাঁরা', 'যাওয়া', 'যাওয়ার', 'যাওয়া', 'যাকে', 'যাচ্ছে', 'যাতে', 
+            'যাদের', 'যান', 'যাবে', 'যায়', 'যার', 'যারা', 'যিনি', 'যে', 'যেখানে', 'যেতে', 
+            'যেন', 'যেমন', 'র', 'রকম', 'রয়েছে', 'রাখা', 'রেখে', 'লক্ষ', 'শুধু', 'শুরু', 
+            'সঙ্গে', 'সঙ্গেও', 'সব', 'সবার', 'সমস্ত', 'সম্প্রতি', 'সহ', 'সহিত', 'সাধারণ', 
+            'সামনে', 'সি', 'সুতরাং', 'সে', 'সেই', 'সেখান', 'সেখানে', 'সেটা', 'সেটাই', 
+            'সেটাও', 'সেটি', 'স্পষ্ট', 'স্বয়ং', 'হইতে', 'হইবে', 'হইয়া', 'হওয়া', 'হওয়ায়', 
+            'হওয়ার', 'হচ্ছে', 'হত', 'হতে', 'হতেই', 'হন', 'হবে', 'হবেন', 'হয়', 'হয়তো', 
+            'হয়নি', 'হয়ে', 'হয়েই', 'হয়েছিল', 'হয়েছে', 'হয়েছেন', 'হল', 'হলে', 'হলেই', 
+            'হলেও', 'হলো', 'হাজার', 'হিসাবে', 'হৈলে', 'হোক', 'হয়'
+        ]
 
 # return list of bengali punctuation
 punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~।ঃ'
 
+# return bangla letters
+letters = 'অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহড়ঢ়য়ৎংঃঁ'
+
+# return bangla digits
+digits = '০১২৩৪৫৬৭৮৯'
+
+# bengali vowel marks
+vower_mark = 'া ি ী ু ৃ ে ৈ ো ৌ'
+
+
+
diff --git a/bnlp/corpus/__pycache__/__init__.cpython-37.pyc b/bnlp/corpus/__pycache__/__init__.cpython-37.pyc