Skip to content

Commit

Permalink
add documentation and uncomment all configurations
Browse files Browse the repository at this point in the history
  • Loading branch information
yoseflaw committed Sep 11, 2020
1 parent a70e55e commit 0941b6a
Show file tree
Hide file tree
Showing 8 changed files with 44 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.env
.env*
.idea
.DS_store
references
Expand Down
31 changes: 28 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,32 @@
## nerindo
## Nerindo

Named Entity Recognition for Indonesian NER with PyTorch.
Named Entity Recognition (NER) for Bahasa Indonesia with PyTorch.

Corpus for NER:
* https://github.com/yohanesgultom/nlp-experiments
* https://github.com/yusufsyaifudin/indonesia-ner
* https://github.com/yusufsyaifudin/indonesia-ner

The step-by-step implementation in Google Colab is indexed [here](https://medium.com/@yoseflaw/step-by-step-ner-model-for-bahasa-indonesia-with-pytorch-and-torchtext-6f94fca08406?source=friends_link&sk=c15c89082c00c8785577e1cebb77c9c2).

The fine-tuned Indonesian word embeddings file `id_ft.bin` is available [here](https://drive.google.com/file/d/1BGWnSHGZXdPfVCCkvx3_ZbjNnKh2t9pF/view?usp=sharing); it is based on the word embeddings trained in [indonesian-word-embedding](https://github.com/galuhsahid/indonesian-word-embedding).

### Included configurations
1. BiLSTM
2. BiLSTM + Word Embeddings
3. BiLSTM + Word Embeddings + Char Embeddings (CNN)
4. BiLSTM + Word Embeddings + Char Embeddings (CNN) + Attention Layer
5. Transformer (simplified BERT) + Word Embeddings + Char Embeddings (CNN)

Automatic learning rate finder based on [pytorch-lr-finder](https://github.com/davidtvs/pytorch-lr-finder).

### Example output for learning rate finder

<img src="https://github.com/yoseflaw/nerindo/blob/master/images/lr_finder.png" alt="LR Finder Example Output"/>

### Final result

<img src="https://github.com/yoseflaw/nerindo/blob/master/images/final_result.png" alt="Final Result"/>

### Main reference

Gunawan, W., Suhartono, D., Purnomo, F., & Ongko, A. (2018). Named-entity recognition for Indonesian language using bidirectional LSTM-CNNs. Procedia Computer Science, 135, 425-432.
Binary file added images/final_result.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/lr_finder.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 4 additions & 4 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from nerindo.models import NERModel
from nerindo.lr_finder import LRFinder
from nerindo.trainer import Trainer
from pprint import pprint

if __name__ == "__main__":
use_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Expand Down Expand Up @@ -58,7 +59,7 @@
lr_finder = LRFinder(model, Adam(model.parameters(), lr=1e-4, weight_decay=1e-2), device=use_device)
lr_finder.range_test(corpus.train_iter, corpus.val_iter, end_lr=10, num_iter=55, step_mode="exp")
_, suggested_lrs[model_name] = lr_finder.plot(skip_start=10, skip_end=0)
print(suggested_lrs)
pprint(suggested_lrs)
max_epochs = 50
histories = {}
for model_name in configs:
Expand All @@ -67,16 +68,15 @@
trainer = Trainer(
model=model,
data=corpus,
optimizer=Adam(model.parameters(), lr=3e-3, weight_decay=1e-2),
optimizer=Adam(model.parameters(), lr=suggested_lrs[model_name], weight_decay=1e-2),
device=use_device,
checkpoint_path=f"saved_states/{model_name}.pt"
)
histories[model_name] = trainer.train(max_epochs=max_epochs, no_improvement=1)
histories[model_name] = trainer.train(max_epochs=max_epochs, no_improvement=3)
print(f"Done Training: {model_name}")
print()
trainer.model.load_state(f"saved_states/{model_name}.pt")
sentence = "\"Menjatuhkan sanksi pemberhentian tetap kepada teradu Sophia Marlinda Djami selaku Ketua KPU Kabupaten Sumba Barat, sejak dibacakannya putusan ini\", ucap Alfitra dalam sidang putusan, Rabu (8/7/2020)."
words, infer_tags, unknown_tokens = trainer.infer(sentence=sentence)
print()
from pprint import pprint
pprint(histories)
1 change: 0 additions & 1 deletion nerindo/embeddings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@

import os
from pprint import pprint
from typing import List

from spacy.lang.id import Indonesian
Expand Down
5 changes: 3 additions & 2 deletions nerindo/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,9 @@ def train(self, max_epochs, no_improvement=None):
best_epoch = None
lr_scheduler = ReduceLROnPlateau(
optimizer=self.optimizer,
patience=2,
patience=3,
factor=0.3,
mode="max",
verbose=True
)
epoch = 1
Expand All @@ -131,7 +132,7 @@ def train(self, max_epochs, no_improvement=None):
history["train_loss"].append(train_loss)
history["train_f1"].append(train_f1)
val_loss, val_f1 = self.evaluate(self.data.val_iter)
lr_scheduler.step(val_loss)
lr_scheduler.step(val_f1)
# take the current model if it is at least 1% better than the previous best F1
if self.checkpoint_path and val_f1 > (best_val_f1 + 0.01 * best_val_f1):
print(f"Epoch-{epoch}: found better Val F1: {val_f1:.4f}, saving model...")
Expand Down
8 changes: 8 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
numpy==1.15.4
matplotlib
scikit-learn==0.20.1
gensim==3.8.0
spacy==2.0.16
torch==1.5.1
torchtext==0.6.0
pytorch-crf==0.7.2

0 comments on commit 0941b6a

Please sign in to comment.