diff --git a/.gitignore b/.gitignore index 68eff1b..13380e6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -.env +.env* .idea .DS_store references diff --git a/README.md b/README.md index 366ddf4..2714b4b 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,32 @@ -## nerindo +## Nerindo -Named Entity Recognition for Indonesian NER with PyTorch. +Named Entity Recognition (NER) for Bahasa Indonesia with PyTorch. Corpus for NER: * https://github.com/yohanesgultom/nlp-experiments -* https://github.com/yusufsyaifudin/indonesia-ner \ No newline at end of file +* https://github.com/yusufsyaifudin/indonesia-ner + +The step-by-step implementation in Google Colab is indexed [here](https://medium.com/@yoseflaw/step-by-step-ner-model-for-bahasa-indonesia-with-pytorch-and-torchtext-6f94fca08406?source=friends_link&sk=c15c89082c00c8785577e1cebb77c9c2). + +The fine-tuned Indonesian word embeddings file `id_ft.bin` is available [here](https://drive.google.com/file/d/1BGWnSHGZXdPfVCCkvx3_ZbjNnKh2t9pF/view?usp=sharing); it is based on the word embeddings trained in [indonesian-word-embedding](https://github.com/galuhsahid/indonesian-word-embedding). + +### Included configurations +1. BiLSTM +2. BiLSTM + Word Embeddings +3. BiLSTM + Word Embeddings + Char Embeddings (CNN) +4. BiLSTM + Word Embeddings + Char Embeddings (CNN) + Attention Layer +5. Transformer (simplified BERT) + Word Embeddings + Char Embeddings (CNN) + +The automatic learning rate finder is based on [pytorch-lr-finder](https://github.com/davidtvs/pytorch-lr-finder). + +### Example output for learning rate finder + +![LR Finder Example Output](images/lr_finder.png) + +### Final result + +![Final Result](images/final_result.png) + +### Main reference + +Gunawan, W., Suhartono, D., Purnomo, F., & Ongko, A. (2018). Named-entity recognition for Indonesian language using bidirectional LSTM-CNNs. Procedia Computer Science, 135, 425-432. 
diff --git a/images/final_result.png b/images/final_result.png new file mode 100644 index 0000000..445c123 Binary files /dev/null and b/images/final_result.png differ diff --git a/images/lr_finder.png b/images/lr_finder.png new file mode 100644 index 0000000..26c0884 Binary files /dev/null and b/images/lr_finder.png differ diff --git a/main.py b/main.py index 36adbe5..0295b94 100644 --- a/main.py +++ b/main.py @@ -4,6 +4,7 @@ from nerindo.models import NERModel from nerindo.lr_finder import LRFinder from nerindo.trainer import Trainer +from pprint import pprint if __name__ == "__main__": use_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -58,7 +59,7 @@ lr_finder = LRFinder(model, Adam(model.parameters(), lr=1e-4, weight_decay=1e-2), device=use_device) lr_finder.range_test(corpus.train_iter, corpus.val_iter, end_lr=10, num_iter=55, step_mode="exp") _, suggested_lrs[model_name] = lr_finder.plot(skip_start=10, skip_end=0) - print(suggested_lrs) + pprint(suggested_lrs) max_epochs = 50 histories = {} for model_name in configs: @@ -67,16 +68,15 @@ trainer = Trainer( model=model, data=corpus, - optimizer=Adam(model.parameters(), lr=3e-3, weight_decay=1e-2), + optimizer=Adam(model.parameters(), lr=suggested_lrs[model_name], weight_decay=1e-2), device=use_device, checkpoint_path=f"saved_states/{model_name}.pt" ) - histories[model_name] = trainer.train(max_epochs=max_epochs, no_improvement=1) + histories[model_name] = trainer.train(max_epochs=max_epochs, no_improvement=3) print(f"Done Training: {model_name}") print() trainer.model.load_state(f"saved_states/{model_name}.pt") sentence = "\"Menjatuhkan sanksi pemberhentian tetap kepada teradu Sophia Marlinda Djami selaku Ketua KPU Kabupaten Sumba Barat, sejak dibacakannya putusan ini\", ucap Alfitra dalam sidang putusan, Rabu (8/7/2020)." 
words, infer_tags, unknown_tokens = trainer.infer(sentence=sentence) print() - from pprint import pprint pprint(histories) diff --git a/nerindo/embeddings.py b/nerindo/embeddings.py index 2d24895..30d007a 100644 --- a/nerindo/embeddings.py +++ b/nerindo/embeddings.py @@ -1,6 +1,5 @@ import os -from pprint import pprint from typing import List from spacy.lang.id import Indonesian diff --git a/nerindo/trainer.py b/nerindo/trainer.py index 86e90ec..b299d6c 100644 --- a/nerindo/trainer.py +++ b/nerindo/trainer.py @@ -116,8 +116,9 @@ def train(self, max_epochs, no_improvement=None): best_epoch = None lr_scheduler = ReduceLROnPlateau( optimizer=self.optimizer, - patience=2, + patience=3, factor=0.3, + mode="max", verbose=True ) epoch = 1 @@ -131,7 +132,7 @@ def train(self, max_epochs, no_improvement=None): history["train_loss"].append(train_loss) history["train_f1"].append(train_f1) val_loss, val_f1 = self.evaluate(self.data.val_iter) - lr_scheduler.step(val_loss) + lr_scheduler.step(val_f1) # take the current model if it it at least 1% better than the previous best F1 if self.checkpoint_path and val_f1 > (best_val_f1 + 0.01 * best_val_f1): print(f"Epoch-{epoch}: found better Val F1: {val_f1:.4f}, saving model...") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b34b408 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +numpy==1.15.4 +matplotlib +scikit-learn==0.20.1 +gensim==3.8.0 +spacy==2.0.16 +torch==1.5.1 +torchtext==0.6.0 +pytorch-crf==0.7.2 \ No newline at end of file