-
Notifications
You must be signed in to change notification settings - Fork 196
Quickstart ~ Fine Tune
Genta Indra Winata edited this page Oct 3, 2020
·
1 revision
A guide to loading the IndoBERT pretrained model and fine-tuning it.
The pretrained model supports 3 main task types, which cover the 12 downstream tasks:
-
SequenceClassification
Follow the example below to use it:
from transformers import BertForSequenceClassification
-
MultiLabelClassification
Supported models:
BertForMultiLabelClassification
AlbertForMultiLabelClassification
XLMForMultiLabelClassification
XLMRobertaForMultiLabelClassification
Follow the example below to use it:
from modules.multi_label_classification import BertForMultiLabelClassification
-
WordClassification
Supported models:
BertForWordClassification
AlbertForWordClassification
XLMForWordClassification
XLMRobertaForWordClassification
Follow the example below to use it:
from modules.word_classification import BertForWordClassification
You can check [Link] for the notebook examples.
The examples consist of:
-
finetune_smsa.ipynb for
BertForSequenceClassification
-
finetune_casa.ipynb for
BertForMultiLabelClassification
-
finetune_ner_grit.ipynb for
BertForWordClassification
In summary, the main steps are:
-
Load IndoBERT Model
Example
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
# Take the number of output labels from the task dataset class
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate model
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', config=config)
-
Prepare Specific Task Dataset
Example
# Build one dataset per split; all three share the tokenizer and are lowercased
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)

# Wrap each dataset in a loader; only the training loader shuffles
train_loader = DocumentSentimentDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=True)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)

# Label <-> index lookup tables exposed by the dataset class
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)
-
Fine Tuning & Evaluation
Example
optimizer = optim.Adam(model.parameters(), lr=1e-6)
model = model.cuda()

# Train
n_epochs = 4
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model (the last element of each batch is not passed to the forward function)
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Progress bar shows the running mean loss over the batches seen so far
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1), total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1), total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    model.eval()
    torch.set_grad_enabled(False)

    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        # Kept from the original example; not used further in this snippet
        batch_seq = batch_data[-1]
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        list_hyp += batch_hyp
        list_label += batch_label
        # Metrics are recomputed over everything accumulated so far, for the progress bar
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1), total_loss/(i+1), metrics_to_string(metrics)))
-
Evaluate on test
Example
# Switch to evaluation mode and turn gradient tracking off
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):
    # Only the predictions are kept; the loss and label outputs are discarded
    _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
    list_hyp += batch_hyp

# Save prediction
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred.txt', index=False)

print(df)
© 2020