Skip to content

Commit

Permalink
Add INQUIRE annotations to JointBERT (#209)
Browse files Browse the repository at this point in the history
  • Loading branch information
IKostric authored Sep 8, 2023
1 parent 4fec288 commit b0c7368
Show file tree
Hide file tree
Showing 6 changed files with 553 additions and 485 deletions.
933 changes: 482 additions & 451 deletions data/training/utterances.yaml

Large diffs are not rendered by default.

30 changes: 19 additions & 11 deletions moviebot/nlu/annotation/joint_bert/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,19 +106,21 @@ def _build_dataset(self) -> None:
labels = labels + ([_IGNORE_INDEX] * padding_length)
self.examples.append((input_ids, attention_mask, intent, labels))

def _num_word_tokens(self, word: str) -> int:
"""Returns the number of word tokens in the input word.
def _num_inside_word_tokens(self, word: str) -> int:
"""Returns the number of inside word tokens in the input word.
I.e., the number of non-starting tokens in the word.
Args:
word: The input word.
Returns:
The number of inside (non-starting) word tokens in the input word.
"""
return len(self.tokenizer.tokenize(word))
return len(self.tokenizer.tokenize(word)) - 1

def _tokenize_and_label(
self, intent: str, text: str, slot_annotations: Tuple(str, str)
self, intent: str, text: str, slot_annotations: Tuple[str, str]
) -> Tuple[int, List[str], List[int]]:
"""Tokenizes the text and assigns labels based on slot annotations.
Expand Down Expand Up @@ -153,20 +155,26 @@ def _tokenize_and_label(
index = text.find(slot_text)
for word in text[start_idx:index].split():
labels.append(JointBERTSlot.to_index("OUT"))
labels.extend([_IGNORE_INDEX] * self._num_word_tokens(word) - 1)
labels.extend(
[_IGNORE_INDEX] * self._num_inside_word_tokens(word)
)

for i, word in enumerate(slot_text.split()):
labels.append(
JointBERTSlot.to_index(
("B_" if i == 0 else "I_") + slot_label.upper()
)
slot = (
("B" if i == 0 else "I")
+ ("_INQUIRE_" if intent == "INQUIRE" else "_PREFERENCE_")
+ slot_label.upper()
)

labels.append(JointBERTSlot.to_index(slot))
labels.extend(
[_IGNORE_INDEX] * self._num_inside_word_tokens(word)
)
labels.extend([_IGNORE_INDEX] * self._num_word_tokens(word) - 1)
start_idx = index + len(slot_text)

for word in text[start_idx:].split():
labels.append(JointBERTSlot.to_index("OUT"))
labels.extend([_IGNORE_INDEX] * self._num_word_tokens(word) - 1)
labels.extend([_IGNORE_INDEX] * self._num_inside_word_tokens(word))
assert len(tokens) == len(labels)
return JointBERTIntent.to_index(intent.upper()), tokens, labels

Expand Down
9 changes: 5 additions & 4 deletions moviebot/nlu/annotation/joint_bert/joint_bert_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
from transformers import get_linear_schedule_with_warmup

from moviebot.nlu.annotation.joint_bert import JointBERT
from moviebot.nlu.annotation.joint_bert.dataset import JointBERTDataset
from moviebot.nlu.annotation.joint_bert.dataset import (
_IGNORE_INDEX,
JointBERTDataset,
)
from moviebot.nlu.annotation.joint_bert.slot_mapping import (
JointBERTIntent,
JointBERTSlot,
Expand All @@ -22,9 +25,7 @@
Batch = Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]

_MODEL_OUTPUT_PATH = "models/joint_bert"
_DATA_PATH = "data/training/nlu/utterances.yaml"

_IGNORE_INDEX = -100
_DATA_PATH = "data/training/utterances.yaml"


class JointBERTTrain(JointBERT, pl.LightningModule):
Expand Down
42 changes: 29 additions & 13 deletions moviebot/nlu/annotation/joint_bert/slot_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,34 @@ class JointBERTIntent(EnumWithMapping):

class JointBERTSlot(EnumWithMapping):
OUT = auto()
B_MODIFIER = auto()
I_MODIFIER = auto()
B_GENRES = auto()
I_GENRES = auto()
B_ACTOR = auto()
I_ACTOR = auto()
B_DIRECTOR = auto()
I_DIRECTOR = auto()
B_KEYWORD = auto()
I_KEYWORD = auto()
B_YEAR = auto()
I_YEAR = auto()
B_PREFERENCE_MODIFIER = auto()
I_PREFERENCE_MODIFIER = auto()
B_PREFERENCE_MODIFIER_YEAR = auto()
I_PREFERENCE_MODIFIER_YEAR = auto()
B_PREFERENCE_GENRES = auto()
I_PREFERENCE_GENRES = auto()
B_PREFERENCE_ACTORS = auto()
I_PREFERENCE_ACTORS = auto()
B_PREFERENCE_DIRECTORS = auto()
I_PREFERENCE_DIRECTORS = auto()
B_PREFERENCE_KEYWORDS = auto()
I_PREFERENCE_KEYWORDS = auto()
B_PREFERENCE_YEAR = auto()
I_PREFERENCE_YEAR = auto()
B_INQUIRE_GENRES = auto()
I_INQUIRE_GENRES = auto()
B_INQUIRE_RATING = auto()
I_INQUIRE_RATING = auto()
B_INQUIRE_DURATION = auto()
I_INQUIRE_DURATION = auto()
B_INQUIRE_PLOT = auto()
I_INQUIRE_PLOT = auto()
B_INQUIRE_ACTORS = auto()
I_INQUIRE_ACTORS = auto()
B_INQUIRE_DIRECTORS = auto()
I_INQUIRE_DIRECTORS = auto()
B_INQUIRE_YEAR = auto()
I_INQUIRE_YEAR = auto()

def is_start(self) -> bool:
"""Returns True if the slot is a starting point for a slot value."""
Expand All @@ -56,5 +72,5 @@ def is_inside(self) -> bool:
if __name__ == "__main__":
print(JointBERTIntent.to_index("REVEAL"))
print(JointBERTIntent.from_index(0))
print(JointBERTSlot.to_index("B_MODIFIER"))
print(JointBERTSlot.to_index("B_PREFERENCE_MODIFIER"))
print(JointBERTSlot.from_index(1))
21 changes: 16 additions & 5 deletions moviebot/nlu/neural_nlu.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,22 +61,32 @@ def generate_dacts(
return selected_option

intent, slots = self.annotate_utterance(user_utterance)
intent = UserIntents[intent]

constraints = []
operator = None
for slot in slots:
if slot["slot"] == "MODIFIER":
if "MODIFIER" in slot["slot"]:
operator = self.get_constraint_operator(slot["value"])
continue

if intent == UserIntents.INQUIRE and "INQUIRE" not in slot["slot"]:
continue
if (
intent in (UserIntents.REMOVE_PREFERENCE, UserIntents.INQUIRE)
and "PREFERENCE" not in slot["slot"]
):
continue
slot_name = Slots[slot["slot"].split("_")[-1]].value
constraints.append(
ItemConstraint(
Slots[slot["slot"]].value,
slot_name,
op=operator or Operator.EQ,
value=slot["value"],
)
)

return [DialogueAct(UserIntents[intent], constraints)]
return [DialogueAct(intent, constraints)]

def annotate_utterance(
self, user_utterance: UserUtterance
Expand All @@ -100,6 +110,7 @@ def annotate_utterance(
return_tensors="pt",
)
intent_idx, slot_idxs = self._model.predict(encoding["input_ids"])
intent = JointBERTIntent.from_index(intent_idx).name

# [1:-1] to remove [CLS] and [SEP] tokens
offset_mapping = encoding["offset_mapping"][0, 1:-1][mask].tolist()
Expand Down Expand Up @@ -134,7 +145,7 @@ def annotate_utterance(
}
)

return JointBERTIntent.from_index(intent_idx).name, slots_info
return intent, slots_info

def get_constraint_operator(self, text: str) -> Operator:
"""Gets the operator based on the text. Only supports negation for now.
Expand All @@ -150,7 +161,7 @@ def get_constraint_operator(self, text: str) -> Operator:


if __name__ == "__main__":
nlu = NeuralNLU({})
nlu = NeuralNLU(None)

class DS:
item_in_focus = None
Expand Down
3 changes: 2 additions & 1 deletion moviebot/nlu/nlu.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def __init__(self, config: Dict[str, Any]) -> None:
config: Paths to ontology, database and tag words for slots in NLU.
"""
self.config = config
self.intents_checker = UserIntentsChecker(config)
if config:
self.intents_checker = UserIntentsChecker(config)

@abstractmethod
def generate_dacts(
Expand Down

0 comments on commit b0c7368

Please sign in to comment.