Add INQUIRE annotations to JointBERT (#209)

iai-group · Sep 8, 2023 · b0c7368 · b0c7368
1 parent 4fec288
commit b0c7368
Show file tree

Hide file tree

Showing 6 changed files with 553 additions and 485 deletions.
diff --git a/data/training/utterances.yaml b/data/training/utterances.yaml
diff --git a/moviebot/nlu/annotation/joint_bert/dataset.py b/moviebot/nlu/annotation/joint_bert/dataset.py
@@ -106,19 +106,21 @@ def _build_dataset(self) -> None:
             labels = labels + ([_IGNORE_INDEX] * padding_length)
             self.examples.append((input_ids, attention_mask, intent, labels))
 
-    def _num_word_tokens(self, word: str) -> int:
-        """Returns the number of word tokens in the input word.
+    def _num_inside_word_tokens(self, word: str) -> int:
+        """Returns the number of inside word tokens in the input word.
+
+        I.e., the number of non-starting tokens in the word.
 
         Args:
             word: The input word.
 
         Returns:
             The number of word tokens in the input word.
         """
-        return len(self.tokenizer.tokenize(word))
+        return len(self.tokenizer.tokenize(word)) - 1
 
     def _tokenize_and_label(
-        self, intent: str, text: str, slot_annotations: Tuple(str, str)
+        self, intent: str, text: str, slot_annotations: Tuple[str, str]
     ) -> Tuple[int, List[str], List[int]]:
         """Tokenizes the text and assigns labels based on slot annotations.
 
@@ -153,20 +155,26 @@ def _tokenize_and_label(
             index = text.find(slot_text)
             for word in text[start_idx:index].split():
                 labels.append(JointBERTSlot.to_index("OUT"))
-                labels.extend([_IGNORE_INDEX] * self._num_word_tokens(word) - 1)
+                labels.extend(
+                    [_IGNORE_INDEX] * self._num_inside_word_tokens(word)
+                )
 
             for i, word in enumerate(slot_text.split()):
-                labels.append(
-                    JointBERTSlot.to_index(
-                        ("B_" if i == 0 else "I_") + slot_label.upper()
-                    )
+                slot = (
+                    ("B" if i == 0 else "I")
+                    + ("_INQUIRE_" if intent == "INQUIRE" else "_PREFERENCE_")
+                    + slot_label.upper()
+                )
+
+                labels.append(JointBERTSlot.to_index(slot))
+                labels.extend(
+                    [_IGNORE_INDEX] * self._num_inside_word_tokens(word)
                 )
-                labels.extend([_IGNORE_INDEX] * self._num_word_tokens(word) - 1)
             start_idx = index + len(slot_text)
 
         for word in text[start_idx:].split():
             labels.append(JointBERTSlot.to_index("OUT"))
-            labels.extend([_IGNORE_INDEX] * self._num_word_tokens(word) - 1)
+            labels.extend([_IGNORE_INDEX] * self._num_inside_word_tokens(word))
         assert len(tokens) == len(labels)
         return JointBERTIntent.to_index(intent.upper()), tokens, labels
 

diff --git a/moviebot/nlu/annotation/joint_bert/joint_bert_train.py b/moviebot/nlu/annotation/joint_bert/joint_bert_train.py
@@ -13,7 +13,10 @@
 from transformers import get_linear_schedule_with_warmup
 
 from moviebot.nlu.annotation.joint_bert import JointBERT
-from moviebot.nlu.annotation.joint_bert.dataset import JointBERTDataset
+from moviebot.nlu.annotation.joint_bert.dataset import (
+    _IGNORE_INDEX,
+    JointBERTDataset,
+)
 from moviebot.nlu.annotation.joint_bert.slot_mapping import (
     JointBERTIntent,
     JointBERTSlot,
@@ -22,9 +25,7 @@
 Batch = Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
 
 _MODEL_OUTPUT_PATH = "models/joint_bert"
-_DATA_PATH = "data/training/nlu/utterances.yaml"
-
-_IGNORE_INDEX = -100
+_DATA_PATH = "data/training/utterances.yaml"
 
 
 class JointBERTTrain(JointBERT, pl.LightningModule):

diff --git a/moviebot/nlu/annotation/joint_bert/slot_mapping.py b/moviebot/nlu/annotation/joint_bert/slot_mapping.py
@@ -31,18 +31,34 @@ class JointBERTIntent(EnumWithMapping):
 
 class JointBERTSlot(EnumWithMapping):
     OUT = auto()
-    B_MODIFIER = auto()
-    I_MODIFIER = auto()
-    B_GENRES = auto()
-    I_GENRES = auto()
-    B_ACTOR = auto()
-    I_ACTOR = auto()
-    B_DIRECTOR = auto()
-    I_DIRECTOR = auto()
-    B_KEYWORD = auto()
-    I_KEYWORD = auto()
-    B_YEAR = auto()
-    I_YEAR = auto()
+    B_PREFERENCE_MODIFIER = auto()
+    I_PREFERENCE_MODIFIER = auto()
+    B_PREFERENCE_MODIFIER_YEAR = auto()
+    I_PREFERENCE_MODIFIER_YEAR = auto()
+    B_PREFERENCE_GENRES = auto()
+    I_PREFERENCE_GENRES = auto()
+    B_PREFERENCE_ACTORS = auto()
+    I_PREFERENCE_ACTORS = auto()
+    B_PREFERENCE_DIRECTORS = auto()
+    I_PREFERENCE_DIRECTORS = auto()
+    B_PREFERENCE_KEYWORDS = auto()
+    I_PREFERENCE_KEYWORDS = auto()
+    B_PREFERENCE_YEAR = auto()
+    I_PREFERENCE_YEAR = auto()
+    B_INQUIRE_GENRES = auto()
+    I_INQUIRE_GENRES = auto()
+    B_INQUIRE_RATING = auto()
+    I_INQUIRE_RATING = auto()
+    B_INQUIRE_DURATION = auto()
+    I_INQUIRE_DURATION = auto()
+    B_INQUIRE_PLOT = auto()
+    I_INQUIRE_PLOT = auto()
+    B_INQUIRE_ACTORS = auto()
+    I_INQUIRE_ACTORS = auto()
+    B_INQUIRE_DIRECTORS = auto()
+    I_INQUIRE_DIRECTORS = auto()
+    B_INQUIRE_YEAR = auto()
+    I_INQUIRE_YEAR = auto()
 
     def is_start(self) -> bool:
         """Returns True if the slot is a starting point for a slot value."""
@@ -56,5 +72,5 @@ def is_inside(self) -> bool:
 if __name__ == "__main__":
     print(JointBERTIntent.to_index("REVEAL"))
     print(JointBERTIntent.from_index(0))
-    print(JointBERTSlot.to_index("B_MODIFIER"))
+    print(JointBERTSlot.to_index("B_PREFERENCE_MODIFIER"))
     print(JointBERTSlot.from_index(1))
diff --git a/moviebot/nlu/neural_nlu.py b/moviebot/nlu/neural_nlu.py
@@ -61,22 +61,32 @@ def generate_dacts(
             return selected_option
 
         intent, slots = self.annotate_utterance(user_utterance)
+        intent = UserIntents[intent]
 
         constraints = []
         operator = None
         for slot in slots:
-            if slot["slot"] == "MODIFIER":
+            if "MODIFIER" in slot["slot"]:
                 operator = self.get_constraint_operator(slot["value"])
                 continue
+
+            if intent == UserIntents.INQUIRE and "INQUIRE" not in slot["slot"]:
+                continue
+            if (
+                intent in (UserIntents.REMOVE_PREFERENCE, UserIntents.INQUIRE)
+                and "PREFERENCE" not in slot["slot"]
+            ):
+                continue
+            slot_name = Slots[slot["slot"].split("_")[-1]].value
             constraints.append(
                 ItemConstraint(
-                    Slots[slot["slot"]].value,
+                    slot_name,
                     op=operator or Operator.EQ,
                     value=slot["value"],
                 )
             )
 
-        return [DialogueAct(UserIntents[intent], constraints)]
+        return [DialogueAct(intent, constraints)]
 
     def annotate_utterance(
         self, user_utterance: UserUtterance
@@ -100,6 +110,7 @@ def annotate_utterance(
             return_tensors="pt",
         )
         intent_idx, slot_idxs = self._model.predict(encoding["input_ids"])
+        intent = JointBERTIntent.from_index(intent_idx).name
 
         # [1:-1] to remove [CLS] and [SEP] tokens
         offset_mapping = encoding["offset_mapping"][0, 1:-1][mask].tolist()
@@ -134,7 +145,7 @@ def annotate_utterance(
                 }
             )
 
-        return JointBERTIntent.from_index(intent_idx).name, slots_info
+        return intent, slots_info
 
     def get_constraint_operator(self, text: str) -> Operator:
         """Gets the operator based on the text. Only supports negation for now.
@@ -150,7 +161,7 @@ def get_constraint_operator(self, text: str) -> Operator:
 
 
 if __name__ == "__main__":
-    nlu = NeuralNLU({})
+    nlu = NeuralNLU(None)
 
     class DS:
         item_in_focus = None

diff --git a/moviebot/nlu/nlu.py b/moviebot/nlu/nlu.py
@@ -28,7 +28,8 @@ def __init__(self, config: Dict[str, Any]) -> None:
             config: Paths to ontology, database and tag words for slots in NLU.
         """
         self.config = config
-        self.intents_checker = UserIntentsChecker(config)
+        if config:
+            self.intents_checker = UserIntentsChecker(config)
 
     @abstractmethod
     def generate_dacts(