Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add INQUIRE annotations to JointBERT #209

Merged
merged 3 commits into from
Sep 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
933 changes: 482 additions & 451 deletions data/training/utterances.yaml

Large diffs are not rendered by default.

30 changes: 19 additions & 11 deletions moviebot/nlu/annotation/joint_bert/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,19 +106,21 @@ def _build_dataset(self) -> None:
labels = labels + ([_IGNORE_INDEX] * padding_length)
self.examples.append((input_ids, attention_mask, intent, labels))

def _num_word_tokens(self, word: str) -> int:
"""Returns the number of word tokens in the input word.
def _num_inside_word_tokens(self, word: str) -> int:
"""Returns the number of inside word tokens in the input word.

I.e., the number of non-starting tokens in the word.

Args:
word: The input word.

Returns:
The number of inside (non-starting) word tokens in the input word.
"""
return len(self.tokenizer.tokenize(word))
return len(self.tokenizer.tokenize(word)) - 1

def _tokenize_and_label(
self, intent: str, text: str, slot_annotations: Tuple(str, str)
self, intent: str, text: str, slot_annotations: Tuple[str, str]
) -> Tuple[int, List[str], List[int]]:
"""Tokenizes the text and assigns labels based on slot annotations.

Expand Down Expand Up @@ -153,20 +155,26 @@ def _tokenize_and_label(
index = text.find(slot_text)
for word in text[start_idx:index].split():
labels.append(JointBERTSlot.to_index("OUT"))
labels.extend([_IGNORE_INDEX] * self._num_word_tokens(word) - 1)
labels.extend(
[_IGNORE_INDEX] * self._num_inside_word_tokens(word)
)

for i, word in enumerate(slot_text.split()):
labels.append(
JointBERTSlot.to_index(
("B_" if i == 0 else "I_") + slot_label.upper()
)
slot = (
("B" if i == 0 else "I")
+ ("_INQUIRE_" if intent == "INQUIRE" else "_PREFERENCE_")
+ slot_label.upper()
)

labels.append(JointBERTSlot.to_index(slot))
labels.extend(
[_IGNORE_INDEX] * self._num_inside_word_tokens(word)
)
labels.extend([_IGNORE_INDEX] * self._num_word_tokens(word) - 1)
start_idx = index + len(slot_text)

for word in text[start_idx:].split():
labels.append(JointBERTSlot.to_index("OUT"))
labels.extend([_IGNORE_INDEX] * self._num_word_tokens(word) - 1)
labels.extend([_IGNORE_INDEX] * self._num_inside_word_tokens(word))
NoB0 marked this conversation as resolved.
Show resolved Hide resolved
assert len(tokens) == len(labels)
return JointBERTIntent.to_index(intent.upper()), tokens, labels

Expand Down
9 changes: 5 additions & 4 deletions moviebot/nlu/annotation/joint_bert/joint_bert_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
from transformers import get_linear_schedule_with_warmup

from moviebot.nlu.annotation.joint_bert import JointBERT
from moviebot.nlu.annotation.joint_bert.dataset import JointBERTDataset
from moviebot.nlu.annotation.joint_bert.dataset import (
_IGNORE_INDEX,
JointBERTDataset,
)
from moviebot.nlu.annotation.joint_bert.slot_mapping import (
JointBERTIntent,
JointBERTSlot,
Expand All @@ -22,9 +25,7 @@
Batch = Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]

_MODEL_OUTPUT_PATH = "models/joint_bert"
_DATA_PATH = "data/training/nlu/utterances.yaml"

_IGNORE_INDEX = -100
_DATA_PATH = "data/training/utterances.yaml"


class JointBERTTrain(JointBERT, pl.LightningModule):
Expand Down
42 changes: 29 additions & 13 deletions moviebot/nlu/annotation/joint_bert/slot_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,34 @@ class JointBERTIntent(EnumWithMapping):

class JointBERTSlot(EnumWithMapping):
OUT = auto()
B_MODIFIER = auto()
I_MODIFIER = auto()
B_GENRES = auto()
I_GENRES = auto()
B_ACTOR = auto()
I_ACTOR = auto()
B_DIRECTOR = auto()
I_DIRECTOR = auto()
B_KEYWORD = auto()
I_KEYWORD = auto()
B_YEAR = auto()
I_YEAR = auto()
B_PREFERENCE_MODIFIER = auto()
I_PREFERENCE_MODIFIER = auto()
B_PREFERENCE_MODIFIER_YEAR = auto()
I_PREFERENCE_MODIFIER_YEAR = auto()
B_PREFERENCE_GENRES = auto()
I_PREFERENCE_GENRES = auto()
B_PREFERENCE_ACTORS = auto()
I_PREFERENCE_ACTORS = auto()
B_PREFERENCE_DIRECTORS = auto()
I_PREFERENCE_DIRECTORS = auto()
B_PREFERENCE_KEYWORDS = auto()
I_PREFERENCE_KEYWORDS = auto()
B_PREFERENCE_YEAR = auto()
I_PREFERENCE_YEAR = auto()
B_INQUIRE_GENRES = auto()
I_INQUIRE_GENRES = auto()
B_INQUIRE_RATING = auto()
I_INQUIRE_RATING = auto()
B_INQUIRE_DURATION = auto()
I_INQUIRE_DURATION = auto()
B_INQUIRE_PLOT = auto()
I_INQUIRE_PLOT = auto()
B_INQUIRE_ACTORS = auto()
I_INQUIRE_ACTORS = auto()
B_INQUIRE_DIRECTORS = auto()
I_INQUIRE_DIRECTORS = auto()
B_INQUIRE_YEAR = auto()
I_INQUIRE_YEAR = auto()

def is_start(self) -> bool:
"""Returns True if the slot is a starting point for a slot value."""
Expand All @@ -56,5 +72,5 @@ def is_inside(self) -> bool:
if __name__ == "__main__":
print(JointBERTIntent.to_index("REVEAL"))
print(JointBERTIntent.from_index(0))
print(JointBERTSlot.to_index("B_MODIFIER"))
print(JointBERTSlot.to_index("B_PREFERENCE_MODIFIER"))
print(JointBERTSlot.from_index(1))
21 changes: 16 additions & 5 deletions moviebot/nlu/neural_nlu.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,22 +61,32 @@ def generate_dacts(
return selected_option

intent, slots = self.annotate_utterance(user_utterance)
intent = UserIntents[intent]

constraints = []
operator = None
for slot in slots:
if slot["slot"] == "MODIFIER":
if "MODIFIER" in slot["slot"]:
operator = self.get_constraint_operator(slot["value"])
continue

if intent == UserIntents.INQUIRE and "INQUIRE" not in slot["slot"]:
continue
if (
intent in (UserIntents.REMOVE_PREFERENCE, UserIntents.INQUIRE)
and "PREFERENCE" not in slot["slot"]
):
continue
slot_name = Slots[slot["slot"].split("_")[-1]].value
constraints.append(
ItemConstraint(
Slots[slot["slot"]].value,
slot_name,
op=operator or Operator.EQ,
value=slot["value"],
)
)

return [DialogueAct(UserIntents[intent], constraints)]
return [DialogueAct(intent, constraints)]

def annotate_utterance(
self, user_utterance: UserUtterance
Expand All @@ -100,6 +110,7 @@ def annotate_utterance(
return_tensors="pt",
)
intent_idx, slot_idxs = self._model.predict(encoding["input_ids"])
intent = JointBERTIntent.from_index(intent_idx).name

# [1:-1] to remove [CLS] and [SEP] tokens
offset_mapping = encoding["offset_mapping"][0, 1:-1][mask].tolist()
Expand Down Expand Up @@ -134,7 +145,7 @@ def annotate_utterance(
}
)

return JointBERTIntent.from_index(intent_idx).name, slots_info
return intent, slots_info

def get_constraint_operator(self, text: str) -> Operator:
"""Gets the operator based on the text. Only supports negation for now.
Expand All @@ -150,7 +161,7 @@ def get_constraint_operator(self, text: str) -> Operator:


if __name__ == "__main__":
nlu = NeuralNLU({})
nlu = NeuralNLU(None)

class DS:
item_in_focus = None
Expand Down
3 changes: 2 additions & 1 deletion moviebot/nlu/nlu.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def __init__(self, config: Dict[str, Any]) -> None:
config: Paths to ontology, database and tag words for slots in NLU.
"""
self.config = config
self.intents_checker = UserIntentsChecker(config)
if config:
self.intents_checker = UserIntentsChecker(config)

@abstractmethod
def generate_dacts(
Expand Down
Loading