diff --git a/moviebot/dialogue_manager/dialogue_manager.py b/moviebot/dialogue_manager/dialogue_manager.py index fdc1d17..e30d059 100644 --- a/moviebot/dialogue_manager/dialogue_manager.py +++ b/moviebot/dialogue_manager/dialogue_manager.py @@ -11,7 +11,9 @@ from moviebot.core.intents.agent_intents import AgentIntents from moviebot.dialogue_manager.dialogue_act import DialogueAct -from moviebot.dialogue_manager.dialogue_policy import DialoguePolicy +from moviebot.dialogue_manager.dialogue_policy.rb_dialogue_policy import ( + RuleBasedDialoguePolicy, +) from moviebot.dialogue_manager.dialogue_state import DialogueState from moviebot.dialogue_manager.dialogue_state_tracker import ( DialogueStateTracker, @@ -36,7 +38,9 @@ def __init__( self.isBot = isBot self.new_user = new_user self.dialogue_state_tracker = DialogueStateTracker(config, self.isBot) - self.dialogue_policy = DialoguePolicy(self.isBot, self.new_user) + self.dialogue_policy = RuleBasedDialoguePolicy( + self.isBot, self.new_user + ) self.recommender: RecommenderModel = config.get("recommender") def start_dialogue(self, new_user: bool = False) -> List[DialogueAct]: diff --git a/moviebot/dialogue_manager/dialogue_policy/__init__.py b/moviebot/dialogue_manager/dialogue_policy/__init__.py new file mode 100644 index 0000000..e1a9070 --- /dev/null +++ b/moviebot/dialogue_manager/dialogue_policy/__init__.py @@ -0,0 +1,19 @@ +from moviebot.dialogue_manager.dialogue_policy.a2c_dialogue_policy import ( + A2CDialoguePolicy, +) +from moviebot.dialogue_manager.dialogue_policy.dqn_dialogue_policy import ( + DQNDialoguePolicy, +) +from moviebot.dialogue_manager.dialogue_policy.neural_dialogue_policy import ( + NeuralDialoguePolicy, +) +from moviebot.dialogue_manager.dialogue_policy.rb_dialogue_policy import ( + RuleBasedDialoguePolicy, +) + +__all__ = [ + "A2CDialoguePolicy", + "DQNDialoguePolicy", + "NeuralDialoguePolicy", + "RuleBasedDialoguePolicy", +] diff --git a/moviebot/dialogue_manager/dialogue_policy/a2c_dialogue_policy.py b/moviebot/dialogue_manager/dialogue_policy/a2c_dialogue_policy.py new file mode 100644 index 0000000..6681f46 --- /dev/null +++ b/moviebot/dialogue_manager/dialogue_policy/a2c_dialogue_policy.py @@ -0,0 +1,206 @@ +"""Deep dialogue policy based on advantage actor-critic.""" +from __future__ import annotations + +from typing import Any, List, Optional, Tuple + +import torch + +from moviebot.dialogue_manager.dialogue_policy.neural_dialogue_policy import ( + NeuralDialoguePolicy, +) + + +class A2CDialoguePolicy(NeuralDialoguePolicy): + def __init__( + self, + input_size: int, + hidden_size: int, + output_size: int, + possible_actions: List[Any], + num_timesteps: Optional[int] = None, + n_envs: int = 1, + ) -> None: + """Initializes the policy. + + Args: + input_size: The size of the input vector. + hidden_size: The size of the hidden layer. + output_size: The size of the output vector. + possible_actions: The list of possible actions. + num_timesteps: The number of timesteps. Defaults to None. + n_envs: The number of environments. Defaults to 1. 
+ """ + super().__init__(input_size, hidden_size, output_size, possible_actions) + + self.n_envs = n_envs + + self.actor = torch.nn.Sequential( + torch.nn.Linear(input_size, hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(hidden_size, output_size), + ) + + self.critic = torch.nn.Sequential( + torch.nn.Linear(input_size, hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(hidden_size, 1), + ) + + self.actor_optimizer = torch.optim.Adam( + self.actor.parameters(), lr=0.001 + ) + self.actor_lr_scheduler = None + self.critic_optimizer = torch.optim.Adam( + self.critic.parameters(), lr=0.005 + ) + self.critic_lr_scheduler = None + + if num_timesteps is not None: + self.actor_lr_scheduler = torch.optim.lr_scheduler.LinearLR( + self.actor_optimizer, total_iters=num_timesteps + ) + self.critic_lr_scheduler = torch.optim.lr_scheduler.LinearLR( + self.critic_optimizer, total_iters=num_timesteps + ) + + def forward(self, state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Forward pass. + + Args: + state: A batched vector of dialogue states. + + Returns: + The output of the actor and the critic. + """ + state_values = self.critic(state) + actions_log_probs = self.actor(state) + return state_values, actions_log_probs + + def select_action( + self, state: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Returns the selected action and its log probability. + + Args: + state: Representation of dialogue state as a vector. + + Returns: + The selected action id, its log probability, the state value, and + the entropy. + """ + state_value, actions_log_prob = self.forward(state) + actions_distribution = torch.distributions.Categorical( + logits=actions_log_prob + ) + action = actions_distribution.sample() + actions_log_prob = actions_distribution.log_prob(action) + entropy = actions_distribution.entropy() + return action, actions_log_prob, state_value, entropy + + def get_losses( + self, + rewards: torch.Tensor, + action_log_probs: torch.Tensor, + value_preds: torch.Tensor, + entropy: torch.Tensor, + mask: torch.Tensor, + gamma: float = 0.99, + lam: float = 0.95, + entropy_coef: float = 0.01, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Computes the loss of a minibatch using the generalized advantage + estimator. + + Args: + rewards: The rewards. + action_log_probs: The log probabilities of the actions. + value_preds: The predicted values. + entropy: The entropy. + mask: The mask. + gamma: The discount factor. Defaults to 0.99. + lam: The GAE parameter (1 for Monte-Carlo sampling, 0 for normal + TD-learning). Defaults to 0.95. + entropy_coef: The entropy coefficient. Defaults to 0.01. + + Returns: + The critic and actor losses for the minibatch. + """ + T = len(rewards) + advantages = torch.zeros(T, self.n_envs) + + # Compute advantages with GAE + gae = 0.0 + for t in reversed(range(T - 1)): + td_error = ( + rewards[t] + + gamma * mask[t] * value_preds[t + 1] + - value_preds[t] + ) + gae = td_error + gamma * lam * mask[t] * gae + advantages[t] = gae + + # Compute losses + critic_loss = advantages.pow(2).mean() + actor_loss = ( + -(advantages.detach() * action_log_probs).mean() + - entropy_coef * entropy.mean() + ) + return critic_loss, actor_loss + + def update_parameters( + self, critic_loss: torch.Tensor, actor_loss: torch.Tensor + ) -> None: + """Updates the parameters of the policy. + + Args: + critic_loss: The critic loss. + actor_loss: The actor loss. 
+ """ + self.critic_optimizer.zero_grad() + critic_loss.backward() + self.critic_optimizer.step() + if self.critic_lr_scheduler is not None: + self.critic_lr_scheduler.step() + + self.actor_optimizer.zero_grad() + actor_loss.backward() + self.actor_optimizer.step() + if self.actor_lr_scheduler is not None: + self.actor_lr_scheduler.step() + + def save_policy(self, path: str) -> None: + """Saves the policy. + + Args: + path: The path to save the policy to. + """ + state_dict = { + "actor": self.actor.state_dict(), + "critic": self.critic.state_dict(), + "input_size": self.input_size, + "hidden_size": self.hidden_size, + "output_size": self.output_size, + "possible_actions": self.possible_actions, + } + torch.save(state_dict, path) + + @classmethod + def load_policy(cls, path: str) -> A2CDialoguePolicy: + """Loads the policy. + + Args: + path: The path to load the policy from. + + Returns: + The loaded policy. + """ + state_dict = torch.load(path) + policy = cls( + state_dict["input_size"], + state_dict["hidden_size"], + state_dict["output_size"], + state_dict["possible_actions"], + ) + policy.actor.load_state_dict(state_dict["actor"]) + policy.critic.load_state_dict(state_dict["critic"]) + return policy diff --git a/moviebot/dialogue_manager/dialogue_policy/dqn_dialogue_policy.py b/moviebot/dialogue_manager/dialogue_policy/dqn_dialogue_policy.py new file mode 100644 index 0000000..2ea204c --- /dev/null +++ b/moviebot/dialogue_manager/dialogue_policy/dqn_dialogue_policy.py @@ -0,0 +1,98 @@ +"""Deep dialogue policy based on Q network.""" + +from __future__ import annotations + +from typing import Any, List, Tuple + +import torch + +from moviebot.dialogue_manager.dialogue_policy.neural_dialogue_policy import ( + NeuralDialoguePolicy, +) + + +class DQNDialoguePolicy(NeuralDialoguePolicy): + def __init__( + self, + input_size: int, + hidden_size: int, + output_size: int, + possible_actions: List[Any], + ) -> None: + """Initializes the policy. + + Args: + input_size: The size of the input vector. + hidden_size: The size of the hidden layer. + output_size: The size of the output vector. + possible_actions: The list of possible actions. + """ + super().__init__(input_size, hidden_size, output_size, possible_actions) + + self.model = torch.nn.Sequential( + torch.nn.Linear(input_size, hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(hidden_size, hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(hidden_size, output_size), + ) + + def forward(self, state: torch.Tensor) -> torch.Tensor: + """Forward pass of the policy. + + Args: + state: State or batch of states. + + Returns: + Next action(s) probabilities. + """ + return self.model(state) + + def select_action(self, state: torch.Tensor) -> Tuple[int, Any]: + """Selects an action based on the current state. + + Args: + state: The current state. + + Returns: + The id of selected action and the action. + """ + with torch.no_grad(): + action = self.model(state).max(1)[1].view(1, 1) + + return action.item(), self.possible_actions[action.item()] + + def save_policy(self, path: str) -> None: + """Saves the policy to a file. + + Args: + path: The path to save the policy to. + """ + state_dict = { + "input_size": self.input_size, + "hidden_size": self.hidden_size, + "output_size": self.output_size, + "possible_actions": self.possible_actions, + "model_state_dict": self.model.state_dict(), + } + torch.save(state_dict, path) + + @classmethod + def load_policy(cls, path: str) -> DQNDialoguePolicy: + """Loads the policy from a file. 
+ + Args: + path: The path to load the policy from. + + Returns: + The loaded policy. + """ + state_dict = torch.load(path) + policy = cls( + state_dict["input_size"], + state_dict["hidden_size"], + state_dict["output_size"], + state_dict["possible_actions"], + ) + policy.load_state_dict(state_dict["model_state_dict"]) + return policy diff --git a/moviebot/dialogue_manager/dialogue_policy/neural_dialogue_policy.py b/moviebot/dialogue_manager/dialogue_policy/neural_dialogue_policy.py new file mode 100644 index 0000000..9692166 --- /dev/null +++ b/moviebot/dialogue_manager/dialogue_policy/neural_dialogue_policy.py @@ -0,0 +1,216 @@ +"""Neural dialogue policy built on top of PyTorch.""" + +from __future__ import annotations + +from abc import abstractmethod +from typing import Any, List + +import torch +from sklearn.preprocessing import MultiLabelBinarizer + +from moviebot.core.intents.agent_intents import AgentIntents +from moviebot.core.intents.user_intents import UserIntents +from moviebot.dialogue_manager.dialogue_state import DialogueState + + +class NeuralDialoguePolicy(torch.nn.Module): + user_label_encoder = MultiLabelBinarizer().fit( + [list(map(lambda x: x.value.label, UserIntents))] + ) + agent_label_encoder = MultiLabelBinarizer().fit( + [list(map(lambda x: x.value.label, AgentIntents))] + ) + + def __init__( + self, + input_size: int, + hidden_size: int, + output_size: int, + possible_actions: List[Any], + ) -> None: + """Initializes the policy. + + Args: + input_size: The size of the input vector. + hidden_size: The size of the hidden layer. + output_size: The size of the output vector. + possible_actions: The list of possible actions. + """ + super(NeuralDialoguePolicy, self).__init__() + + self.possible_actions = possible_actions + + self.input_size = input_size + self.hidden_size = hidden_size + self.output_size = output_size + + @abstractmethod + def forward(self, state: torch.Tensor) -> torch.Tensor: + """Forward pass of the policy. + + Args: + state: State or batch of states. + + Raises: + NotImplementedError: If the method is not implemented in the + subclass. + Returns: + Output of the policy. + """ + raise NotImplementedError + + @abstractmethod + def select_action(self, state: torch.Tensor) -> Any: + """Selects an action based on the current state. + + Args: + state: The current state. + + Raises: + NotImplementedError: If the method is not implemented in the + subclass. + + Returns: + Selected action and optionally other information. + """ + raise NotImplementedError + + @abstractmethod + def save_policy(self, path: str) -> None: + """Saves the policy. + + Args: + path: Path to save the policy. + + Raises: + NotImplementedError: If the method is not implemented in the + subclass. + """ + raise NotImplementedError + + @classmethod + @abstractmethod + def load_policy(cls, path: str) -> NeuralDialoguePolicy: + """Loads the policy. + + Args: + path: Path to load the policy from. + + Raises: + NotImplementedError: If the method is not implemented in the + subclass. + + Returns: + The loaded policy. + """ + raise NotImplementedError + + @classmethod + def build_input_from_dialogue_state( + cls, dialogue_state: DialogueState, **kwargs + ) -> torch.Tensor: + """Builds the input vector from the dialogue state. + + The markovian state representation is built from booleans in the + dialogue state (e.g., a recommendation was made, the agent should make + an offer, we are at the beginning of the conversation). It can be seen + as a one-hot encoding of the state. 
+ + Args: + dialogue_state: The dialogue state. + + Returns: + Input vector for the policy (i.e., markovian state representation). + """ + dialogue_state_tensor = torch.tensor( + [ + dialogue_state.is_beginning, + dialogue_state.agent_req_filled, + dialogue_state.agent_can_lookup, + dialogue_state.agent_made_partial_offer, + dialogue_state.agent_should_make_offer, + dialogue_state.agent_made_offer, + dialogue_state.agent_offer_no_results, + dialogue_state.at_terminal_state, + ], + dtype=torch.float, + ) + return dialogue_state_tensor + + @classmethod + def _encode_intents( + cls, intents: List[Any], label_encoder: MultiLabelBinarizer + ) -> torch.Tensor: + """Encodes the intents. + + Args: + intents: Intents to encode. + label_encoder: Label encoder to use. + + Returns: + Encoded intents. + """ + if len(intents) == 0: + intents_tensor = torch.zeros( + len(label_encoder.classes_), dtype=torch.float + ) + else: + intents_tensor = torch.tensor( + label_encoder.transform( + [list(map(lambda x: x.value.label, intents))] + )[0], + dtype=torch.float, + ) + return intents_tensor + + @classmethod + def build_input_from_dialogue_state_and_intents( + cls, + dialogue_state: DialogueState, + user_intents: List[UserIntents], + agent_intents: List[AgentIntents], + **kwargs, + ) -> torch.Tensor: + """Builds the input vector from the dialogue state and previous intents. + + Args: + dialogue_state: The dialogue state. + user_intents: The user intents. + agent_intents: The agent intents. + + Returns: + The input vector. + """ + dialogue_state_tensor = cls.build_input_from_dialogue_state( + dialogue_state + ) + + user_intents_tensor = cls._encode_intents( + user_intents, cls.user_label_encoder + ) + agent_intents_tensor = cls._encode_intents( + agent_intents, cls.agent_label_encoder + ) + + return torch.cat( + [dialogue_state_tensor, user_intents_tensor, agent_intents_tensor], + dim=0, + ) + + @classmethod + def build_input( + cls, dialogue_state: DialogueState, **kwargs + ) -> torch.Tensor: + """Builds the input vector. + + Args: + dialogue_state: The dialogue state. + + Returns: + The input vector. + """ + if kwargs.get("b_use_intents", False): + return cls.build_input_from_dialogue_state_and_intents( + dialogue_state, **kwargs + ) + return cls.build_input_from_dialogue_state(dialogue_state) diff --git a/moviebot/dialogue_manager/dialogue_policy.py b/moviebot/dialogue_manager/dialogue_policy/rb_dialogue_policy.py similarity index 99% rename from moviebot/dialogue_manager/dialogue_policy.py rename to moviebot/dialogue_manager/dialogue_policy/rb_dialogue_policy.py index 177e39e..26cb398 100644 --- a/moviebot/dialogue_manager/dialogue_policy.py +++ b/moviebot/dialogue_manager/dialogue_policy/rb_dialogue_policy.py @@ -15,7 +15,7 @@ from moviebot.nlu.annotation.slots import Slots -class DialoguePolicy: +class RuleBasedDialoguePolicy: def __init__(self, isBot: bool, new_user: bool) -> None: """Loads all necessary parameters for the policy. 
diff --git a/moviebot/dialogue_manager/dialogue_state.py b/moviebot/dialogue_manager/dialogue_state.py index 97be530..1d06a06 100644 --- a/moviebot/dialogue_manager/dialogue_state.py +++ b/moviebot/dialogue_manager/dialogue_state.py @@ -39,14 +39,15 @@ def __init__( {} ) # previous information needs of the user in case user want to go back self.prev_agent_dacts: List[DialogueAct] = [] # list of agent dacts - self.last_agent_dacts: DialogueAct = ( - None # the current agent dact (singular, must be updated carefully) - ) + # the current agent dact (singular, must be updated carefully) + self.last_agent_dacts: DialogueAct = None self.last_user_dacts: List[DialogueAct] = None # the current user act # Keep track of the recommended movies self.movies_recommended = {} + self.is_beginning = True + def _agent_offer_state(self) -> str: """Returns string representation of the agent's offer state.""" offer_state = { @@ -142,3 +143,5 @@ def initialize(self) -> None: 3 # number of CIN slots which remain empty before agent must make ) # an offer + + self.is_beginning = True diff --git a/moviebot/dialogue_manager/dialogue_state_tracker.py b/moviebot/dialogue_manager/dialogue_state_tracker.py index febd153..eeb5e03 100644 --- a/moviebot/dialogue_manager/dialogue_state_tracker.py +++ b/moviebot/dialogue_manager/dialogue_state_tracker.py @@ -272,6 +272,7 @@ def update_state_agent(self, agent_dacts: List[DialogueAct]) -> None: agent_dacts: List of dialogue acts which is the output of dialogue policy. """ + self.dialogue_state.is_beginning = False # re-filtering the dacts agent_dacts_copy = deepcopy(agent_dacts) agent_dacts = [] diff --git a/tests/dialogue_manager/test_dialogue_manager.py b/tests/dialogue_manager/test_dialogue_manager.py index ee27df2..3aea99f 100644 --- a/tests/dialogue_manager/test_dialogue_manager.py +++ b/tests/dialogue_manager/test_dialogue_manager.py @@ -6,7 +6,9 @@ from moviebot.core.intents.agent_intents import AgentIntents from moviebot.dialogue_manager.dialogue_act import DialogueAct from moviebot.dialogue_manager.dialogue_manager import DialogueManager -from moviebot.dialogue_manager.dialogue_policy import DialoguePolicy +from moviebot.dialogue_manager.dialogue_policy.rb_dialogue_policy import ( + RuleBasedDialoguePolicy, +) from moviebot.dialogue_manager.dialogue_state import DialogueState from moviebot.dialogue_manager.dialogue_state_tracker import ( DialogueStateTracker, @@ -76,7 +78,7 @@ def test_generate_output(dialogue_manager: DialogueManager): @mock.patch.object( - DialoguePolicy, + RuleBasedDialoguePolicy, "next_action", return_value=[DialogueAct(AgentIntents.ACKNOWLEDGE)], ) @@ -109,7 +111,7 @@ def test_generate_output_with_lookup( @mock.patch.object( - DialoguePolicy, + RuleBasedDialoguePolicy, "next_action", ) @mock.patch.object(DialogueStateTracker, "update_state_agent") diff --git a/tests/dialogue_manager/test_dialogue_policy.py b/tests/dialogue_manager/test_dialogue_policy.py index b812002..c5b349c 100644 --- a/tests/dialogue_manager/test_dialogue_policy.py +++ b/tests/dialogue_manager/test_dialogue_policy.py @@ -6,7 +6,9 @@ from moviebot.core.intents.agent_intents import AgentIntents from moviebot.core.intents.user_intents import UserIntents from moviebot.dialogue_manager.dialogue_act import DialogueAct -from moviebot.dialogue_manager.dialogue_policy import DialoguePolicy +from moviebot.dialogue_manager.dialogue_policy.rb_dialogue_policy import ( + RuleBasedDialoguePolicy, +) from moviebot.dialogue_manager.dialogue_state import DialogueState from 
moviebot.nlu.annotation.item_constraint import ItemConstraint from moviebot.nlu.annotation.operator import Operator @@ -42,8 +44,8 @@ def state(ontology, database_results, slots) -> DialogueState: @pytest.fixture -def policy() -> DialoguePolicy: - yield DialoguePolicy(isBot=False, new_user=True) +def policy() -> RuleBasedDialoguePolicy: + yield RuleBasedDialoguePolicy(isBot=False, new_user=True) @pytest.mark.parametrize( @@ -69,7 +71,7 @@ def policy() -> DialoguePolicy: ], ) def test_next_action_basic( - policy: DialoguePolicy, + policy: RuleBasedDialoguePolicy, state: DialogueState, last_agent_dacts, last_user_dacts, @@ -82,7 +84,9 @@ def test_next_action_basic( assert agent_dacts[0].intent == expected -def test_next_action_restart(policy: DialoguePolicy, state: DialogueState): +def test_next_action_restart( + policy: RuleBasedDialoguePolicy, state: DialogueState +): agent_dacts = policy.next_action(state, restart=True) assert len(agent_dacts) == 2 assert agent_dacts[0].intent == AgentIntents.RESTART @@ -90,7 +94,7 @@ def test_next_action_restart(policy: DialoguePolicy, state: DialogueState): def test_next_action_made_partial_offer( - policy: DialoguePolicy, state: DialogueState + policy: RuleBasedDialoguePolicy, state: DialogueState ): state.agent_made_partial_offer = True @@ -105,7 +109,7 @@ def test_next_action_made_partial_offer( def test_next_action_made_partial_offer_all_slots_filled( - policy: DialoguePolicy, state: DialogueState + policy: RuleBasedDialoguePolicy, state: DialogueState ): state.agent_made_partial_offer = True state.slot_left_unasked = 10 @@ -119,7 +123,7 @@ def test_next_action_made_partial_offer_all_slots_filled( def test_next_action_should_make_offer( - policy: DialoguePolicy, state: DialogueState, database_results + policy: RuleBasedDialoguePolicy, state: DialogueState, database_results ): state.agent_should_make_offer = True state.item_in_focus = database_results[1] @@ -134,7 +138,7 @@ def test_next_action_should_make_offer( def test_next_action_inquire_empty( - policy: DialoguePolicy, state: DialogueState, database_results + policy: RuleBasedDialoguePolicy, state: DialogueState, database_results ): state.agent_made_offer = True state.item_in_focus = database_results[2] @@ -150,7 +154,7 @@ def test_next_action_inquire_empty( def test_next_action_inquire( - policy: DialoguePolicy, state: DialogueState, database_results + policy: RuleBasedDialoguePolicy, state: DialogueState, database_results ): state.agent_made_offer = True state.item_in_focus = database_results[2] @@ -170,7 +174,7 @@ def test_next_action_inquire( def test_next_action_accept_recommendation( - policy: DialoguePolicy, state: DialogueState, database_results + policy: RuleBasedDialoguePolicy, state: DialogueState, database_results ): state.agent_made_offer = True state.item_in_focus = database_results[1] @@ -206,9 +210,12 @@ def test_next_action_accept_recommendation( ], ) @mock.patch( - "moviebot.dialogue_manager.dialogue_policy.set", mock.MagicMock(wraps=list) + "moviebot.dialogue_manager.dialogue_policy.rb_dialogue_policy.set", + mock.MagicMock(wraps=list), ) -def test__generate_examples(policy: DialoguePolicy, results, slot, expected): +def test__generate_examples( + policy: RuleBasedDialoguePolicy, results, slot, expected +): random.seed(42) examples = policy._generate_examples(results, slot) assert examples == expected
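
For reference, the generalized advantage estimator implemented by A2CDialoguePolicy.get_losses corresponds to the recursion below, with discount \gamma, GAE parameter \lambda, non-terminal mask m_t, rewards r_t, and critic values V(s_t):

    \delta_t = r_t + \gamma\, m_t\, V(s_{t+1}) - V(s_t)
    A_t = \delta_t + \gamma \lambda\, m_t\, A_{t+1}
    L_{\text{critic}} = \frac{1}{T} \sum_t A_t^2
    L_{\text{actor}} = -\frac{1}{T} \sum_t \bar{A}_t \log \pi(a_t \mid s_t) - c_H \frac{1}{T} \sum_t H_t

where \bar{A}_t is the detached advantage, H_t the entropy of the action distribution, and c_H the entropy coefficient. The backward loop runs over t = T-2, ..., 0, so the advantage of the final transition is left at zero rather than bootstrapped from a next-state value.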
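
A minimal sketch of driving the A2C policy through one rollout and one update, assuming a single environment (n_envs=1) and agent intents as the action set; the state vector and reward are stubbed with placeholders, since producing them requires a running DialogueStateTracker and a user simulator that are outside this patch:

import torch

from moviebot.core.intents.agent_intents import AgentIntents
from moviebot.dialogue_manager.dialogue_policy import A2CDialoguePolicy

# Assumption: one action per agent intent; the real action inventory may differ.
possible_actions = list(AgentIntents)
policy = A2CDialoguePolicy(
    input_size=8,  # size of build_input_from_dialogue_state's vector
    hidden_size=64,
    output_size=len(possible_actions),
    possible_actions=possible_actions,
)

log_probs, values, entropies, rewards, masks = [], [], [], [], []
for _ in range(5):  # hypothetical 5-turn rollout
    # In practice: state = A2CDialoguePolicy.build_input(dialogue_state),
    # with dialogue_state taken from the DialogueStateTracker.
    state = torch.zeros(8)
    action, log_prob, value, entropy = policy.select_action(state)
    log_probs.append(log_prob)
    values.append(value)
    entropies.append(entropy)
    rewards.append(torch.tensor([0.0]))  # placeholder reward from a simulator
    masks.append(torch.tensor([1.0]))  # 0.0 on the terminal turn

critic_loss, actor_loss = policy.get_losses(
    rewards=torch.stack(rewards),
    action_log_probs=torch.stack(log_probs).unsqueeze(-1),
    value_preds=torch.stack(values),
    entropy=torch.stack(entropies),
    mask=torch.stack(masks),
)
policy.update_parameters(critic_loss, actor_loss)
policy.save_policy("a2c_policy.pt")  # hypothetical path

Note that A2CDialoguePolicy.load_policy restores the actor and critic weights but re-creates the optimizers, so learning-rate schedules do not survive a save/load round trip.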
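
The intent-aware state representation concatenates the eight boolean state features with multi-hot encodings of the previous user and agent intents, so the input size can be derived from the label encoders rather than hard-coded. A sketch with the DQN policy, again with the DialogueState itself assumed to come from the tracker:

import torch

from moviebot.core.intents.agent_intents import AgentIntents
from moviebot.dialogue_manager.dialogue_policy import (
    DQNDialoguePolicy,
    NeuralDialoguePolicy,
)

# 8 boolean state features + one slot per known user/agent intent label.
input_size = (
    8
    + len(NeuralDialoguePolicy.user_label_encoder.classes_)
    + len(NeuralDialoguePolicy.agent_label_encoder.classes_)
)
possible_actions = list(AgentIntents)  # assumption: actions are agent intents
policy = DQNDialoguePolicy(
    input_size, 64, len(possible_actions), possible_actions
)

# In practice the input comes from the tracker's state and the last intents:
# state = NeuralDialoguePolicy.build_input(
#     dialogue_state,  # from the running DialogueStateTracker
#     b_use_intents=True,
#     user_intents=last_user_intents,  # intents of the last user utterance
#     agent_intents=[AgentIntents.ACKNOWLEDGE],
# ).unsqueeze(0)
state = torch.zeros(1, input_size)  # select_action expects a batched input
action_id, action = policy.select_action(state)

DQNDialoguePolicy.select_action is purely greedy (argmax over Q-values); any exploration strategy such as epsilon-greedy would have to be implemented in the training loop.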