diff --git a/src/learn_to_pick/__init__.py b/src/learn_to_pick/__init__.py
index a6894b3..dcdb105 100644
--- a/src/learn_to_pick/__init__.py
+++ b/src/learn_to_pick/__init__.py
@@ -53,5 +53,4 @@ def configure_logger() -> None:
     "VwPolicy",
     "VwLogger",
     "embed",
-    "stringify_embedding",
 ]
diff --git a/src/learn_to_pick/base.py b/src/learn_to_pick/base.py
index 6612b75..e1a85aa 100644
--- a/src/learn_to_pick/base.py
+++ b/src/learn_to_pick/base.py
@@ -13,11 +13,13 @@
     Type,
     TypeVar,
     Union,
+    Callable,
 )

 from learn_to_pick.metrics import MetricsTrackerAverage, MetricsTrackerRollingWindow
 from learn_to_pick.model_repository import ModelRepository
 from learn_to_pick.vw_logger import VwLogger
+from learn_to_pick.features import Featurized, DenseFeatures, SparseFeatures

 if TYPE_CHECKING:
     import vowpal_wabbit_next as vw
@@ -87,10 +89,6 @@ def EmbedAndKeep(anything: Any) -> Any:
 # helper functions


-def _stringify_embedding(embedding: List) -> str:
-    return " ".join([f"{i}:{e}" for i, e in enumerate(embedding)])
-
-
 def _parse_lines(parser: "vw.TextFormatParser", input_str: str) -> List["vw.Example"]:
     return [parser.parse_line(line) for line in input_str.split("\n")]

@@ -108,7 +106,7 @@ def get_based_on_and_to_select_from(inputs: Dict[str, Any]) -> Tuple[Dict, Dict]
     )

     based_on = {
-        k: inputs[k].value if isinstance(inputs[k].value, list) else [inputs[k].value]
+        k: inputs[k].value if isinstance(inputs[k].value, list) else inputs[k].value
         for k in inputs.keys()
         if isinstance(inputs[k], _BasedOn)
     }
@@ -165,36 +163,38 @@ def __init__(
         model_repo: ModelRepository,
         vw_cmd: List[str],
         featurizer: Featurizer,
+        formatter: Callable,
         vw_logger: VwLogger,
-        *args: Any,
         **kwargs: Any,
     ):
-        super().__init__(*args, **kwargs)
+        super().__init__(**kwargs)
         self.model_repo = model_repo
         self.vw_cmd = vw_cmd
         self.workspace = self.model_repo.load(vw_cmd)
         self.featurizer = featurizer
+        self.formatter = formatter
         self.vw_logger = vw_logger

+    def format(self, event):
+        return self.formatter(*self.featurizer.featurize(event))
+
     def predict(self, event: TEvent) -> Any:
         import vowpal_wabbit_next as vw

         text_parser = vw.TextFormatParser(self.workspace)
-        return self.workspace.predict_one(
-            _parse_lines(text_parser, self.featurizer.format(event))
-        )
+        return self.workspace.predict_one(_parse_lines(text_parser, self.format(event)))

     def learn(self, event: TEvent) -> None:
         import vowpal_wabbit_next as vw

-        vw_ex = self.featurizer.format(event)
+        vw_ex = self.format(event)
         text_parser = vw.TextFormatParser(self.workspace)
         multi_ex = _parse_lines(text_parser, vw_ex)
         self.workspace.learn_one(multi_ex)

     def log(self, event: TEvent) -> None:
         if self.vw_logger.logging_enabled():
-            vw_ex = self.featurizer.format(event)
+            vw_ex = self.format(event)
             self.vw_logger.log(vw_ex)

     def save(self) -> None:
@@ -206,7 +206,7 @@ def __init__(self, *args: Any, **kwargs: Any):
         pass

     @abstractmethod
-    def format(self, event: TEvent) -> Any:
+    def featurize(self, event: TEvent) -> Any:
         ...
@@ -486,70 +486,59 @@ def run(self, *args, **kwargs) -> Dict[str, Any]:


 def _embed_string_type(
-    item: Union[str, _Embed], model: Any, namespace: Optional[str] = None
-) -> Dict[str, Union[str, List[str]]]:
+    item: Union[str, _Embed], model: Any, namespace: str
+) -> Featurized:
     """Helper function to embed a string or an _Embed object."""
     import re

-    keep_str = ""
+    result = Featurized()
     if isinstance(item, _Embed):
-        encoded = _stringify_embedding(model.encode(item.value))
-        # TODO these should be moved to pick_best
+        result[namespace] = DenseFeatures(model.encode(item.value))
         if item.keep:
-            keep_str = item.value.replace(" ", "_") + " "
-            keep_str = re.sub(r"[\t\n\r\f\v]+", " ", keep_str)
+            keep_str = item.value.replace(" ", "_")
+            result[namespace] = {"default_ft": re.sub(r"[\t\n\r\f\v]+", " ", keep_str)}
     elif isinstance(item, str):
         encoded = item.replace(" ", "_")
-        encoded = re.sub(r"[\t\n\r\f\v]+", " ", encoded)
+        result[namespace] = {"default_ft": re.sub(r"[\t\n\r\f\v]+", " ", encoded)}
     else:
         raise ValueError(f"Unsupported type {type(item)} for embedding")

-    if namespace is None:
-        raise ValueError(
-            "The default namespace must be provided when embedding a string or _Embed object."
-        )
-
-    return {namespace: keep_str + encoded}
+    return result


-def _embed_dict_type(item: Dict, model: Any) -> Dict[str, Any]:
+def _embed_dict_type(item: Dict, model: Any) -> Featurized:
     """Helper function to embed a dictionary item."""
-    inner_dict: Dict = {}
+    result = Featurized()
     for ns, embed_item in item.items():
         if isinstance(embed_item, list):
-            inner_dict[ns] = []
-            for embed_list_item in embed_item:
-                embedded = _embed_string_type(embed_list_item, model, ns)
-                inner_dict[ns].append(embedded[ns])
+            for idx, embed_list_item in enumerate(embed_item):
+                result.merge(_embed_string_type(embed_list_item, model, f"{ns}_{idx}"))
         else:
-            inner_dict.update(_embed_string_type(embed_item, model, ns))
-    return inner_dict
+            result.merge(_embed_string_type(embed_item, model, ns))
+    return result


 def _embed_list_type(
     item: list, model: Any, namespace: Optional[str] = None
-) -> List[Dict[str, Union[str, List[str]]]]:
-    ret_list: List = []
+) -> List[Featurized]:
+    result = []
     for embed_item in item:
         if isinstance(embed_item, dict):
-            ret_list.append(_embed_dict_type(embed_item, model))
+            result.append(_embed_dict_type(embed_item, model))
         elif isinstance(embed_item, list):
-            item_embedding = _embed_list_type(embed_item, model, namespace)
-            # Get the first key from the first dictionary
-            first_key = next(iter(item_embedding[0]))
-            # Group the values under that key
-            grouping = {first_key: [item[first_key] for item in item_embedding]}
-            ret_list.append(grouping)
+            result.append(Featurized())
+            for idx, embed_list_item in enumerate(embed_item):
+                result[-1].merge(_embed_string_type(embed_list_item, model, f"{idx}"))
         else:
-            ret_list.append(_embed_string_type(embed_item, model, namespace))
-    return ret_list
+            result.append(_embed_string_type(embed_item, model, namespace))
+    return result


 def embed(
     to_embed: Union[Union[str, _Embed], Dict, List[Union[str, _Embed]], List[Dict]],
     model: Any,
     namespace: Optional[str] = None,
-) -> List[Dict[str, Union[str, List[str]]]]:
+) -> Union[Featurized, List[Featurized]]:
     """
     Embeds the actions or context using the SentenceTransformer model
     (or a model that has an `encode` function)
@@ -563,9 +552,9 @@ def embed(
     if (isinstance(to_embed, _Embed) and isinstance(to_embed.value, str)) or isinstance(
         to_embed, str
     ):
-        return [_embed_string_type(to_embed, model, namespace)]
+        return _embed_string_type(to_embed, model, namespace)
     elif isinstance(to_embed, dict):
-        return [_embed_dict_type(to_embed, model)]
+        return _embed_dict_type(to_embed, model)
     elif isinstance(to_embed, list):
         return _embed_list_type(to_embed, model, namespace)
     else:
diff --git a/src/learn_to_pick/features.py b/src/learn_to_pick/features.py
new file mode 100644
index 0000000..d5ded1c
--- /dev/null
+++ b/src/learn_to_pick/features.py
@@ -0,0 +1,36 @@
+from typing import Union, Optional, Dict, List
+import numpy as np
+
+
+class SparseFeatures(dict):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+
+class DenseFeatures(list):
+    def __init__(self, *args, **kwargs):
+        super().__init__(np.array(*args, **kwargs))
+
+
+class Featurized:
+    def __init__(
+        self,
+        sparse: Optional[Dict[str, SparseFeatures]] = None,
+        dense: Optional[Dict[str, DenseFeatures]] = None,
+    ):
+        self.sparse = sparse or {}
+        self.dense = dense or {}
+
+    def __setitem__(self, key, value):
+        if isinstance(value, Dict):
+            self.sparse[key] = SparseFeatures(value)
+        elif isinstance(value, List) or isinstance(value, np.ndarray):
+            self.dense[key] = DenseFeatures(value)
+        else:
+            raise ValueError(
+                f"Cannot convert {type(value)} to either DenseFeatures or SparseFeatures"
+            )
+
+    def merge(self, other):
+        self.sparse.update(other.sparse)
+        self.dense.update(other.dense)
diff --git a/src/learn_to_pick/pick_best.py b/src/learn_to_pick/pick_best.py
index e0b53fc..abf70d6 100644
--- a/src/learn_to_pick/pick_best.py
+++ b/src/learn_to_pick/pick_best.py
@@ -1,9 +1,10 @@
 from __future__ import annotations

 import logging
-from typing import Any, Dict, List, Optional, Tuple, Type, Union, Iterable
+from typing import Any, Dict, List, Optional, Tuple, Type, Union, Callable
 from itertools import chain
 import os
+import numpy as np

 from learn_to_pick import base

@@ -38,31 +39,64 @@ def __init__(
         based_on: Dict[str, Any],
         selected: Optional[PickBestSelected] = None,
     ):
-        super().__init__(inputs=inputs, selected=selected)
+        super().__init__(inputs=inputs, selected=selected or PickBestSelected())
         self.to_select_from = to_select_from
         self.based_on = based_on

+    def context(self, model) -> base.Featurized:
+        return base.embed(self.based_on or {}, model)
+
+    def actions(self, model) -> List[base.Featurized]:
+        to_select_from_var_name, to_select_from = next(
+            iter(self.to_select_from.items()), (None, None)
+        )
+
+        action_embs = (
+            (
+                base.embed(to_select_from, model, to_select_from_var_name)
+                if self.to_select_from
+                else None
+            )
+            if to_select_from
+            else None
+        )
+        if not action_embs:
+            raise ValueError(
+                "Context and to_select_from must be provided in the inputs dictionary"
+            )
+        return action_embs
+

 class VwTxt:
     @staticmethod
-    def embedding(embedding: List[float]) -> str:
-        return " ".join([f"{i}:{e}" for i, e in enumerate(embedding)])
+    def _dense_2_str(values: base.DenseFeatures) -> str:
+        return " ".join([f"{i}:{e}" for i, e in enumerate(values)])

     @staticmethod
-    def features(features: Union[str, List[str]]) -> str:
-        return " ".join(features) if isinstance(features, list) else features
+    def _sparse_2_str(values: base.SparseFeatures) -> str:
+        def _to_str(v):
+            import numbers

-    @staticmethod
-    def _namespaces(ns: Iterable[Tuple[str, Union[str, List[str]]]]):
-        return " ".join(f"|{k} {VwTxt.features(v)}" for k, v in ns)
+            return v if isinstance(v, numbers.Number) else f"={v}"
+
+        return " ".join([f"{k}:{_to_str(v)}" for k, v in values.items()])

     @staticmethod
-    def ns(ns: Union[Iterable[Tuple[str, Any]], List[Dict[str, Any]], Dict[str, Any]]):
-        if isinstance(ns, List):
-            ns = chain.from_iterable(map(dict.items, ns))
-        if isinstance(ns, Dict):
-            ns = ns.items()
-        return VwTxt._namespaces(ns)
+    def featurized_2_str(obj: base.Featurized) -> str:
+        return " ".join(
+            chain.from_iterable(
+                [
+                    map(
+                        lambda kv: f"|{kv[0]}_dense {VwTxt._dense_2_str(kv[1])}",
+                        obj.dense.items(),
+                    ),
+                    map(
+                        lambda kv: f"|{kv[0]}_sparse {VwTxt._sparse_2_str(kv[1])}",
+                        obj.sparse.items(),
+                    ),
+                ]
+            )
+        )


 class PickBestFeaturizer(base.Featurizer[PickBestEvent]):
@@ -86,141 +120,71 @@ def __init__(
         self.model = model
         self.auto_embed = auto_embed

-    def get_label(self, event: PickBestEvent) -> tuple:
-        cost = None
-        if event.selected:
-            chosen_action = event.selected.index
-            cost = (
-                -1.0 * event.selected.score
-                if event.selected.score is not None
-                else None
-            )
-            prob = event.selected.probability
-            return chosen_action, cost, prob
-        else:
-            return None, None, None
-
-    def get_context_and_action_embeddings(self, event: PickBestEvent) -> tuple:
-        context_emb = base.embed(event.based_on, self.model) if event.based_on else None
-        to_select_from_var_name, to_select_from = next(
-            iter(event.to_select_from.items()), (None, None)
-        )
-
-        action_embs = (
-            (
-                base.embed(to_select_from, self.model, to_select_from_var_name)
-                if event.to_select_from
-                else None
-            )
-            if to_select_from
-            else None
-        )
-
-        if not context_emb or not action_embs:
-            raise ValueError(
-                "Context and to_select_from must be provided in the inputs dictionary"
-            )
-        return context_emb, action_embs
+    def _dotproducts(self, context, actions):
+        _context_dense = base.Featurized()
+        for ns in context.sparse.keys():
+            if "default_ft" in context.sparse[ns]:
+                _context_dense[ns] = self.model.encode(context.sparse[ns]["default_ft"])
+
+        _actions_dense = [base.Featurized() for _ in range(len(actions))]
+        for _action, action in zip(_actions_dense, actions):
+            for ns in action.sparse.keys():
+                if "default_ft" in action.sparse[ns]:
+                    _action[ns] = self.model.encode(action.sparse[ns]["default_ft"])
+
+        context_names = list(_context_dense.dense.keys())
+        context_matrix = np.stack(list(_context_dense.dense.values()))
+        for _a, a in zip(_actions_dense, actions):
+            action_names = list(_a.dense.keys())
+            product = np.dot(context_matrix, np.stack(list(_a.dense.values())).T)
+            a["dotprod"] = {
+                f"{context_names[i]}_{action_names[j]}": product[i, j]
+                for i in range(len(context_names))
+                for j in range(len(action_names))
+            }

-    def get_indexed_dot_product(self, context_emb: List, action_embs: List) -> Dict:
-        import numpy as np
-
-        unique_contexts = set()
-        for context_item in context_emb:
-            for ns, ee in context_item.items():
-                if isinstance(ee, list):
-                    for ea in ee:
-                        unique_contexts.add(f"{ns}={ea}")
-                else:
-                    unique_contexts.add(f"{ns}={ee}")
-
-        encoded_contexts = self.model.encode(list(unique_contexts))
-        context_embeddings = dict(zip(unique_contexts, encoded_contexts))
-
-        unique_actions = set()
-        for action in action_embs:
-            for ns, e in action.items():
-                if isinstance(e, list):
-                    for ea in e:
-                        unique_actions.add(f"{ns}={ea}")
-                else:
-                    unique_actions.add(f"{ns}={e}")
-
-        encoded_actions = self.model.encode(list(unique_actions))
-        action_embeddings = dict(zip(unique_actions, encoded_actions))
-
-        action_matrix = np.stack([v for k, v in action_embeddings.items()])
-        context_matrix = np.stack([v for k, v in context_embeddings.items()])
-        dot_product_matrix = np.dot(context_matrix, action_matrix.T)
-
-        indexed_dot_product: Dict = {}
-
-        for i, context_key in enumerate(context_embeddings.keys()):
-            indexed_dot_product[context_key] = {}
-            for j, action_key in enumerate(action_embeddings.keys()):
-                indexed_dot_product[context_key][action_key] = dot_product_matrix[i, j]
-
-        return indexed_dot_product
-
-    def format_auto_embed_on(self, event: PickBestEvent) -> str:
-        chosen_action, cost, prob = self.get_label(event)
-        context_emb, action_embs = self.get_context_and_action_embeddings(event)
-        indexed_dot_product = self.get_indexed_dot_product(context_emb, action_embs)
-
-        nactions = len(action_embs)
-
-        def _tolist(v):
-            return v if isinstance(v, list) else [v]
-
-        labels = ["" for _ in range(nactions)]
-        if cost is not None:
-            labels[chosen_action] = f"{chosen_action}:{cost}:{prob} "
-
-        dotprods = [{} for _ in range(nactions)]
-        for i, action in enumerate(action_embs):
-            action["#"] = [f"{k}={v}" for k, _v in action.items() for v in _tolist(_v)]
-            dotprods[i] = [
-                v[f] for v in indexed_dot_product.values() for f in action["#"]
-            ]
-
-        actions_str = [
-            f"{l}{VwTxt.ns(a)} |dotprod {VwTxt.embedding(dp)}"
-            for l, a, dp in zip(labels, action_embs, dotprods)
-        ]
+    @staticmethod
+    def _generic_namespace(featurized):
+        result = base.SparseFeatures()
+        for ns in featurized.sparse.keys():
+            if "default_ft" in featurized.sparse[ns]:
+                result[ns] = featurized.sparse[ns]["default_ft"]
+        return result

-        for item in context_emb:
-            item["@"] = [f"{k}={v}" for k, _v in item.items() for v in _tolist(_v)]
-        shared_str = f"shared {VwTxt.ns(context_emb)}"
+    @staticmethod
+    def _generic_namespaces(context, actions):
+        context["@"] = PickBestFeaturizer._generic_namespace(context)
+        for a in actions:
+            a["#"] = PickBestFeaturizer._generic_namespace(a)

-        return "\n".join([shared_str] + actions_str)
+    def featurize(
+        self, event: PickBestEvent
+    ) -> Tuple[base.Featurized, List[base.Featurized], PickBestSelected]:
+        context = event.context(self.model)
+        actions = event.actions(self.model)

-    def format_auto_embed_off(self, event: PickBestEvent) -> str:
-        """
-        Converts the `BasedOn` and `ToSelectFrom` into a format that can be used by VW
-        """
-        chosen_action, cost, prob = self.get_label(event)
-        context_emb, action_embs = self.get_context_and_action_embeddings(event)
-        nactions = len(action_embs)
+        if self.auto_embed:
+            self._dotproducts(context, actions)
+            PickBestFeaturizer._generic_namespaces(context, actions)

-        context_str = f"shared {VwTxt.ns(context_emb)}"
+        return context, actions, event.selected

-        labels = ["" for _ in range(nactions)]
-        if cost is not None:
-            labels[chosen_action] = f"{chosen_action}:{cost}:{prob} "
-        actions_str = [f"{l}{VwTxt.ns(a)}" for a, l in zip(action_embs, labels)]
-        return "\n".join([context_str] + actions_str)

-    def format(self, event: PickBestEvent) -> str:
-        if self.auto_embed:
-            return self.format_auto_embed_on(event)
-        else:
-            return self.format_auto_embed_off(event)
+def vw_cb_formatter(
+    context: base.Featurized, actions: List[base.Featurized], selected: PickBestSelected
+) -> str:
+    nactions = len(actions)
+    context_str = f"shared {VwTxt.featurized_2_str(context)}"
+    labels = ["" for _ in range(nactions)]
+    if selected.score is not None:
+        labels[
+            selected.index
+        ] = f"{selected.index}:{-selected.score}:{selected.probability} "
+    actions_str = [f"{l}{VwTxt.featurized_2_str(a)}" for a, l in zip(actions, labels)]
+    return "\n".join([context_str] + actions_str)


 class PickBestRandomPolicy(base.Policy[PickBestEvent]):
-    def __init__(self):
-        ...
-
     def predict(self, event: PickBestEvent) -> List[Tuple[int, float]]:
         num_items = len(event.to_select_from)
         return [(i, 1.0 / num_items) for i in range(num_items)]
@@ -294,19 +258,12 @@ def _call_after_predict_before_scoring(
         sampled_ap = prediction[sampled_index]
         sampled_action = sampled_ap[0]
         sampled_prob = sampled_ap[1]
-        selected = PickBestSelected(index=sampled_action, probability=sampled_prob)
-        event.selected = selected
+        event.selected = PickBestSelected(
+            index=sampled_action, probability=sampled_prob
+        )

         next_inputs = inputs.copy()

-        # only one key, value pair in event.to_select_from
-        value = next(iter(event.to_select_from.values()))
-        v = (
-            value[event.selected.index]
-            if event.selected
-            else event.to_select_from.values()
-        )
-
         picked = {}
         for k, v in event.to_select_from.items():
             picked[k] = v[event.selected.index]
@@ -363,13 +320,14 @@ def create(
     @staticmethod
     def create_policy(
         featurizer: Optional[base.Featurizer] = None,
+        formatter: Optional[Callable] = None,
         vw_cmd: Optional[List[str]] = None,
        model_save_dir: str = "./",
         reset_model: bool = False,
         rl_logs: Optional[Union[str, os.PathLike]] = None,
     ):
-        if not featurizer:
-            featurizer = PickBestFeaturizer(auto_embed=False)
+        featurizer = featurizer or PickBestFeaturizer(auto_embed=False)
+        formatter = formatter or vw_cb_formatter

         vw_cmd = vw_cmd or []
         interactions = []
@@ -397,6 +355,7 @@ def create_policy(
             ),
             vw_cmd=vw_cmd,
             featurizer=featurizer,
+            formatter=formatter,
             vw_logger=base.VwLogger(rl_logs),
         )

diff --git a/tests/unit_tests/test_pick_best_call.py b/tests/unit_tests/test_pick_best_call.py
index c2ef16e..d35d4b2 100644
--- a/tests/unit_tests/test_pick_best_call.py
+++ b/tests/unit_tests/test_pick_best_call.py
@@ -5,6 +5,7 @@
 import learn_to_pick
 import learn_to_pick.base as rl_loop
+from learn_to_pick.pick_best import vw_cb_formatter

 encoded_keyword = "[encoded]"
@@ -161,15 +162,19 @@ def test_everything_embedded() -> None:
     str1 = "0"
     str2 = "1"
     str3 = "2"
-    encoded_str1 = rl_loop._stringify_embedding(list(encoded_keyword + str1))
-    encoded_str2 = rl_loop._stringify_embedding(list(encoded_keyword + str2))
-    encoded_str3 = rl_loop._stringify_embedding(list(encoded_keyword + str3))
+    action_dense = "0:1.0 1:0.0"

     ctx_str_1 = "context1"
+    encoded_ctx_str_1 = "0:8.0 1:0.0"

-    encoded_ctx_str_1 = rl_loop._stringify_embedding(list(encoded_keyword + ctx_str_1))
-
-    expected = f"""shared |User {ctx_str_1 + " " + encoded_ctx_str_1} \n|action {str1 + " " + encoded_str1} \n|action {str2 + " " + encoded_str2} \n|action {str3 + " " + encoded_str3} """  # noqa
+    expected = "\n".join(
+        [
+            f"shared |User_dense {encoded_ctx_str_1} |User_sparse default_ft:={ctx_str_1}",
+            f"|action_dense {action_dense} |action_sparse default_ft:={str1}",
+            f"|action_dense {action_dense} |action_sparse default_ft:={str2}",
+            f"|action_dense {action_dense} |action_sparse default_ft:={str3}",
+        ]
+    )  # noqa

     actions = [str1, str2, str3]

@@ -178,7 +183,7 @@ def test_everything_embedded() -> None:
         action=rl_loop.EmbedAndKeep(learn_to_pick.ToSelectFrom(actions)),
     )
     picked_metadata = response["picked_metadata"]  # type: ignore
-    vw_str = featurizer.format(picked_metadata)  # type: ignore
+    vw_str = vw_cb_formatter(*featurizer.featurize(picked_metadata))  # type: ignore
     assert_vw_ex_equals(vw_str, expected)

@@ -191,7 +196,14 @@ def test_default_auto_embedder_is_off() -> None:
     str3 = "2"
     ctx_str_1 = "context1"

-    expected = f"""shared |User {ctx_str_1} \n|action {str1} \n|action {str2} \n|action {str3} """  # noqa
+    expected = "\n".join(
+        [
+            f"shared |User_sparse default_ft:={ctx_str_1}",
+            f"|action_sparse default_ft:={str1}",
+            f"|action_sparse default_ft:={str2}",
+            f"|action_sparse default_ft:={str3}",
+        ]
+    )  # noqa

     actions = [str1, str2, str3]

@@ -200,7 +212,7 @@ def test_default_auto_embedder_is_off() -> None:
         action=learn_to_pick.base.ToSelectFrom(actions),
     )
     picked_metadata = response["picked_metadata"]  # type: ignore
-    vw_str = featurizer.format(picked_metadata)  # type: ignore
+    vw_str = vw_cb_formatter(*featurizer.featurize(picked_metadata))  # type: ignore
     assert_vw_ex_equals(vw_str, expected)

@@ -213,7 +225,14 @@ def test_default_w_embeddings_off() -> None:
     str3 = "2"
     ctx_str_1 = "context1"

-    expected = f"""shared |User {ctx_str_1} \n|action {str1} \n|action {str2} \n|action {str3} """  # noqa
+    expected = "\n".join(
+        [
+            f"shared |User_sparse default_ft:={ctx_str_1}",
+            f"|action_sparse default_ft:={str1}",
+            f"|action_sparse default_ft:={str2}",
+            f"|action_sparse default_ft:={str3}",
+        ]
+    )  # noqa

     actions = [str1, str2, str3]

@@ -222,7 +241,7 @@ def test_default_w_embeddings_off() -> None:
         action=learn_to_pick.ToSelectFrom(actions),
     )
     picked_metadata = response["picked_metadata"]  # type: ignore
-    vw_str = featurizer.format(picked_metadata)  # type: ignore
+    vw_str = vw_cb_formatter(*featurizer.featurize(picked_metadata))  # type: ignore
     assert_vw_ex_equals(vw_str, expected)

@@ -235,9 +254,15 @@ def test_default_w_embeddings_on() -> None:
     str1 = "0"
     str2 = "1"
     ctx_str_1 = "context1"
-    dot_prod = "dotprod 0:5.0"  # dot prod of [1.0, 2.0] and [1.0, 2.0]
+    dot_prod = "dotprod_sparse User_action:5.0"  # dot prod of [1.0, 2.0] and [1.0, 2.0]

-    expected = f"""shared |User {ctx_str_1} |@ User={ctx_str_1}\n|action {str1} |# action={str1} |{dot_prod}\n|action {str2} |# action={str2} |{dot_prod}"""  # noqa
+    expected = "\n".join(
+        [
+            f"shared |User_sparse default_ft:={ctx_str_1} |@_sparse User:={ctx_str_1}",
+            f"|action_sparse default_ft:={str1} |{dot_prod} |#_sparse action:={str1} ",
+            f"|action_sparse default_ft:={str2} |{dot_prod} |#_sparse action:={str2} ",
+        ]
+    )  # noqa

     actions = [str1, str2]

@@ -246,7 +271,7 @@ def test_default_w_embeddings_on() -> None:
         action=learn_to_pick.ToSelectFrom(actions),
     )
     picked_metadata = response["picked_metadata"]  # type: ignore
-    vw_str = featurizer.format(picked_metadata)  # type: ignore
+    vw_str = vw_cb_formatter(*featurizer.featurize(picked_metadata))  # type: ignore
     assert_vw_ex_equals(vw_str, expected)

diff --git a/tests/unit_tests/test_pick_best_text_embedder.py b/tests/unit_tests/test_pick_best_text_embedder.py
index 46c4f9e..b5aafd8 100644
--- a/tests/unit_tests/test_pick_best_text_embedder.py
+++ b/tests/unit_tests/test_pick_best_text_embedder.py
@@ -3,11 +3,10 @@
 import learn_to_pick.base as rl_chain
 import learn_to_pick.pick_best as pick_best_chain
+from learn_to_pick.pick_best import vw_cb_formatter

-encoded_keyword = "[encoded]"
-
-def test_pickbest_textembedder_missing_context_throws() -> None:
+def test_pickbest_textembedder_missing_context_not_throws() -> None:
     featurizer = pick_best_chain.PickBestFeaturizer(
         auto_embed=False, model=MockEncoder()
     )
@@ -15,8 +14,7 @@
     event = pick_best_chain.PickBestEvent(
         inputs={}, to_select_from=named_action, based_on={}
     )
-    with pytest.raises(ValueError):
-        featurizer.format(event)
+    featurizer.featurize(event)


 def test_pickbest_textembedder_missing_actions_throws() -> None:
@@ -27,19 +25,27 @@ def test_pickbest_textembedder_missing_actions_throws() -> None:
         inputs={}, to_select_from={}, based_on={"context": "context"}
     )
     with pytest.raises(ValueError):
-        featurizer.format(event)
+        featurizer.featurize(event)


 def test_pickbest_textembedder_no_label_no_emb() -> None:
     featurizer = pick_best_chain.PickBestFeaturizer(
         auto_embed=False, model=MockEncoder()
     )
-    named_actions = {"action1": ["0", "1", "2"]}
-    expected = """shared |context context \n|action1 0 \n|action1 1 \n|action1 2 """
+    named_actions = {"action": ["0", "1", "2"]}
+    expected = "\n".join(
+        [
+            "shared |context_sparse default_ft:=context",
+            "|action_sparse default_ft:=0",
+            "|action_sparse default_ft:=1",
+            "|action_sparse default_ft:=2",
+        ]
+    )
+
     event = pick_best_chain.PickBestEvent(
         inputs={}, to_select_from=named_actions, based_on={"context": "context"}
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected)

@@ -47,8 +53,15 @@ def test_pickbest_textembedder_w_label_no_score_no_emb() -> None:
     featurizer = pick_best_chain.PickBestFeaturizer(
         auto_embed=False, model=MockEncoder()
     )
-    named_actions = {"action1": ["0", "1", "2"]}
-    expected = """shared |context context \n|action1 0 \n|action1 1 \n|action1 2 """
+    named_actions = {"action": ["0", "1", "2"]}
+    expected = "\n".join(
+        [
+            "shared |context_sparse default_ft:=context",
+            "|action_sparse default_ft:=0",
+            "|action_sparse default_ft:=1",
+            "|action_sparse default_ft:=2",
+        ]
+    )
     selected = pick_best_chain.PickBestSelected(index=0, probability=1.0)
     event = pick_best_chain.PickBestEvent(
         inputs={},
@@ -56,7 +69,7 @@ def test_pickbest_textembedder_w_label_no_score_no_emb() -> None:
         based_on={"context": "context"},
         selected=selected,
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected)

@@ -64,10 +77,16 @@ def test_pickbest_textembedder_w_full_label_no_emb() -> None:
     featurizer = pick_best_chain.PickBestFeaturizer(
         auto_embed=False, model=MockEncoder()
     )
-    named_actions = {"action1": ["0", "1", "2"]}
-    expected = (
-        """shared |context context \n0:-0.0:1.0 |action1 0 \n|action1 1 \n|action1 2 """
+    named_actions = {"action": ["0", "1", "2"]}
+    expected = "\n".join(
+        [
+            "shared |context_sparse default_ft:=context",
+            "0:-0.0:1.0 |action_sparse default_ft:=0",
+            "|action_sparse default_ft:=1",
+            "|action_sparse default_ft:=2",
+        ]
     )
+
     selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
     event = pick_best_chain.PickBestEvent(
         inputs={},
@@ -75,7 +94,7 @@ def test_pickbest_textembedder_w_full_label_no_emb() -> None:
         based_on={"context": "context"},
         selected=selected,
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected)

@@ -86,21 +105,25 @@ def test_pickbest_textembedder_w_full_label_w_emb() -> None:
     str1 = "0"
     str2 = "1"
     str3 = "2"
-    encoded_str1 = rl_chain._stringify_embedding(list(encoded_keyword + str1))
-    encoded_str2 = rl_chain._stringify_embedding(list(encoded_keyword + str2))
-    encoded_str3 = rl_chain._stringify_embedding(list(encoded_keyword + str3))

-    ctx_str_1 = "context1"
-    encoded_ctx_str_1 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_1))
+    ctx_str = "ctx"
+    encoded_ctx_str = "0:3.0 1:0.0"

-    named_actions = {"action1": rl_chain.Embed([str1, str2, str3])}
-    context = {"context": rl_chain.Embed(ctx_str_1)}
-    expected = f"""shared |context {encoded_ctx_str_1} \n0:-0.0:1.0 |action1 {encoded_str1} \n|action1 {encoded_str2} \n|action1 {encoded_str3} """  # noqa: E501
+    named_actions = {"action": rl_chain.Embed([str1, str2, str3])}
+    context = {"context": rl_chain.Embed(ctx_str)}
+    expected = "\n".join(
+        [
+            f"shared |context_dense {encoded_ctx_str}",
+            "0:-0.0:1.0 |action_dense 0:1.0 1:0.0",
+            "|action_dense 0:1.0 1:0.0",
+            "|action_dense 0:1.0 1:0.0",
+        ]
+    )  # noqa: E501

     selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
     event = pick_best_chain.PickBestEvent(
         inputs={}, to_select_from=named_actions, based_on=context, selected=selected
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected)

@@ -111,21 +134,25 @@ def test_pickbest_textembedder_w_full_label_w_embed_and_keep() -> None:
     str1 = "0"
     str2 = "1"
     str3 = "2"
-    encoded_str1 = rl_chain._stringify_embedding(list(encoded_keyword + str1))
-    encoded_str2 = rl_chain._stringify_embedding(list(encoded_keyword + str2))
-    encoded_str3 = rl_chain._stringify_embedding(list(encoded_keyword + str3))

-    ctx_str_1 = "context1"
-    encoded_ctx_str_1 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_1))
+    ctx_str = "ctx"
+    encoded_ctx_str = "0:3.0 1:0.0"

-    named_actions = {"action1": rl_chain.EmbedAndKeep([str1, str2, str3])}
-    context = {"context": rl_chain.EmbedAndKeep(ctx_str_1)}
-    expected = f"""shared |context {ctx_str_1 + " " + encoded_ctx_str_1} \n0:-0.0:1.0 |action1 {str1 + " " + encoded_str1} \n|action1 {str2 + " " + encoded_str2} \n|action1 {str3 + " " + encoded_str3} """  # noqa: E501
+    named_actions = {"action": rl_chain.EmbedAndKeep([str1, str2, str3])}
+    context = {"context": rl_chain.EmbedAndKeep(ctx_str)}
+    expected = "\n".join(
+        [
+            f"shared |context_dense {encoded_ctx_str} |context_sparse default_ft:={ctx_str}",
+            "0:-0.0:1.0 |action_dense 0:1.0 1:0.0 |action_sparse default_ft:=0",
+            "|action_dense 0:1.0 1:0.0 |action_sparse default_ft:=1",
+            "|action_dense 0:1.0 1:0.0 |action_sparse default_ft:=2",
+        ]
+    )  # noqa: E501

     selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
     event = pick_best_chain.PickBestEvent(
         inputs={}, to_select_from=named_actions, based_on=context, selected=selected
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected)

@@ -135,11 +162,18 @@ def test_pickbest_textembedder_more_namespaces_no_label_no_emb() -> None:
     )
     named_actions = {"action1": [{"a": "0", "b": "0"}, "1", "2"]}
     context = {"context1": "context1", "context2": "context2"}
-    expected = """shared |context1 context1 |context2 context2 \n|a 0 |b 0 \n|action1 1 \n|action1 2 """  # noqa: E501
+    expected = "\n".join(
+        [
+            "shared |context1_sparse default_ft:=context1 |context2_sparse default_ft:=context2 ",
+            "|a_sparse default_ft:=0 |b_sparse default_ft:=0",
+            "|action1_sparse default_ft:=1",
+            "|action1_sparse default_ft:=2",
+        ]
+    )  # noqa: E501
     event = pick_best_chain.PickBestEvent(
         inputs={}, to_select_from=named_actions, based_on=context
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected)

@@ -147,14 +181,21 @@ def test_pickbest_textembedder_more_namespaces_w_label_no_emb() -> None:
     featurizer = pick_best_chain.PickBestFeaturizer(
         auto_embed=False, model=MockEncoder()
     )
-    named_actions = {"action1": [{"a": "0", "b": "0"}, "1", "2"]}
+    named_actions = {"action": [{"a": "0", "b": "0"}, "1", "2"]}
     context = {"context1": "context1", "context2": "context2"}
-    expected = """shared |context1 context1 |context2 context2 \n|a 0 |b 0 \n|action1 1 \n|action1 2 """  # noqa: E501
+    expected = "\n".join(
+        [
+            "shared |context1_sparse default_ft:=context1 |context2_sparse default_ft:=context2",
+            "|a_sparse default_ft:=0 |b_sparse default_ft:=0",
+            "|action_sparse default_ft:=1",
+            "|action_sparse default_ft:=2",
+        ]
+    )  # noqa: E501
     selected = pick_best_chain.PickBestSelected(index=0, probability=1.0)
     event = pick_best_chain.PickBestEvent(
         inputs={}, to_select_from=named_actions, based_on=context, selected=selected
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected)

@@ -162,14 +203,21 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_no_emb() -> None:
     featurizer = pick_best_chain.PickBestFeaturizer(
         auto_embed=False, model=MockEncoder()
     )
-    named_actions = {"action1": [{"a": "0", "b": "0"}, "1", "2"]}
+    named_actions = {"action": [{"a": "0", "b": "0"}, "1", "2"]}
     context = {"context1": "context1", "context2": "context2"}
-    expected = """shared |context1 context1 |context2 context2 \n0:-0.0:1.0 |a 0 |b 0 \n|action1 1 \n|action1 2 """  # noqa: E501
+    expected = "\n".join(
+        [
+            "shared |context1_sparse default_ft:=context1 |context2_sparse default_ft:=context2",
+            "0:-0.0:1.0 |a_sparse default_ft:=0 |b_sparse default_ft:=0",
+            "|action_sparse default_ft:=1",
+            "|action_sparse default_ft:=2",
+        ]
+    )  # noqa: E501
     selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
     event = pick_best_chain.PickBestEvent(
         inputs={}, to_select_from=named_actions, based_on=context, selected=selected
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected)

@@ -181,27 +229,31 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_full_emb() -> None
     str1 = "0"
     str2 = "1"
     str3 = "2"
-    encoded_str1 = rl_chain._stringify_embedding(list(encoded_keyword + str1))
-    encoded_str2 = rl_chain._stringify_embedding(list(encoded_keyword + str2))
-    encoded_str3 = rl_chain._stringify_embedding(list(encoded_keyword + str3))

-    ctx_str_1 = "context1"
-    ctx_str_2 = "context2"
-    encoded_ctx_str_1 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_1))
-    encoded_ctx_str_2 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_2))
+    ctx_str_1 = "ctx"
+    ctx_str_2 = "ctx_"
+    encoded_ctx_str_1 = "0:3.0 1:0.0"
+    encoded_ctx_str_2 = "0:4.0 1:0.0"

-    named_actions = {"action1": rl_chain.Embed([{"a": str1, "b": str1}, str2, str3])}
+    named_actions = {"action": rl_chain.Embed([{"a": str1, "b": str1}, str2, str3])}
     context = {
         "context1": rl_chain.Embed(ctx_str_1),
         "context2": rl_chain.Embed(ctx_str_2),
     }
-    expected = f"""shared |context1 {encoded_ctx_str_1} |context2 {encoded_ctx_str_2} \n0:-0.0:1.0 |a {encoded_str1} |b {encoded_str1} \n|action1 {encoded_str2} \n|action1 {encoded_str3} """  # noqa: E501
+    expected = "\n".join(
+        [
+            f"shared |context1_dense {encoded_ctx_str_1} |context2_dense {encoded_ctx_str_2}",
+            f"0:-0.0:1.0 |a_dense 0:1.0 1:0.0 |b_dense 0:1.0 1:0.0",
+            f"|action_dense 0:1.0 1:0.0",
+            f"|action_dense 0:1.0 1:0.0",
+        ]
+    )  # noqa: E501

     selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
     event = pick_best_chain.PickBestEvent(
         inputs={}, to_select_from=named_actions, based_on=context, selected=selected
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected)

@@ -215,29 +267,33 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_full_embed_and_kee
     str1 = "0"
     str2 = "1"
     str3 = "2"
-    encoded_str1 = rl_chain._stringify_embedding(list(encoded_keyword + str1))
-    encoded_str2 = rl_chain._stringify_embedding(list(encoded_keyword + str2))
-    encoded_str3 = rl_chain._stringify_embedding(list(encoded_keyword + str3))

-    ctx_str_1 = "context1"
-    ctx_str_2 = "context2"
-    encoded_ctx_str_1 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_1))
-    encoded_ctx_str_2 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_2))
+    ctx_str_1 = "ctx"
+    ctx_str_2 = "ctx_"
+    encoded_ctx_str_1 = "0:3.0 1:0.0"
+    encoded_ctx_str_2 = "0:4.0 1:0.0"

     named_actions = {
-        "action1": rl_chain.EmbedAndKeep([{"a": str1, "b": str1}, str2, str3])
+        "action": rl_chain.EmbedAndKeep([{"a": str1, "b": str1}, str2, str3])
     }
     context = {
         "context1": rl_chain.EmbedAndKeep(ctx_str_1),
         "context2": rl_chain.EmbedAndKeep(ctx_str_2),
     }
-    expected = f"""shared |context1 {ctx_str_1 + " " + encoded_ctx_str_1} |context2 {ctx_str_2 + " " + encoded_ctx_str_2} \n0:-0.0:1.0 |a {str1 + " " + encoded_str1} |b {str1 + " " + encoded_str1} \n|action1 {str2 + " " + encoded_str2} \n|action1 {str3 + " " + encoded_str3} """  # noqa: E501
+    expected = "\n".join(
+        [
+            f"shared |context1_dense {encoded_ctx_str_1} |context2_dense {encoded_ctx_str_2} |context1_sparse default_ft:={ctx_str_1} |context2_sparse default_ft:={ctx_str_2}",
+            f"0:-0.0:1.0 |a_dense 0:1.0 1:0.0 |b_dense 0:1.0 1:0.0 |a_sparse default_ft:=0 |b_sparse default_ft:=0",
+            f"|action_dense 0:1.0 1:0.0 |action_sparse default_ft:=1",
+            f"|action_dense 0:1.0 1:0.0 |action_sparse default_ft:=2",
+        ]
+    )  # noqa: E501

     selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
     event = pick_best_chain.PickBestEvent(
         inputs={}, to_select_from=named_actions, based_on=context, selected=selected
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected)

@@ -249,24 +305,30 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_partial_emb() -> N
     str1 = "0"
     str2 = "1"
     str3 = "2"
-    encoded_str1 = rl_chain._stringify_embedding(list(encoded_keyword + str1))
-    encoded_str3 = rl_chain._stringify_embedding(list(encoded_keyword + str3))

-    ctx_str_1 = "context1"
-    ctx_str_2 = "context2"
-    encoded_ctx_str_2 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_2))
+    ctx_str_1 = "ctx"
+    ctx_str_2 = "ctx_"
+    encoded_ctx_str_2 = "0:4.0 1:0.0"

     named_actions = {
-        "action1": [{"a": str1, "b": rl_chain.Embed(str1)}, str2, rl_chain.Embed(str3)]
+        "action": [{"a": str1, "b": rl_chain.Embed(str1)}, str2, rl_chain.Embed(str3)]
     }
     context = {"context1": ctx_str_1, "context2": rl_chain.Embed(ctx_str_2)}
-    expected = f"""shared |context1 {ctx_str_1} |context2 {encoded_ctx_str_2} \n0:-0.0:1.0 |a {str1} |b {encoded_str1} \n|action1 {str2} \n|action1 {encoded_str3} """  # noqa: E501
+
+    expected = "\n".join(
+        [
+            f"shared |context2_dense {encoded_ctx_str_2} |context1_sparse default_ft:={ctx_str_1}",
+            f"0:-0.0:1.0 |b_dense 0:1.0 1:0.0 |a_sparse default_ft:=0",
+            f"|action_sparse default_ft:=1",
+            f"|action_dense 0:1.0 1:0.0",
+        ]
+    )  # noqa: E501

     selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
     event = pick_best_chain.PickBestEvent(
         inputs={}, to_select_from=named_actions, based_on=context, selected=selected
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected)

@@ -278,28 +340,32 @@ def test_pickbest_textembedder_more_namespaces_w_full_label_w_partial_emakeep()
     str1 = "0"
    str2 = "1"
     str3 = "2"
-    encoded_str1 = rl_chain._stringify_embedding(list(encoded_keyword + str1))
-    encoded_str3 = rl_chain._stringify_embedding(list(encoded_keyword + str3))

-    ctx_str_1 = "context1"
-    ctx_str_2 = "context2"
-    encoded_ctx_str_2 = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str_2))
+    ctx_str_1 = "ctx"
+    ctx_str_2 = "ctx_"
+    encoded_ctx_str_2 = "0:4.0 1:0.0"

     named_actions = {
-        "action1": [
+        "action": [
             {"a": str1, "b": rl_chain.EmbedAndKeep(str1)},
             str2,
             rl_chain.EmbedAndKeep(str3),
         ]
     }
     context = {"context1": ctx_str_1, "context2": rl_chain.EmbedAndKeep(ctx_str_2)}
-    expected = f"""shared |context1 {ctx_str_1} |context2 {ctx_str_2 + " " + encoded_ctx_str_2} \n0:-0.0:1.0 |a {str1} |b {str1 + " " + encoded_str1} \n|action1 {str2} \n|action1 {str3 + " " + encoded_str3} """  # noqa: E501
-
+    expected = "\n".join(
+        [
+            f"shared |context2_dense {encoded_ctx_str_2} |context1_sparse default_ft:={ctx_str_1} |context2_sparse default_ft:={ctx_str_2}",
+            f"0:-0.0:1.0 |b_dense 0:1.0 1:0.0 |a_sparse default_ft:=0 |b_sparse default_ft:=0",
+            f"|action_sparse default_ft:=1",
+            f"|action_dense 0:1.0 1:0.0 |action_sparse default_ft:=2",
+        ]
+    )  # noqa: E501
     selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
     event = pick_best_chain.PickBestEvent(
         inputs={}, to_select_from=named_actions, based_on=context, selected=selected
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected)

@@ -309,40 +375,51 @@ def test_raw_features_underscored() -> None:
     )
     str1 = "this is a long string"
     str1_underscored = str1.replace(" ", "_")
-    encoded_str1 = rl_chain._stringify_embedding(list(encoded_keyword + str1))
+    encoded_str1 = f"0:{float(len(str1))} 1:0.0"

     ctx_str = "this is a long context"
     ctx_str_underscored = ctx_str.replace(" ", "_")
-    encoded_ctx_str = rl_chain._stringify_embedding(list(encoded_keyword + ctx_str))
+    encoded_ctx_str = f"0:{float(len(ctx_str))} 1:0.0"

     # No embeddings
     named_actions = {"action": [str1]}
     context = {"context": ctx_str}
-    expected_no_embed = (
-        f"""shared |context {ctx_str_underscored} \n|action {str1_underscored} """
+    expected_no_embed = "\n".join(
+        [
+            f"shared |context_sparse default_ft:={ctx_str_underscored}",
+            f"|action_sparse default_ft:={str1_underscored}",
+        ]
     )
+
     event = pick_best_chain.PickBestEvent(
         inputs={}, to_select_from=named_actions, based_on=context
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected_no_embed)

     # Just embeddings
     named_actions = {"action": rl_chain.Embed([str1])}
     context = {"context": rl_chain.Embed(ctx_str)}
-    expected_embed = f"""shared |context {encoded_ctx_str} \n|action {encoded_str1} """
+    expected_embed = "\n".join(
+        [f"shared |context_dense {encoded_ctx_str}", f"|action_dense {encoded_str1}"]
    )
     event = pick_best_chain.PickBestEvent(
         inputs={}, to_select_from=named_actions, based_on=context
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected_embed)

     # Embeddings and raw features
     named_actions = {"action": rl_chain.EmbedAndKeep([str1])}
     context = {"context": rl_chain.EmbedAndKeep(ctx_str)}
-    expected_embed_and_keep = f"""shared |context {ctx_str_underscored + " " + encoded_ctx_str} \n|action {str1_underscored + " " + encoded_str1} """  # noqa: E501
+    expected_embed_and_keep = "\n".join(
+        [
+            f"shared |context_dense {encoded_ctx_str} |context_sparse default_ft:={ctx_str_underscored}",
+            f"|action_dense {encoded_str1} |action_sparse default_ft:={str1_underscored}",
+        ]
+    )  # noqa: E501
     event = pick_best_chain.PickBestEvent(
         inputs={}, to_select_from=named_actions, based_on=context
     )
-    vw_ex_str = featurizer.format(event)
+    vw_ex_str = vw_cb_formatter(*featurizer.featurize(event))
     assert_vw_ex_equals(vw_ex_str, expected_embed_and_keep)

diff --git a/tests/unit_tests/test_rl_loop_base_embedder.py b/tests/unit_tests/test_rl_loop_base_embedder.py
index af2e2b5..18a259a 100644
--- a/tests/unit_tests/test_rl_loop_base_embedder.py
+++ b/tests/unit_tests/test_rl_loop_base_embedder.py
@@ -5,368 +5,411 @@
 import learn_to_pick.base as base

-encoded_keyword = "[encoded]"
-

 def test_simple_context_str_no_emb() -> None:
-    expected = [{"a_namespace": "test"}]
-    assert base.embed("test", MockEncoder(), "a_namespace") == expected
+    expected = {"a_namespace": {"default_ft": "test"}}
+
+    featurized = base.embed("test", MockEncoder(), "a_namespace")
+    assert featurized.sparse == expected
+    assert featurized.dense == {}


 def test_simple_context_str_w_emb() -> None:
     str1 = "test"
-    encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1))
-    expected = [{"a_namespace": encoded_str1}]
-    assert base.embed(base.Embed(str1), MockEncoder(), "a_namespace") == expected
-    expected_embed_and_keep = [{"a_namespace": str1 + " " + encoded_str1}]
-    assert (
-        base.embed(base.EmbedAndKeep(str1), MockEncoder(), "a_namespace")
-        == expected_embed_and_keep
-    )
+    expected_dense = {"a_namespace": [4.0, 0.0]}
+    expected_sparse = {"a_namespace": {"default_ft": str1}}
+
+    featurized = base.embed(base.Embed(str1), MockEncoder(), "a_namespace")
+    assert featurized.dense == expected_dense
+    assert featurized.sparse == {}
+
+    featurized = base.embed(base.EmbedAndKeep(str1), MockEncoder(), "a_namespace")
+    assert featurized.sparse == expected_sparse
+    assert featurized.dense == expected_dense


 def test_simple_context_str_w_nested_emb() -> None:
     # nested embeddings, innermost wins
     str1 = "test"
-    encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1))
-    expected = [{"a_namespace": encoded_str1}]
-    assert (
-        base.embed(base.EmbedAndKeep(base.Embed(str1)), MockEncoder(), "a_namespace")
-        == expected
+    expected_dense = {"a_namespace": [4.0, 0.0]}
+    expected_sparse = {"a_namespace": {"default_ft": str1}}
+
+    featurized = base.embed(
+        base.EmbedAndKeep(base.Embed(str1)), MockEncoder(), "a_namespace"
     )
+    assert featurized.dense == expected_dense
+    assert featurized.sparse == {}

-    expected2 = [{"a_namespace": str1 + " " + encoded_str1}]
-    assert (
-        base.embed(base.Embed(base.EmbedAndKeep(str1)), MockEncoder(), "a_namespace")
-        == expected2
+    featurized = base.embed(
+        base.Embed(base.EmbedAndKeep(str1)), MockEncoder(), "a_namespace"
     )
+    assert featurized.sparse == expected_sparse
+    assert featurized.dense == expected_dense


 def test_context_w_namespace_no_emb() -> None:
-    expected = [{"test_namespace": "test"}]
-    assert base.embed({"test_namespace": "test"}, MockEncoder()) == expected
+    expected_sparse = {"test_namespace": {"default_ft": "test"}}
+    featurized = base.embed({"test_namespace": "test"}, MockEncoder())
+    assert featurized.sparse == expected_sparse
+    assert featurized.dense == {}


 def test_context_w_namespace_w_emb() -> None:
     str1 = "test"
-    encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1))
-    expected = [{"test_namespace": encoded_str1}]
-    assert base.embed({"test_namespace": base.Embed(str1)}, MockEncoder()) == expected
-    expected_embed_and_keep = [{"test_namespace": str1 + " " + encoded_str1}]
-    assert (
-        base.embed({"test_namespace": base.EmbedAndKeep(str1)}, MockEncoder())
-        == expected_embed_and_keep
-    )
+    expected_sparse = {"test_namespace": {"default_ft": str1}}
+    expected_dense = {"test_namespace": [4.0, 0.0]}
+
+    featurized = base.embed({"test_namespace": base.Embed(str1)}, MockEncoder())
+    assert featurized.sparse == {}
+    assert featurized.dense == expected_dense
+
+    featurized = base.embed({"test_namespace": base.EmbedAndKeep(str1)}, MockEncoder())
+    assert featurized.sparse == expected_sparse
+    assert featurized.dense == expected_dense


 def test_context_w_namespace_w_emb2() -> None:
     str1 = "test"
-    encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1))
-    expected = [{"test_namespace": encoded_str1}]
-    assert base.embed(base.Embed({"test_namespace": str1}), MockEncoder()) == expected
-    expected_embed_and_keep = [{"test_namespace": str1 + " " + encoded_str1}]
-    assert (
-        base.embed(base.EmbedAndKeep({"test_namespace": str1}), MockEncoder())
-        == expected_embed_and_keep
-    )
+    expected_sparse = {"test_namespace": {"default_ft": str1}}
+    expected_dense = {"test_namespace": [4.0, 0.0]}
+
+    featurized = base.embed(base.Embed({"test_namespace": str1}), MockEncoder())
+    assert featurized.sparse == {}
+    assert featurized.dense == expected_dense
+
+    featurized = base.embed(base.EmbedAndKeep({"test_namespace": str1}), MockEncoder())
+    assert featurized.sparse == expected_sparse
+    assert featurized.dense == expected_dense


 def test_context_w_namespace_w_some_emb() -> None:
-    str1 = "test1"
-    str2 = "test2"
-    encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2))
-    expected = [{"test_namespace": str1, "test_namespace2": encoded_str2}]
-    assert (
-        base.embed(
-            {"test_namespace": str1, "test_namespace2": base.Embed(str2)}, MockEncoder()
-        )
-        == expected
+    str1 = "test"
+    str2 = "test_"
+    expected_sparse = {"test_namespace": {"default_ft": str1}}
+    expected_dense = {"test_namespace2": [5.0, 0.0]}
+    featurized = base.embed(
+        {"test_namespace": str1, "test_namespace2": base.Embed(str2)}, MockEncoder()
     )
-    expected_embed_and_keep = [
-        {"test_namespace": str1, "test_namespace2": str2 + " " + encoded_str2}
-    ]
-    assert (
-        base.embed(
-            {"test_namespace": str1, "test_namespace2": base.EmbedAndKeep(str2)},
-            MockEncoder(),
-        )
-        == expected_embed_and_keep
+    assert featurized.sparse == expected_sparse
+    assert featurized.dense == expected_dense
+
+    expected_sparse = {
+        "test_namespace": {"default_ft": str1},
+        "test_namespace2": {"default_ft": str2},
+    }
+    featurized = base.embed(
+        {"test_namespace": str1, "test_namespace2": base.EmbedAndKeep(str2)},
+        MockEncoder(),
     )
+    assert featurized.sparse == expected_sparse
+    assert featurized.dense == expected_dense


 def test_simple_action_strlist_no_emb() -> None:
     str1 = "test1"
     str2 = "test2"
     str3 = "test3"
-    expected = [{"a_namespace": str1}, {"a_namespace": str2}, {"a_namespace": str3}]
+    expected_sparse = [
+        {"a_namespace": {"default_ft": str1}},
+        {"a_namespace": {"default_ft": str2}},
+        {"a_namespace": {"default_ft": str3}},
+    ]
     to_embed: List[Union[str, base._Embed]] = [str1, str2, str3]
-    assert base.embed(to_embed, MockEncoder(), "a_namespace") == expected
+    featurized = base.embed(to_embed, MockEncoder(), "a_namespace")
+
+    for i in range(len(featurized)):
+        assert featurized[i].sparse == expected_sparse[i]
+        assert featurized[i].dense == {}


 def test_simple_action_strlist_w_emb() -> None:
-    str1 = "test1"
-    str2 = "test2"
-    str3 = "test3"
-    encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1))
-    encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2))
-    encoded_str3 = base._stringify_embedding(list(encoded_keyword + str3))
-    expected = [
-        {"a_namespace": encoded_str1},
-        {"a_namespace": encoded_str2},
-        {"a_namespace": encoded_str3},
+    str1 = "test"
+    str2 = "test_"
+    str3 = "test__"
+
+    expected_sparse = [
+        {"a_namespace": {"default_ft": str1}},
+        {"a_namespace": {"default_ft": str2}},
+        {"a_namespace": {"default_ft": str3}},
     ]
-    assert (
-        base.embed(base.Embed([str1, str2, str3]), MockEncoder(), "a_namespace")
-        == expected
-    )
-    expected_embed_and_keep = [
-        {"a_namespace": str1 + " " + encoded_str1},
-        {"a_namespace": str2 + " " + encoded_str2},
-        {"a_namespace": str3 + " " + encoded_str3},
+    expected_dense = [
+        {"a_namespace": [4.0, 0.0]},
+        {"a_namespace": [5.0, 0.0]},
+        {"a_namespace": [6.0, 0.0]},
     ]
-    assert (
-        base.embed(base.EmbedAndKeep([str1, str2, str3]), MockEncoder(), "a_namespace")
-        == expected_embed_and_keep
+
+    featurized = base.embed(
+        base.Embed([str1, str2, str3]), MockEncoder(), "a_namespace"
     )
+    for i in range(len(featurized)):
+        assert featurized[i].sparse == {}
+        assert featurized[i].dense == expected_dense[i]
+
+    featurized = base.embed(
+        base.EmbedAndKeep([str1, str2, str3]), MockEncoder(), "a_namespace"
+    )
+    for i in range(len(featurized)):
+        assert featurized[i].sparse == expected_sparse[i]
+        assert featurized[i].dense == expected_dense[i]


 def test_simple_action_strlist_w_some_emb() -> None:
-    str1 = "test1"
-    str2 = "test2"
-    str3 = "test3"
-    encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2))
-    encoded_str3 = base._stringify_embedding(list(encoded_keyword + str3))
-    expected = [
-        {"a_namespace": str1},
-        {"a_namespace": encoded_str2},
-        {"a_namespace": encoded_str3},
-    ]
-    assert (
-        base.embed(
-            [str1, base.Embed(str2), base.Embed(str3)], MockEncoder(), "a_namespace"
-        )
-        == expected
+    str1 = "test"
+    str2 = "test_"
+    str3 = "test__"
+
+    expected_sparse = [{"a_namespace": {"default_ft": str1}}, {}, {}]
+    expected_dense = [{}, {"a_namespace": [5.0, 0.0]}, {"a_namespace": [6.0, 0.0]}]
+    featurized = base.embed(
+        [str1, base.Embed(str2), base.Embed(str3)], MockEncoder(), "a_namespace"
     )
-    expected_embed_and_keep = [
-        {"a_namespace": str1},
-        {"a_namespace": str2 + " " + encoded_str2},
-        {"a_namespace": str3 + " " + encoded_str3},
-    ]
-    assert (
-        base.embed(
-            [str1, base.EmbedAndKeep(str2), base.EmbedAndKeep(str3)],
-            MockEncoder(),
-            "a_namespace",
-        )
-        == expected_embed_and_keep
+    for i in range(len(featurized)):
+        assert featurized[i].sparse == expected_sparse[i]
+        assert featurized[i].dense == expected_dense[i]
+
+    featurized = base.embed(
+        [str1, base.EmbedAndKeep(str2), base.EmbedAndKeep(str3)],
+        MockEncoder(),
+        "a_namespace",
     )
+    expected_sparse = [
+        {"a_namespace": {"default_ft": str1}},
+        {"a_namespace": {"default_ft": str2}},
+        {"a_namespace": {"default_ft": str3}},
+    ]
+    for i in range(len(featurized)):
+        assert featurized[i].sparse == expected_sparse[i]
+        assert featurized[i].dense == expected_dense[i]


 def test_action_w_namespace_no_emb() -> None:
     str1 = "test1"
     str2 = "test2"
     str3 = "test3"
-    expected = [
-        {"test_namespace": str1},
-        {"test_namespace": str2},
-        {"test_namespace": str3},
+    expected_sparse = [
+        {"test_namespace": {"default_ft": str1}},
+        {"test_namespace": {"default_ft": str2}},
+        {"test_namespace": {"default_ft": str3}},
     ]
-    assert (
-        base.embed(
-            [
-                {"test_namespace": str1},
-                {"test_namespace": str2},
-                {"test_namespace": str3},
-            ],
-            MockEncoder(),
-        )
-        == expected
+
+    featurized = base.embed(
+        [
+            {"test_namespace": str1},
+            {"test_namespace": str2},
+            {"test_namespace": str3},
+        ],
+        MockEncoder(),
     )
+    for i in range(len(featurized)):
+        assert featurized[i].sparse == expected_sparse[i]
+        assert featurized[i].dense == {}


 def test_action_w_namespace_w_emb() -> None:
-    str1 = "test1"
-    str2 = "test2"
-    str3 = "test3"
-    encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1))
-    encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2))
-    encoded_str3 = base._stringify_embedding(list(encoded_keyword + str3))
-    expected = [
-        {"test_namespace": encoded_str1},
-        {"test_namespace": encoded_str2},
-        {"test_namespace": encoded_str3},
+    str1 = "test"
+    str2 = "test_"
+    str3 = "test__"
+    expected_sparse = [
+        {"test_namespace": {"default_ft": str1}},
+        {"test_namespace": {"default_ft": str2}},
+        {"test_namespace": {"default_ft": str3}},
     ]
-    assert (
-        base.embed(
-            [
-                {"test_namespace": base.Embed(str1)},
-                {"test_namespace": base.Embed(str2)},
-                {"test_namespace": base.Embed(str3)},
-            ],
-            MockEncoder(),
-        )
-        == expected
-    )
-    expected_embed_and_keep = [
-        {"test_namespace": str1 + " " + encoded_str1},
-        {"test_namespace": str2 + " " + encoded_str2},
-        {"test_namespace": str3 + " " + encoded_str3},
+    expected_dense = [
+        {"test_namespace": [4.0, 0.0]},
+        {"test_namespace": [5.0, 0.0]},
+        {"test_namespace": [6.0, 0.0]},
     ]
-    assert (
-        base.embed(
-            [
-                {"test_namespace": base.EmbedAndKeep(str1)},
-                {"test_namespace": base.EmbedAndKeep(str2)},
-                {"test_namespace": base.EmbedAndKeep(str3)},
-            ],
-            MockEncoder(),
-        )
-        == expected_embed_and_keep
+
+    featurized = base.embed(
+        [
+            {"test_namespace": base.Embed(str1)},
+            {"test_namespace": base.Embed(str2)},
+            {"test_namespace": base.Embed(str3)},
+        ],
+        MockEncoder(),
+    )
+    for i in range(len(featurized)):
+        assert featurized[i].sparse == {}
+        assert featurized[i].dense == expected_dense[i]
+
+    featurized = base.embed(
+        [
+            {"test_namespace": base.EmbedAndKeep(str1)},
+            {"test_namespace": base.EmbedAndKeep(str2)},
+            {"test_namespace": base.EmbedAndKeep(str3)},
+        ],
+        MockEncoder(),
     )
+    for i in range(len(featurized)):
+        assert featurized[i].sparse == expected_sparse[i]
+        assert featurized[i].dense == expected_dense[i]


 def test_action_w_namespace_w_emb2() -> None:
-    str1 = "test1"
-    str2 = "test2"
-    str3 = "test3"
-    encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1))
-    encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2))
-    encoded_str3 = base._stringify_embedding(list(encoded_keyword + str3))
-    expected = [
-        {"test_namespace1": encoded_str1},
-        {"test_namespace2": encoded_str2},
-        {"test_namespace3": encoded_str3},
+    str1 = "test"
+    str2 = "test_"
+    str3 = "test__"
+    expected_sparse = [
+        {"test_namespace1": {"default_ft": str1}},
+        {"test_namespace2": {"default_ft": str2}},
+        {"test_namespace3": {"default_ft": str3}},
     ]
-    assert (
-        base.embed(
-            base.Embed(
-                [
-                    {"test_namespace1": str1},
-                    {"test_namespace2": str2},
-                    {"test_namespace3": str3},
-                ]
-            ),
-            MockEncoder(),
-        )
-        == expected
-    )
-    expected_embed_and_keep = [
-        {"test_namespace1": str1 + " " + encoded_str1},
-        {"test_namespace2": str2 + " " + encoded_str2},
-        {"test_namespace3": str3 + " " + encoded_str3},
+    expected_dense = [
+        {"test_namespace1": [4.0, 0.0]},
+        {"test_namespace2": [5.0, 0.0]},
+        {"test_namespace3": [6.0, 0.0]},
     ]
-    assert (
-        base.embed(
-            base.EmbedAndKeep(
-                [
-                    {"test_namespace1": str1},
-                    {"test_namespace2": str2},
-                    {"test_namespace3": str3},
-                ]
-            ),
-            MockEncoder(),
-        )
-        == expected_embed_and_keep
+
+    featurized = base.embed(
+        base.Embed(
+            [
+                {"test_namespace1": str1},
+                {"test_namespace2": str2},
+                {"test_namespace3": str3},
+            ]
+        ),
+        MockEncoder(),
+    )
+    for i in range(len(featurized)):
+        assert featurized[i].sparse == {}
+        assert featurized[i].dense == expected_dense[i]
+
+    featurized = base.embed(
+        base.EmbedAndKeep(
+            [
+                {"test_namespace1": str1},
+                {"test_namespace2": str2},
+                {"test_namespace3": str3},
+            ]
+        ),
+        MockEncoder(),
     )
+    for i in range(len(featurized)):
+        assert featurized[i].sparse == expected_sparse[i]
+        assert featurized[i].dense == expected_dense[i]


 def test_action_w_namespace_w_some_emb() -> None:
-    str1 = "test1"
-    str2 = "test2"
-    str3 = "test3"
-    encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2))
-    encoded_str3 = base._stringify_embedding(list(encoded_keyword + str3))
-    expected = [
-        {"test_namespace": str1},
-        {"test_namespace": encoded_str2},
-        {"test_namespace": encoded_str3},
+    str1 = "test"
+    str2 = "test_"
+    str3 = "test__"
+    expected_sparse = [
+        {"test_namespace": {"default_ft": str1}},
+        {},
+        {},
     ]
-    assert (
-        base.embed(
-            [
-                {"test_namespace": str1},
-                {"test_namespace": base.Embed(str2)},
-                {"test_namespace": base.Embed(str3)},
-            ],
-            MockEncoder(),
-        )
-        == expected
+    expected_dense = [
+        {},
+        {"test_namespace": [5.0, 0.0]},
+        {"test_namespace": [6.0, 0.0]},
+    ]
+
+    featurized = base.embed(
+        [
+            {"test_namespace": str1},
+            {"test_namespace": base.Embed(str2)},
+            {"test_namespace": base.Embed(str3)},
+        ],
+        MockEncoder(),
     )
-    expected_embed_and_keep = [
-        {"test_namespace": str1},
-        {"test_namespace": str2 + " " + encoded_str2},
-        {"test_namespace": str3 + " " + encoded_str3},
+    for i in range(len(featurized)):
+        assert featurized[i].sparse == expected_sparse[i]
+        assert featurized[i].dense == expected_dense[i]
+
+    expected_sparse = [
+        {"test_namespace": {"default_ft": str1}},
+        {"test_namespace": {"default_ft": str2}},
+        {"test_namespace": {"default_ft": str3}},
     ]
-    assert (
-        base.embed(
-            [
-                {"test_namespace": str1},
-                {"test_namespace": base.EmbedAndKeep(str2)},
-                {"test_namespace": base.EmbedAndKeep(str3)},
-            ],
-            MockEncoder(),
-        )
-        == expected_embed_and_keep
+    featurized = base.embed(
+        [
+            {"test_namespace": str1},
+            {"test_namespace": base.EmbedAndKeep(str2)},
+            {"test_namespace": base.EmbedAndKeep(str3)},
+        ],
+        MockEncoder(),
     )
+    for i in range(len(featurized)):
+        assert featurized[i].sparse == expected_sparse[i]
+        assert featurized[i].dense == expected_dense[i]


 def test_action_w_namespace_w_emb_w_more_than_one_item_in_first_dict() -> None:
-    str1 = "test1"
-    str2 = "test2"
-    str3 = "test3"
-    encoded_str1 = base._stringify_embedding(list(encoded_keyword + str1))
-    encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2))
-    encoded_str3 = base._stringify_embedding(list(encoded_keyword + str3))
-    expected = [
-        {"test_namespace": encoded_str1, "test_namespace2": str1},
-        {"test_namespace": encoded_str2, "test_namespace2": str2},
-        {"test_namespace": encoded_str3, "test_namespace2": str3},
+    str1 = "test"
+    str2 = "test_"
+    str3 = "test__"
+    expected_sparse = [
+        {"test_namespace2": {"default_ft": str1}},
+        {"test_namespace2": {"default_ft": str2}},
+        {"test_namespace2": {"default_ft": str3}},
     ]
-    assert (
-        base.embed(
-            [
-                {"test_namespace": base.Embed(str1), "test_namespace2": str1},
-                {"test_namespace": base.Embed(str2), "test_namespace2": str2},
-                {"test_namespace": base.Embed(str3), "test_namespace2": str3},
-            ],
-            MockEncoder(),
-        )
-        == expected
+    expected_dense = [
+        {"test_namespace": [4.0, 0.0]},
+        {"test_namespace": [5.0, 0.0]},
+        {"test_namespace": [6.0, 0.0]},
+    ]
+
+    featurized = base.embed(
+        [
+            {"test_namespace": base.Embed(str1), "test_namespace2": str1},
+            {"test_namespace": base.Embed(str2), "test_namespace2": str2},
+            {"test_namespace": base.Embed(str3), "test_namespace2": str3},
+        ],
+        MockEncoder(),
     )
-    expected_embed_and_keep = [
-        {"test_namespace": str1 + " " + encoded_str1, "test_namespace2": str1},
-        {"test_namespace": str2 + " " + encoded_str2, "test_namespace2": str2},
-        {"test_namespace": str3 + " " + encoded_str3, "test_namespace2": str3},
+    for i in range(len(featurized)):
+        assert featurized[i].sparse == expected_sparse[i]
+        assert featurized[i].dense == expected_dense[i]
+
+    expected_sparse = [
+        {
+            "test_namespace": {"default_ft": str1},
+            "test_namespace2": {"default_ft": str1},
+        },
+        {
+            "test_namespace": {"default_ft": str2},
+            "test_namespace2": {"default_ft": str2},
+        },
+        {
+            "test_namespace": {"default_ft": str3},
+            "test_namespace2": {"default_ft": str3},
+        },
     ]
-    assert (
-        base.embed(
-            [
-                {"test_namespace": base.EmbedAndKeep(str1), "test_namespace2": str1},
-                {"test_namespace": base.EmbedAndKeep(str2), "test_namespace2": str2},
-                {"test_namespace": base.EmbedAndKeep(str3), "test_namespace2": str3},
-            ],
-            MockEncoder(),
-        )
-        == expected_embed_and_keep
+    featurized = base.embed(
+        [
+            {"test_namespace": base.EmbedAndKeep(str1), "test_namespace2": str1},
+            {"test_namespace": base.EmbedAndKeep(str2), "test_namespace2": str2},
+            {"test_namespace": base.EmbedAndKeep(str3), "test_namespace2": str3},
+        ],
+        MockEncoder(),
     )
+    for i in range(len(featurized)):
+        assert featurized[i].sparse == expected_sparse[i]
+        assert featurized[i].dense == expected_dense[i]


 def test_one_namespace_w_list_of_features_no_emb() -> None:
     str1 = "test1"
     str2 = "test2"
-    expected = [{"test_namespace": [str1, str2]}]
-    assert base.embed({"test_namespace": [str1, str2]}, MockEncoder()) == expected
+    expected_sparse = {
+        "test_namespace_0": {"default_ft": str1},
+        "test_namespace_1": {"default_ft": str2},
+    }
+
+    featurized = base.embed({"test_namespace": [str1, str2]}, MockEncoder())
+    assert featurized.sparse == expected_sparse
+    assert featurized.dense == {}


 def test_one_namespace_w_list_of_features_w_some_emb() -> None:
-    str1 = "test1"
-    str2 = "test2"
-    encoded_str2 = base._stringify_embedding(list(encoded_keyword + str2))
-    expected = [{"test_namespace": [str1, encoded_str2]}]
-    assert (
-        base.embed({"test_namespace": [str1, base.Embed(str2)]}, MockEncoder())
-        == expected
-    )
+    str1 = "test"
+    str2 = "test_"
+    expected_sparse = {"test_namespace_0": {"default_ft": str1}}
+    expected_dense = {"test_namespace_1": [5.0, 0.0]}
+
+    featurized = base.embed({"test_namespace": [str1, base.Embed(str2)]}, MockEncoder())
+    assert featurized.sparse == expected_sparse
+    assert featurized.dense == expected_dense


 def test_nested_list_features_throws() -> None:
diff --git a/tests/unit_tests/test_utils.py b/tests/unit_tests/test_utils.py
index e52d1da..59a64e9 100644
--- a/tests/unit_tests/test_utils.py
+++ b/tests/unit_tests/test_utils.py
@@ -3,7 +3,7 @@
 class MockEncoder:
     def encode(self, to_encode: str) -> str:
-        return "[encoded]" + to_encode
+        return [float(len(to_encode)), 0.0]


 class MockEncoderReturnsList: