diff --git a/retvec/__init__.py b/retvec/__init__.py index 49d69b6..7c125ca 100644 --- a/retvec/__init__.py +++ b/retvec/__init__.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,4 +14,4 @@ limitations under the License. """ -__version__ = "0.1.0" +__version__ = "1.0.0" diff --git a/retvec/tf/__init__.py b/retvec/tf/__init__.py index 473367c..6c01262 100644 --- a/retvec/tf/__init__.py +++ b/retvec/tf/__init__.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/retvec/tf/dataset/__init__.py b/retvec/tf/dataset/__init__.py index 662a4a4..83f918e 100644 --- a/retvec/tf/dataset/__init__.py +++ b/retvec/tf/dataset/__init__.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/retvec/tf/dataset/io.py b/retvec/tf/dataset/io.py index fc84dfd..c5644c8 100644 --- a/retvec/tf/dataset/io.py +++ b/retvec/tf/dataset/io.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/retvec/tf/layers/__init__.py b/retvec/tf/layers/__init__.py index aa74759..f6246e8 100644 --- a/retvec/tf/layers/__init__.py +++ b/retvec/tf/layers/__init__.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/retvec/tf/layers/binarizer.py b/retvec/tf/layers/binarizer.py index 11b73c2..ca141a6 100644 --- a/retvec/tf/layers/binarizer.py +++ b/retvec/tf/layers/binarizer.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/retvec/tf/layers/embedding.py b/retvec/tf/layers/embedding.py index 75b0cce..a96f2cc 100644 --- a/retvec/tf/layers/embedding.py +++ b/retvec/tf/layers/embedding.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,6 +20,8 @@ import tensorflow as tf from tensorflow import Tensor, TensorShape +from ..utils import RETVEC_MODEL_URLS, download_retvec_saved_model + @tf.keras.utils.register_keras_serializable(package="retvec") class RETVecEmbedding(tf.keras.layers.Layer): @@ -36,7 +38,8 @@ def __init__( Args: model: Path to saved pretrained RETVec model, str or pathlib.Path - object. + object. 'retvec-v1' to use V1 of the pre-trained RETVec word + embedding model. trainable: Whether to make the pretrained RETVec model trainable or to freeze all weights. @@ -93,11 +96,16 @@ def _load_model( """Load pretrained RETVec model. Args: - path: Path to the saved REW* model. + path: Pre-defined RETVec model name (e.g. 'retvec-v1') or path to a + saved pretrained RETVec model, str or pathlib.Path. Returns: The pretrained RETVec model, trainable set to `self.trainable`.
""" + path_str = str(path) + if path_str in RETVEC_MODEL_URLS: + path = download_retvec_saved_model(path_str) + model = tf.keras.models.load_model(path) model.trainable = self.trainable model.compile("adam", "mse") diff --git a/retvec/tf/layers/integerizer.py b/retvec/tf/layers/integerizer.py index 2808c32..f1d3af8 100644 --- a/retvec/tf/layers/integerizer.py +++ b/retvec/tf/layers/integerizer.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/retvec/tf/layers/tokenizer.py b/retvec/tf/layers/tokenizer.py index d3c1823..1a4987f 100644 --- a/retvec/tf/layers/tokenizer.py +++ b/retvec/tf/layers/tokenizer.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -79,7 +79,9 @@ def __init__( `sequence_length` words. model: Path to saved pretrained RETVec model, str or pathlib.Path - object. + object. 'retvec-v1' to use V1 of the pre-trained RETVec word + embedding model, None to use the default RETVec character + encoding. trainable: Whether to make the pretrained RETVec model trainable or to freeze all weights. diff --git a/retvec/tf/models/__init__.py b/retvec/tf/models/__init__.py index 662a4a4..83f918e 100644 --- a/retvec/tf/models/__init__.py +++ b/retvec/tf/models/__init__.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/retvec/tf/models/gau.py b/retvec/tf/models/gau.py index f04bd5c..1a13a86 100644 --- a/retvec/tf/models/gau.py +++ b/retvec/tf/models/gau.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/retvec/tf/models/layers.py b/retvec/tf/models/layers.py index 184ffad..aafccb8 100644 --- a/retvec/tf/models/layers.py +++ b/retvec/tf/models/layers.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/retvec/tf/models/outputs.py b/retvec/tf/models/outputs.py index b9929b2..09e84ef 100644 --- a/retvec/tf/models/outputs.py +++ b/retvec/tf/models/outputs.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/retvec/tf/models/positional_embeddings.py b/retvec/tf/models/positional_embeddings.py index 50b3ce1..58ab0f3 100644 --- a/retvec/tf/models/positional_embeddings.py +++ b/retvec/tf/models/positional_embeddings.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/retvec/tf/models/retvec_base.py b/retvec/tf/models/retvec_base.py index b8aa34f..1b8fd50 100644 --- a/retvec/tf/models/retvec_base.py +++ b/retvec/tf/models/retvec_base.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/retvec/tf/models/retvec_large.py b/retvec/tf/models/retvec_large.py index 6dfd41a..9960ff6 100644 --- a/retvec/tf/models/retvec_large.py +++ b/retvec/tf/models/retvec_large.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/retvec/tf/optimizers/__init__.py b/retvec/tf/optimizers/__init__.py index e78f16e..5a8017f 100644 --- a/retvec/tf/optimizers/__init__.py +++ b/retvec/tf/optimizers/__init__.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/retvec/tf/optimizers/warmup_cosine_decay.py b/retvec/tf/optimizers/warmup_cosine_decay.py index 20342e1..b124179 100644 --- a/retvec/tf/optimizers/warmup_cosine_decay.py +++ b/retvec/tf/optimizers/warmup_cosine_decay.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/retvec/tf/utils.py b/retvec/tf/utils.py index 8d0b125..49ffeee 100644 --- a/retvec/tf/utils.py +++ b/retvec/tf/utils.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,9 +14,27 @@ limitations under the License. 
""" +import os +from pathlib import Path +from typing import Optional import tensorflow as tf +RETVEC_MODEL_URLS = { + "retvec-v1": "https://storage.googleapis.com/tensorflow/keras-applications/retvec-v1" +} + +# TODO (marinazh): we should download RETVec model weights instead of SavedModel files +RETVEC_COMPONENTS_HASHES = { + "retvec-v1": { + "fingerprint.pb": "5c3991599c293ba653c55e8cceae8e10815eeedea6aff75a64905cd71587d4c1", + "keras_metadata.pb": "e87e8b660ef66f8a058c4c0aa8bfaa8b683bcd4669c21e4bf71055148f8c6afc", + "saved_model.pb": "337c8e91c92946513d127b256f2872a497545186c4d2c2c09afc7d76b55454b7", + "variables.data-00000-of-00001": "22d4760b452fe8110ef2fa96b3d84186372f5259b8f6c4041a05c3ab58d93d37", + "variables.index": "431d19b7426b939c9834bb7d55d515a4ee7d7a6cda78ef0bf7b8ba03e67e480b", + } +} + def tf_cap_memory(): """Avoid TF to hog memory before needing it""" @@ -38,3 +56,51 @@ def clone_initializer(initializer: tf.keras.initializers.Initializer): ): return initializer.__class__.from_config(initializer.get_config()) return initializer + + +def download_retvec_saved_model( + model_name: str = "retvec-v1", + cache_dir: str = "~/.keras/", + model_cache_subdir: str = "retvec-v1", +) -> Path: + """Download a pretrained RETVec SavedModel into the Keras cache. + + Args: + model_name: Name of the model, a key of RETVEC_MODEL_URLS. + cache_dir: Base directory of the Keras cache. + model_cache_subdir: Directory under cache_dir that receives the + SavedModel files. + + Returns: + Path of the local directory holding the SavedModel files. + + Raises: + ValueError: If model_name is not a key of RETVEC_MODEL_URLS. + """ + if model_name not in RETVEC_MODEL_URLS: + raise ValueError(f"{model_name} is not a valid RETVec model name.") + + model_url = RETVEC_MODEL_URLS[model_name] + model_cache_subdir_variables = f"{model_cache_subdir}/variables" + + # download model components + retvec_components = RETVEC_COMPONENTS_HASHES[model_name] + for component_name, component_hash in retvec_components.items(): + if "variables" in component_name: + origin = f"{model_url}/variables/{component_name}" + cache_subdir = model_cache_subdir_variables + else: + origin = f"{model_url}/{component_name}" + cache_subdir = model_cache_subdir + + tf.keras.utils.get_file( + origin=origin, + extract=True, + cache_dir=cache_dir, + cache_subdir=cache_subdir, + file_hash=component_hash, + ) + + # join as paths so cache_dir works with or without a trailing slash + retvec_model_dir = Path(cache_dir) / model_cache_subdir +
return Path(retvec_model_dir).expanduser() diff --git a/setup.py b/setup.py index 6475624..c742e6f 100644 --- a/setup.py +++ b/setup.py @@ -71,8 +71,6 @@ def get_version(rel_path): classifiers=[ "Development Status :: 3 - Alpha", "Environment :: Console", - "Framework :: TensorFlow", - "Framework :: Torch", "License :: OSI Approved :: Apache Software License", "Intended Audience :: Science/Research", "Programming Language :: Python :: 3", diff --git a/tests/conftest.py b/tests/conftest.py index 6377c54..dd6fa0c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/tests/tf/layers/test_binarizer.py b/tests/tf/layers/test_binarizer.py index 38f7f4e..66f51b5 100644 --- a/tests/tf/layers/test_binarizer.py +++ b/tests/tf/layers/test_binarizer.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/tests/tf/layers/test_embedding.py b/tests/tf/layers/test_embedding.py index fdf56da..5b8f50e 100644 --- a/tests/tf/layers/test_embedding.py +++ b/tests/tf/layers/test_embedding.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
@@ -18,11 +18,20 @@ from retvec.tf.layers import RETVecBinarizer, RETVecEmbedding -TEST_EMB_SIZE = 16 +TEST_EMB_SIZE = 256 +TEST_WORD_LENGTH = 16 +TEST_CHAR_ENCODING_SIZE = 24 +TEST_INPUTS = [ + tf.constant(["Testing😀"]), + tf.constant(["Testing😀", "Testing😀"]), + tf.constant(["Testing a very long string as input"]), +] def create_retvec_embedding(tmp_path): - i = tf.keras.layers.Input((16, 32), dtype=tf.float32) + i = tf.keras.layers.Input( + (TEST_WORD_LENGTH, TEST_CHAR_ENCODING_SIZE), dtype=tf.float32 + ) x = tf.keras.layers.Flatten()(i) o = tf.keras.layers.Dense(TEST_EMB_SIZE)(x) model = tf.keras.models.Model(i, o) @@ -36,32 +45,23 @@ def create_retvec_embedding(tmp_path): def test_rewnet_model(tmp_path): embedding_model = create_retvec_embedding(tmp_path) - binarizer = RETVecBinarizer(word_length=16, encoding_size=32) - - test_inputs = [ - tf.constant(["Testing😀"]), - tf.constant(["Testing😀", "Testing😀"]), - tf.constant(["Testing a very long string as input"]), - ] + binarizer = RETVecBinarizer( + word_length=TEST_WORD_LENGTH, encoding_size=TEST_CHAR_ENCODING_SIZE + ) - for test_input in test_inputs: + for test_input in TEST_INPUTS: embeddings = embedding_model(binarizer.binarize(test_input)) assert embeddings.shape == (test_input.shape[0], TEST_EMB_SIZE) def test_2d_inputs(tmp_path): - i = tf.keras.layers.Input((16, 32), dtype=tf.float32) - x = tf.keras.layers.Flatten()(i) - o = tf.keras.layers.Dense(16)(x) - model = tf.keras.models.Model(i, o) - - save_path = tmp_path / "test_retvec_embedding" - model.save(save_path) - - embedding_model = RETVecEmbedding(str(save_path)) + embedding_model = create_retvec_embedding(tmp_path) test_input = tf.random.uniform( - (2, 3, 16, 32), minval=0, maxval=2, dtype=tf.int32 + (2, 3, TEST_WORD_LENGTH, TEST_CHAR_ENCODING_SIZE), + minval=0, + maxval=2, + dtype=tf.int32, ) test_input = tf.cast(test_input, dtype=tf.float32) embeddings = embedding_model(test_input) @@ -70,24 +70,22 @@ def test_2d_inputs(tmp_path): def 
test_binarizer_embedding_model(tmp_path): i = tf.keras.layers.Input((1,), dtype=tf.string) - x = RETVecBinarizer(word_length=16, encoding_size=32)(i) + x = RETVecBinarizer( + word_length=TEST_WORD_LENGTH, encoding_size=TEST_CHAR_ENCODING_SIZE + )(i) o = create_retvec_embedding(tmp_path)(x) model = tf.keras.models.Model(i, o) - test_inputs = [ - tf.constant(["Testing😀"]), - tf.constant(["Testing😀", "Testing😀"]), - tf.constant(["Testing a very long string as input"]), - ] - - for test_input in test_inputs: + for test_input in TEST_INPUTS: embeddings = model(test_input) assert embeddings.shape == (test_input.shape[0], TEST_EMB_SIZE) def test_binarizer_embedding_model_2d(tmp_path): i = tf.keras.layers.Input((3,), dtype=tf.string) - x = RETVecBinarizer(word_length=16, encoding_size=32)(i) + x = RETVecBinarizer( + word_length=TEST_WORD_LENGTH, encoding_size=TEST_CHAR_ENCODING_SIZE + )(i) o = create_retvec_embedding(tmp_path)(x) model = tf.keras.models.Model(i, o) @@ -104,10 +102,29 @@ def test_binarizer_embedding_model_2d(tmp_path): def test_serialization(tmp_path): embedding_model = create_retvec_embedding(tmp_path) - i = tf.keras.layers.Input((16, 32), dtype=tf.float32) + i = tf.keras.layers.Input( + (TEST_WORD_LENGTH, TEST_CHAR_ENCODING_SIZE), dtype=tf.float32 + ) x = embedding_model(i) model = tf.keras.models.Model(i, x) save_path = tmp_path / "test_retvec_embedding_serialization" model.save(save_path) tf.keras.models.load_model(save_path) + + +def test_default_embedding_model(tmp_path): + embedding_size = 256 + binarizer = RETVecBinarizer( + word_length=TEST_WORD_LENGTH, encoding_size=TEST_CHAR_ENCODING_SIZE + ) + + i = tf.keras.layers.Input( + (TEST_WORD_LENGTH, TEST_CHAR_ENCODING_SIZE), dtype=tf.float32 + ) + x = RETVecEmbedding(model="retvec-v1")(i) + model = tf.keras.models.Model(i, x) + + for test_input in TEST_INPUTS: + embeddings = model(binarizer.binarize(test_input)) + assert embeddings.shape == (test_input.shape[0], embedding_size) diff --git 
a/tests/tf/layers/test_integerizer.py b/tests/tf/layers/test_integerizer.py index 6a824cd..6f5860a 100644 --- a/tests/tf/layers/test_integerizer.py +++ b/tests/tf/layers/test_integerizer.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/tests/tf/layers/test_tokenizer.py b/tests/tf/layers/test_tokenizer.py index 0b0faec..9de5c9c 100644 --- a/tests/tf/layers/test_tokenizer.py +++ b/tests/tf/layers/test_tokenizer.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,30 +20,15 @@ SEQUENCE_LENGTH = 128 WORD_LENGTH = 16 -CHAR_ENCODING_SIZE = 32 -EMBEDDING_SIZE = 128 - - -def create_and_save_retvec_embedding(tmp_path): - i = tf.keras.layers.Input( - (WORD_LENGTH, CHAR_ENCODING_SIZE), dtype=tf.float32 - ) - x = tf.keras.layers.Flatten()(i) - o = tf.keras.layers.Dense(EMBEDDING_SIZE)(x) - model = tf.keras.models.Model(i, o) - - save_path = tmp_path / "test_retvec_embedding" - model.save(save_path) - return str(save_path) +CHAR_ENCODING_SIZE = 24 +RETVEC_MODEL = "retvec-v1" def test_graph_mode_with_model(tmp_path): - model_path = create_and_save_retvec_embedding(tmp_path) - i = tf.keras.layers.Input((1,), dtype=tf.string) x = RETVecTokenizer( sequence_length=SEQUENCE_LENGTH, - model=model_path, + model=RETVEC_MODEL, word_length=WORD_LENGTH, char_encoding_size=CHAR_ENCODING_SIZE, )(i) @@ -59,28 +44,25 @@ def test_graph_mode_with_model(tmp_path): assert embeddings.shape == ( test_input.shape[0], SEQUENCE_LENGTH, - EMBEDDING_SIZE, + 256, ) def test_eager_mode_with_model(tmp_path): - model_path = create_and_save_retvec_embedding(tmp_path) - tokenizer = RETVecTokenizer( - model=model_path, + model=RETVEC_MODEL, sequence_length=SEQUENCE_LENGTH, 
word_length=WORD_LENGTH, char_encoding_size=CHAR_ENCODING_SIZE, ) - assert tokenizer.embedding_size == EMBEDDING_SIZE s = "Testing😀 a full sentence" embeddings = tokenizer.tokenize(tf.constant(s)) - assert embeddings.shape == [SEQUENCE_LENGTH, EMBEDDING_SIZE] + assert embeddings.shape == [SEQUENCE_LENGTH, tokenizer.embedding_size] embeddings = tokenizer.tokenize(tf.constant([s, s, s])) - assert embeddings.shape == [3, SEQUENCE_LENGTH, EMBEDDING_SIZE] + assert embeddings.shape == [3, SEQUENCE_LENGTH, tokenizer.embedding_size] def test_graph_mode_no_model(): @@ -146,11 +128,9 @@ def test_standardize(): def test_tfds_map_tokenize(tmp_path): - model_path = create_and_save_retvec_embedding(tmp_path) - - for model in [None, model_path]: + for model_path in [None, RETVEC_MODEL]: tokenizer = RETVecTokenizer( - model=model, + model=model_path, sequence_length=SEQUENCE_LENGTH, word_length=WORD_LENGTH, char_encoding_size=CHAR_ENCODING_SIZE, @@ -172,12 +152,10 @@ def test_tfds_map_tokenize(tmp_path): def test_serialization(tmp_path): - model_path = create_and_save_retvec_embedding(tmp_path) - - for model in [None, model_path]: + for model_path in [None, RETVEC_MODEL]: i = tf.keras.layers.Input((1,), dtype=tf.string) x = RETVecTokenizer( - model=model, + model=model_path, sequence_length=SEQUENCE_LENGTH, word_length=WORD_LENGTH, char_encoding_size=CHAR_ENCODING_SIZE, diff --git a/tests/tf/models/test_models.py b/tests/tf/models/test_models.py index 78fef18..aa88a80 100644 --- a/tests/tf/models/test_models.py +++ b/tests/tf/models/test_models.py @@ -1,5 +1,5 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/training/README.md b/training/README.md index e0f7d54..fbcc16c 100644 --- a/training/README.md +++ b/training/README.md @@ -1,7 +1,6 @@ # RetVec Training -This directory contains the scripts needed to train RetVec models as described -in [Fixme] +This directory contains the scripts needed to train RETVec models. ## Usage diff --git a/training/train_tf_retvec_models.py b/training/train_tf_retvec_models.py index 63b91d8..5ce1ccd 100644 --- a/training/train_tf_retvec_models.py +++ b/training/train_tf_retvec_models.py @@ -1,9 +1,12 @@ """ - Copyright 2021 Google LLC + Copyright 2023 Google LLC + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + https://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.