From 1057419e1b17d2f1c49e496bb73dd9d22ce103a2 Mon Sep 17 00:00:00 2001 From: Eric Soroos Date: Thu, 29 Feb 2024 20:04:01 +0000 Subject: [PATCH 1/6] performance tweak don't recalculate embeddings for similarity search, retrieve them from the db --- ckanext/embeddings/actions.py | 17 ++++++++--------- ckanext/embeddings/backends.py | 1 - 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/ckanext/embeddings/actions.py b/ckanext/embeddings/actions.py index 83f6dec..07231d2 100644 --- a/ckanext/embeddings/actions.py +++ b/ckanext/embeddings/actions.py @@ -12,18 +12,17 @@ def package_similar_show(context, data_dict): except ValueError: raise toolkit.ValidationError(f"Wrong value for limit paramater: {limit}") + field_name = toolkit.config.get("ckanext.embeddings.solr_vector_field_name", "vector") + try: - dataset_dict = toolkit.get_action("package_show")( - {"ignore_auth": True}, {"id": dataset_id} - ) - except toolkit.ObjectNotFound: + vectors = toolkit.get_action("package_search")( + {"ignore_auth": True}, {"fq": f"(id:{dataset_id} OR name:{dataset_id})", 'fl':f"{field_name},id"} + )['results'] + dataset_dict = vectors.pop() + dataset_embedding = dataset_dict[field_name] + except IndexError: raise toolkit.ObjectNotFound(f"Dataset not found: {dataset_id}") - backend = get_embeddings_backend() - dataset_embedding = backend.get_embedding_for_dataset(dataset_dict) - - field_name = toolkit.config.get("ckanext.embeddings.solr_vector_field_name", "vector") - search_params = {} search_params["defType"] = "lucene" search_params["q"] = f"{{!knn f={field_name} topK={limit}}}{list(dataset_embedding)}" diff --git a/ckanext/embeddings/backends.py b/ckanext/embeddings/backends.py index 8117177..501531c 100644 --- a/ckanext/embeddings/backends.py +++ b/ckanext/embeddings/backends.py @@ -13,7 +13,6 @@ class BaseEmbeddingsBackend: def get_dataset_values(self, dataset_dict): - if dataset_dict.get("notes"): return dataset_dict["title"] + " " + dataset_dict["notes"] else: From d8fa2ff38b4b3a188b3f7f6dba6e306abb45a174 Mon Sep 17 00:00:00 2001 From: Eric Soroos Date: Thu, 29 Feb 2024 20:08:54 +0000 Subject: [PATCH 2/6] performance -- lazy loading * lazy load and instantiate the backend and embeddings * cache the embeddings_backend in the module so that non-plugin uses use the same instance * property on plugin is required to avoid lazy app loading on uwsgi with multiple processes --- ckanext/embeddings/backends.py | 16 +++++++++++----- ckanext/embeddings/plugin.py | 13 +++++++------ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/ckanext/embeddings/backends.py b/ckanext/embeddings/backends.py index 501531c..203cac3 100644 --- a/ckanext/embeddings/backends.py +++ b/ckanext/embeddings/backends.py @@ -105,15 +105,21 @@ def _load_embeddings_backends(): embeddings_backends[ep.name] = ep.load() log.debug(f"Registering Embeddings Backend: {ep.name}") +_embeddings_backend = None def get_embeddings_backend(): - # TODO: config declaration - + global _embeddings_backend backend = toolkit.config.get("ckanext.embeddings.backend", "sentence_transformers") log.debug(f"Using Embeddings Backend: {backend}") - return embeddings_backends[backend]() - + import time + start = time.time() + try: + _load_embeddings_backends() + if _embeddings_backend is None: + _embeddings_backend = embeddings_backends[backend]() + return _embeddings_backend + finally: + log.debug("loading embeddings took: %.3f sec", time.time()-start) -_load_embeddings_backends() diff --git a/ckanext/embeddings/plugin.py b/ckanext/embeddings/plugin.py index 43a3a7a..4629ea8 100644 --- a/ckanext/embeddings/plugin.py +++ b/ckanext/embeddings/plugin.py @@ -20,14 +20,17 @@ class EmbeddingPlugin(plugins.SingletonPlugin): plugins.implements(plugins.ITemplateHelpers) plugins.implements(plugins.IPackageController, inherit=True) - backend = None + _backend = None + + @property + def backend(self): + if self._backend is None: + self._backend = get_embeddings_backend() + return self._backend # IConfigurer def update_config(self, config): - - self.backend = get_embeddings_backend() - toolkit.add_template_directory(config, "templates") toolkit.add_resource("assets", "ckanext-embeddings") @@ -57,8 +60,6 @@ def before_dataset_index(self, dataset_dict): dataset_id = dataset_dict["id"] - if not self.backend: - self.backend = get_embeddings_backend() dataset_embedding = self.backend.get_embedding_for_dataset(dataset_dict) if dataset_embedding is not None: From 9803e4e378b42af1c47dfb9c52db739b472e1cc0 Mon Sep 17 00:00:00 2001 From: Eric Soroos Date: Thu, 29 Feb 2024 20:09:17 +0000 Subject: [PATCH 3/6] compatibility -- python 3.8+3.9 --- ckanext/embeddings/backends.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ckanext/embeddings/backends.py b/ckanext/embeddings/backends.py index 203cac3..ec372c0 100644 --- a/ckanext/embeddings/backends.py +++ b/ckanext/embeddings/backends.py @@ -101,7 +101,12 @@ def create_embedding(self, values): def _load_embeddings_backends(): from importlib.metadata import entry_points - for ep in entry_points(group="ckanext.embeddings.backends"): + try: + eps = entry_points(group="ckanext.embeddings.backends") + except: + # python 3.9/3.8 + eps = (ep for ep in entry_points()['ckanext.embeddings.backends']) + for ep in eps: embeddings_backends[ep.name] = ep.load() log.debug(f"Registering Embeddings Backend: {ep.name}") From 539a6b3ed986f6dc7ac428bbcf4af94cda29e06d Mon Sep 17 00:00:00 2001 From: Eric Soroos Date: Thu, 29 Feb 2024 20:10:04 +0000 Subject: [PATCH 4/6] performance -- allows download/loading of models to be forced from cli --- ckanext/embeddings/cli.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ckanext/embeddings/cli.py b/ckanext/embeddings/cli.py index ee038e0..9746004 100644 --- a/ckanext/embeddings/cli.py +++ b/ckanext/embeddings/cli.py @@ -66,3 +66,10 @@ def search(query: str, limit: int): for r in result["results"]: print(f"{r['id']} - {r['title']}") + + +@embeddings.command() +def load(): + """ Loads the backend embeddings, filling whatever cache is required, downloading models, etc """ + backend = get_embeddings_backend() + From 73d98de369311ca2055c9aec4fe59dce2fe4b8c5 Mon Sep 17 00:00:00 2001 From: Eric Soroos Date: Thu, 29 Feb 2024 20:12:07 +0000 Subject: [PATCH 5/6] ckan 2.9 compatibility --- ckanext/embeddings/plugin.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ckanext/embeddings/plugin.py b/ckanext/embeddings/plugin.py index 4629ea8..af34975 100644 --- a/ckanext/embeddings/plugin.py +++ b/ckanext/embeddings/plugin.py @@ -70,6 +70,9 @@ def before_dataset_index(self, dataset_dict): return dataset_dict + def before_index(self, dataset_dict): + return self.before_dataset_index(dataset_dict) + def before_dataset_search(self, search_params): extras = search_params.get("extras", {}) if isinstance(extras, str): @@ -102,3 +105,6 @@ def before_dataset_search(self, search_params): search_params["q"] = f"{{!knn f={field_name} topK={rows}}}{list(embedding)}" return search_params + + def before_search(self, search_params): + return self.before_dataset_search(search_params) From aa47612a0edb6a29bf55784e26697c85f3686e24 Mon Sep 17 00:00:00 2001 From: Eric Soroos Date: Tue, 5 Mar 2024 10:35:00 +0000 Subject: [PATCH 6/6] add logging, remove backends import --- ckanext/embeddings/actions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ckanext/embeddings/actions.py b/ckanext/embeddings/actions.py index 07231d2..75a5068 100644 --- a/ckanext/embeddings/actions.py +++ b/ckanext/embeddings/actions.py @@ -1,7 +1,7 @@ from ckan.plugins import toolkit -from ckanext.embeddings.backends import get_embeddings_backend - +import logging +log = logging.getLogger(__name__) @toolkit.side_effect_free def package_similar_show(context, data_dict):