From b22fe554b7d12a0484e4583d3f759247d8783a31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felipe=20Ba=C3=B1ados=20Schwerter?= Date: Fri, 1 Nov 2024 17:55:55 +0000 Subject: [PATCH] Test change to definition vectors: May need future reverting but starts a conversation. --- src/morphodict/cvd/__init__.py | 12 +++++++++++- .../management/commands/builddefinitionvectors.py | 7 ++++++- src/morphodict/cvd/news-vectors.ipynb | 4 ++-- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/morphodict/cvd/__init__.py b/src/morphodict/cvd/__init__.py index 4384f042a..0b0aa8081 100644 --- a/src/morphodict/cvd/__init__.py +++ b/src/morphodict/cvd/__init__.py @@ -7,6 +7,7 @@ from gensim.models import KeyedVectors from morphodict.lexicon import MORPHODICT_LEXICON_RESOURCE_DIR +from morphodict.relabelling import LABELS logger = logging.getLogger(__name__) @@ -74,6 +75,9 @@ def uniq(l: list) -> list: return list(dict.fromkeys(l)) +# Note: The vectors computed here for definitions are memoized by the builddefinitionvectors Django command; +# they are not computed live. If this code changes, please make sure to re-run the +# builddefinitionvectors command. 
def vector_for_keys(keyed_vectors, keys: list[str]): """Return the sum of vectors in keyed_vectors for the given keys""" if not keys: @@ -85,7 +89,7 @@ def vector_for_keys(keyed_vectors, keys: list[str]): RE_PUNCTUATION = re.compile(r'[!,.\[\]\(\)\{\};:"/\?]+') -def extract_keyed_words(query: str, keys, already_warned=None): +def extract_keyed_words(query: str, keys, already_warned=None, analysis=[]): """Split query into a list of words that occur in keys already_warned is an optional set, used to reduce debug log verbosity @@ -112,6 +116,12 @@ def extract_keyed_words(query: str, keys, already_warned=None): _warn(piece, f"not found: {word!r} piece {piece!r}", already_warned) else: _warn(word, f"not found: {word!r}", already_warned) + analysis = LABELS.linguistic_short.get_longest( + tuple([t.strip("+") for t in analysis]) + ) + + if analysis: + ret.extend(extract_keyed_words(analysis, keys, already_warned)) return uniq(ret) diff --git a/src/morphodict/cvd/management/commands/builddefinitionvectors.py b/src/morphodict/cvd/management/commands/builddefinitionvectors.py index 79466c383..4ef700395 100644 --- a/src/morphodict/cvd/management/commands/builddefinitionvectors.py +++ b/src/morphodict/cvd/management/commands/builddefinitionvectors.py @@ -47,7 +47,12 @@ def handle(self, output_file, debug_output_file, **options): with create_debug_output(debug_output_file) as debug_output: for d in tqdm(definitions.iterator(), total=count): keys = extract_keyed_words( - d.semantic_definition, news_vectors, unknown_words + d.semantic_definition, + news_vectors, + unknown_words, + analysis=( + d.wordform.raw_analysis[2] if d.wordform.raw_analysis else [] + ), ) debug_output( json.dumps( diff --git a/src/morphodict/cvd/news-vectors.ipynb b/src/morphodict/cvd/news-vectors.ipynb index 3dc62be82..81080da54 100644 --- a/src/morphodict/cvd/news-vectors.ipynb +++ b/src/morphodict/cvd/news-vectors.ipynb @@ -1214,7 +1214,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", 
+ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1228,7 +1228,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.4" + "version": "3.10.15" } }, "nbformat": 4,