Skip to content

Commit

Permalink
Test change to definition vectors: May need future reverting but star…
Browse files Browse the repository at this point in the history
…ts a conversation.
  • Loading branch information
fbanados committed Nov 1, 2024
1 parent ab42669 commit b22fe55
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 4 deletions.
12 changes: 11 additions & 1 deletion src/morphodict/cvd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from gensim.models import KeyedVectors

from morphodict.lexicon import MORPHODICT_LEXICON_RESOURCE_DIR
from morphodict.relabelling import LABELS

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -74,6 +75,9 @@ def uniq(l: list) -> list:
return list(dict.fromkeys(l))


# Note: The vectors computed here for definitions are memoized by the builddefinitionvectors django command;
# they are not computed live. If this code changes, please make sure to re-run the builddefinitionvectors
# command.
def vector_for_keys(keyed_vectors, keys: list[str]):
"""Return the sum of vectors in keyed_vectors for the given keys"""
if not keys:
Expand All @@ -85,7 +89,7 @@ def vector_for_keys(keyed_vectors, keys: list[str]):
RE_PUNCTUATION = re.compile(r'[!,.\[\]\(\)\{\};:"/\?]+')


def extract_keyed_words(query: str, keys, already_warned=None):
def extract_keyed_words(query: str, keys, already_warned=None, analysis=[]):
"""Split query into a list of words that occur in keys
already_warned is an optional set, used to reduce debug log verbosity
Expand All @@ -112,6 +116,12 @@ def extract_keyed_words(query: str, keys, already_warned=None):
_warn(piece, f"not found: {word!r} piece {piece!r}", already_warned)
else:
_warn(word, f"not found: {word!r}", already_warned)
analysis = LABELS.linguistic_short.get_longest(
tuple([t.strip("+") for t in analysis])
)

if analysis:
ret.extend(extract_keyed_words(analysis, keys, already_warned))

return uniq(ret)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,12 @@ def handle(self, output_file, debug_output_file, **options):
with create_debug_output(debug_output_file) as debug_output:
for d in tqdm(definitions.iterator(), total=count):
keys = extract_keyed_words(
d.semantic_definition, news_vectors, unknown_words
d.semantic_definition,
news_vectors,
unknown_words,
analysis=(
d.wordform.raw_analysis[2] if d.wordform.raw_analysis else []
),
)
debug_output(
json.dumps(
Expand Down
4 changes: 2 additions & 2 deletions src/morphodict/cvd/news-vectors.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1214,7 +1214,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -1228,7 +1228,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
"version": "3.10.15"
}
},
"nbformat": 4,
Expand Down

0 comments on commit b22fe55

Please sign in to comment.