Merge pull request #22 from UAlbertaALTLab/semantic_searchNEW
Semantic search new
M1Al3x authored Nov 6, 2023
2 parents 21d209c + 4974e6d commit bd01aee
Showing 10 changed files with 947 additions and 935 deletions.
2 changes: 0 additions & 2 deletions Pipfile
@@ -11,7 +11,6 @@ pytest-cov = "*"
pytest-datadir = "*"
pytest-mypy = "*"
hypothesis = {version = "~=4.34", extras = ["django"]}
codecov = "*"
pysnooper = "*"
python-levenshtein = "*"
django-debug-toolbar = "*"
@@ -21,7 +20,6 @@ mypy = "*"
pytest-env = "*"
jupyterlab = "*"
appnope = "*"
nb_black = "*"
statsmodels = "*"
pandas-stubs = "*"
pytest-pythonpath = "*"
1,847 changes: 933 additions & 914 deletions Pipfile.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/API/__init__.py
@@ -0,0 +1 @@

7 changes: 7 additions & 0 deletions src/API/search/cvd_search.py
@@ -31,6 +31,7 @@ def do_cvd_search(search_run: SearchRun):

search_run.add_verbose_message(cvd_extracted_keys=keys)
query_vector = vector_for_keys(google_news_vectors(), keys)
print(query_vector)

try:
closest = definition_vectors().similar_by_vector(query_vector, 50)
@@ -41,20 +42,25 @@ def do_cvd_search(search_run: SearchRun):
wordform_queries = [
cvd_key_to_wordform_query(similarity) for similarity, weight in closest
]
print(wordform_queries)
similarities = [similarity for cvd_key, similarity in closest]
# Get all possible wordforms in one big query. We will select more than we
# need, then filter it down later, but this will have to do until we get
# better homonym handling.
print(Wordform.objects.count())
wordform_results = Wordform.objects.filter(
text__in=set(wf["text"] for wf in wordform_queries)
)
print(wordform_results)

# Now match back up
wordforms_by_text = {
text: list(wordforms)
for text, wordforms in itertools.groupby(wordform_results, key=lambda x: x.text)
}

print(wordforms_by_text)

for similarity, wordform_query in zip(similarities, wordform_queries):
# gensim uses the terminology, similarity = 1 - distance. Its
# similarity is a number from 0 to 1, with more similar items having
@@ -63,6 +69,7 @@ def do_cvd_search(search_run: SearchRun):
distance = 1 - similarity

wordforms_for_query = wordforms_by_text.get(wordform_query["text"], None)
print(wordforms_for_query)
if wordforms_for_query is None:
logger.warning(
f"Wordform {wordform_query['text']} not found in CVD; mismatch between definition vector model file and definitions in database?"
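
For context on the cvd_search.py hunks above: the search builds a single query vector from the extracted keys, asks gensim for the 50 nearest definition vectors, and then resolves the results back to database rows with one bulk query. Below is a minimal sketch of that pattern, assuming a gensim KeyedVectors model and a Django Wordform model with a text field; the function name and the assumption that each vector key equals the wordform text are illustrative only (the real code goes through cvd_key_to_wordform_query).

import itertools

def similar_wordforms(query_vector, definition_vectors, Wordform, topn=50):
    # gensim's KeyedVectors.similar_by_vector returns (key, similarity) pairs;
    # similarity = 1 - distance and ranges from 0 (unrelated) to 1 (identical).
    closest = definition_vectors.similar_by_vector(query_vector, topn)

    # Sketch-only assumption: each key is simply the wordform text.
    texts = {key for key, _similarity in closest}

    # One bulk query, then group rows back by text. itertools.groupby only
    # merges *consecutive* items, so the queryset must be ordered by text.
    rows = Wordform.objects.filter(text__in=texts).order_by("text")
    by_text = {
        text: list(group)
        for text, group in itertools.groupby(rows, key=lambda wf: wf.text)
    }

    results = []
    for key, similarity in closest:
        for wf in by_text.get(key, []):
            results.append((wf, 1 - similarity))  # convert similarity back to a distance
    return results
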
2 changes: 1 addition & 1 deletion src/crkeng/res/fst/.gitattributes
@@ -1 +1 @@
*.fomabin filter=lfs diff=lfs merge=lfs -text

Binary file not shown.
Binary file not shown.
Binary file modified src/crkeng/res/fst/transcriptor-eng-phrase2crk-features.fomabin
Binary file not shown.
18 changes: 2 additions & 16 deletions src/helpers.py
@@ -389,29 +389,15 @@ def get_recordings_from_paradigm(paradigm, request):

if request.COOKIES.get("synthesized_audio_in_paradigm") == "yes":
speech_db_eq.insert(0, "synth")
query_terms = [
query_terms[0],
query_terms[1],
query_terms[2],
query_terms[3],
query_terms[4],
]
query_terms = [query_terms[0], query_terms[1], query_terms[2], query_terms[3],query_terms[4] ]
for search_terms in divide_chunks(query_terms, 30):
for source in speech_db_eq:
temp.append(None)
index = 0
for search_terms in divide_chunks(query_terms, 30):
for source in speech_db_eq:
url = f"https://speech-db.altlab.app/{source}/api/bulk_search"
x = threading.Thread(
target=get_recordings_from_url,
args=(
search_terms,
url,
temp,
index,
),
)
x = threading.Thread(target=get_recordings_from_url, args=(search_terms, url, temp, index,))
threads.append(x)
x.start()
index += 1
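
The helpers.py hunk above mostly reflows the chunked, threaded bulk lookup against speech-db: the query terms are split into chunks of 30, one worker thread is started per (chunk, source) pair against https://speech-db.altlab.app/{source}/api/bulk_search, and each thread writes into a pre-allocated slot. A rough sketch of that fan-out pattern follows, assuming the requests library and a guessed query-parameter name; divide_chunks and get_recordings_from_url are the project's own helpers and are only approximated here.

import threading
import requests  # assumption: the project's helper may use a different HTTP client

def divide_chunks(items, size):
    # Yield successive size-length slices of items.
    for i in range(0, len(items), size):
        yield items[i:i + size]

def fetch_bulk_recordings(query_terms, sources):
    jobs = [
        (chunk, f"https://speech-db.altlab.app/{source}/api/bulk_search")
        for chunk in divide_chunks(query_terms, 30)
        for source in sources
    ]
    results = [None] * len(jobs)  # one pre-allocated slot per thread, as in the diff

    def worker(chunk, url, index):
        # "q" is a placeholder parameter name; the real API's parameters are not shown here.
        resp = requests.get(url, params={"q": chunk})
        results[index] = resp.json() if resp.ok else None

    threads = [
        threading.Thread(target=worker, args=(chunk, url, i))
        for i, (chunk, url) in enumerate(jobs)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results
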
5 changes: 3 additions & 2 deletions src/views.py
@@ -130,7 +130,7 @@ def word_details_api(request, slug: str):
if matched_recs:
recordings.extend(matched_recs)
print("here we go")
if paradigm is not None:
if paradigm is not None:
FST_DIR = settings.BASE_DIR / "res" / "fst"
paradigm_manager = ParadigmManager(
layout_directory=settings.LAYOUTS_DIR,
@@ -152,6 +152,7 @@
}
}


return Response(content)


@@ -247,6 +248,7 @@ def search_api(request):
return Response(context)



def make_wordnet_format(wn_class):
"""
Accepts: wn_class of format (n) bear 1
Expand Down Expand Up @@ -296,7 +298,6 @@ def wordnet_api(request, classification):

return Response(context)


def relabelInflectionalCategory(ic):
with open(Path(settings.RESOURCES_DIR / "altlabel.tsv")) as f:
labels = Relabelling.from_tsv(f)
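
The only non-whitespace context in the last views.py hunk is the make_wordnet_format docstring, which says the function accepts a wn_class like "(n) bear 1". Purely as a labeled guess at its intent (the return format is not visible in this diff), a converter to NLTK-style synset names might look like:

def make_wordnet_format(wn_class: str) -> str:
    # Speculative sketch: "(n) bear 1" -> "bear.n.01" (NLTK-style synset name).
    pos, lemma, sense = wn_class.split()   # "(n)", "bear", "1"
    pos = pos.strip("()")                  # "n"
    return f"{lemma}.{pos}.{int(sense):02d}"
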

0 comments on commit bd01aee
