Fix semantic search
M1Al3x committed Nov 1, 2023
1 parent 58663bb commit 211c6c7
Showing 8 changed files with 55 additions and 56 deletions.
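At a glance, the commit does four things: it flips the search defaults so auto-generated definitions and inflected English phrases are included unless a caller opts out; it resolves that choice per request via should_include_auto_definitions(request) instead of request.user.is_authenticated; it parallelizes the speech-db bulk-search requests in src/helpers.py across threads; and it deletes the now-unused fetch_single_recording from src/views.py.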
6 changes: 3 additions & 3 deletions src/API/search/__init__.py
@@ -6,8 +6,8 @@ def search_with_affixes(
     rw_index: str,
     rw_domain: str,
     wn_synset: str,
-    include_auto_definitions=False,
-    inflect_english_phrases=False,
+    include_auto_definitions=True,
+    inflect_english_phrases=True,
 ):
     """
     Search for wordforms matching:
@@ -28,7 +28,7 @@ def search_with_affixes(
 
 
 def simple_search(
-    query: str, include_auto_definitions=False, inflect_english_phrases=False
+    query: str, include_auto_definitions=True, inflect_english_phrases=True
 ):
     """
     Search, trying to match full wordforms or keywords within definitions.
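Both entry points now default include_auto_definitions and inflect_english_phrases to True. A caller's-eye sketch of the behaviour change, assuming src/ is on the import path; the query string is illustrative only:

from API.search import simple_search

# After this commit, a bare call includes auto-generated definitions and
# inflected English phrases by default:
results = simple_search("atim")

# Callers that relied on the old defaults must now opt out explicitly:
results = simple_search(
    "atim", include_auto_definitions=False, inflect_english_phrases=False
)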
4 changes: 2 additions & 2 deletions src/API/search/core.py
@@ -30,14 +30,14 @@ def __init__(
         rw_index: str,
         rw_domain: str,
         wn_synset: str,
-        include_auto_definitions=None,
+        include_auto_definitions=True,
     ):
         self.query = Query(query)
         self.rw_index = rw_index
         self.rw_domain = rw_domain
         self.wn_synset = wn_synset
         self.include_auto_definitions = first_non_none_value(
-            self.query.auto, include_auto_definitions, default=False
+            self.query.auto, include_auto_definitions, default=True
         )
         self._results = {}
         self._verbose_messages = []
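first_non_none_value is defined elsewhere in the repository; a minimal sketch of what such a helper presumably does, to make the resolution order explicit: an explicit auto: flag in the query overrides the include_auto_definitions argument, which in turn overrides the default, now True rather than False.

def first_non_none_value(*values, default=None):
    # Sketch of the helper used above; its real definition is not in this diff.
    # Returns the first argument that is not None, or `default` if all are None.
    for value in values:
        if value is not None:
            return value
    return default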
1 change: 0 additions & 1 deletion src/API/search/cvd_search.py
@@ -42,7 +42,6 @@ def do_cvd_search(search_run: SearchRun):
         cvd_key_to_wordform_query(similarity) for similarity, weight in closest
     ]
     similarities = [similarity for cvd_key, similarity in closest]
-
     # Get all possible wordforms in one big query. We will select more than we
     # need, then filter it down later, but this will have to do until we get
     # better homonym handling.
2 changes: 1 addition & 1 deletion src/API/search/presentation.py
@@ -360,7 +360,7 @@ def serialize_wordform(
 
 
 def serialize_definitions(
-    definitions, include_auto_definitions=False, dict_source=None
+    definitions, include_auto_definitions=True, dict_source=None
 ):
     ret = []
     for definition in definitions:
4 changes: 2 additions & 2 deletions src/API/search/runner.py
@@ -29,8 +29,8 @@ def search(
     rw_domain: str,
     wn_synset: str,
     include_affixes=True,
-    include_auto_definitions=False,
-    inflect_english_phrases=False
+    include_auto_definitions=True,
+    inflect_english_phrases=True
 ) -> SearchRun:
     """
     Perform an actual search, using the provided options.
2 changes: 1 addition & 1 deletion src/API/views.py
@@ -17,7 +17,7 @@ def click_in_text(request) -> HttpResponse:
     elif q == "":
         return HttpResponseBadRequest("query param q is an empty string")
 
-    results = simple_search(q, include_auto_definitions=False)
+    results = simple_search(q, include_auto_definitions=True)
 
     response = {"results": results}
 
43 changes: 34 additions & 9 deletions src/helpers.py
@@ -3,15 +3,15 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 from hfst_optimized_lookup import Analysis
-
+import threading
 from analysis import RichAnalysis, rich_analyze_strict
 
 """
 Helper functions for the views file.
 """
 
 from urllib.parse import ParseResult, urlencode, urlunparse
-
+import time
 import urllib
 import logging
 from typing import Optional
@@ -369,6 +369,9 @@ def should_inflect_phrases(request):
 
 
 def get_recordings_from_paradigm(paradigm, request):
+    start = time.time()
+    threads = []
+    temp = []
     if request.COOKIES.get("paradigm_audio") == "no":
         return paradigm
 
@@ -386,21 +389,36 @@ def get_recordings_from_paradigm(paradigm, request):
 
     if request.COOKIES.get("synthesized_audio_in_paradigm") == "yes":
         speech_db_eq.insert(0, "synth")
-
+    query_terms = [query_terms[0], query_terms[1], query_terms[2], query_terms[3], query_terms[4]]
+    for search_terms in divide_chunks(query_terms, 30):
+        for source in speech_db_eq:
+            temp.append(None)
+    index = 0
     for search_terms in divide_chunks(query_terms, 30):
         for source in speech_db_eq:
             url = f"https://speech-db.altlab.app/{source}/api/bulk_search"
-            matched_recordings.update(get_recordings_from_url(search_terms, url))
 
+            x = threading.Thread(target=get_recordings_from_url, args=(search_terms, url, temp, index,))
+            threads.append(x)
+            x.start()
+            index += 1
+
+    for i in range(len(threads)):
+        threads[i].join()
+
+    for item in temp:
+        matched_recordings.update(item)
+    end = time.time()
+    print(end - start)
     paradigm = paradigm.bulk_add_recordings(matched_recordings)
 
     return paradigm
 
 
-def get_recordings_from_url(search_terms, url):
+def get_recordings_from_url(search_terms, url, temp, index):
     matched_recordings = {}
     query_params = [("q", term) for term in search_terms]
+    print(url)
     response = requests.get(url + "?" + urllib.parse.urlencode(query_params))
+    print("whyyyyyy")
     if response.status_code == 200:
         recordings = response.json()
 
@@ -409,13 +427,20 @@ def get_recordings_from_url(search_terms, url):
             matched_recordings[entry] = {}
             matched_recordings[entry]["recording_url"] = recording["recording_url"]
             matched_recordings[entry]["speaker"] = recording["speaker"]
-
-    return matched_recordings
+    print("ahahahahahah")
+    print(matched_recordings)
+    temp[index] = matched_recordings
+    print("...............................................")
+    print(temp[index])
+    print("...............................................")
 
 
 def get_recordings_from_url_with_speaker_info(search_terms, url):
     query_params = [("q", term) for term in search_terms]
+    print(query_params)
+    print(url)
     response = requests.get(url + "?" + urllib.parse.urlencode(query_params))
+    print("DHJJJDJJDJDJDJDJ")
    if response.status_code == 200:
         recordings = response.json()
         return recordings["matched_recordings"]
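The rewritten get_recordings_from_paradigm pre-fills temp with one None slot per (chunk, source) request, starts a thread per request that writes its result into its own slot (so no lock is needed), joins them all, then merges the slots into matched_recordings. Note that the committed version also truncates query_terms to its first five entries and leaves the timing code and debug print calls in place. Below is a sketch of the same fan-out using concurrent.futures, which avoids the manual thread/slot/index bookkeeping; the speech-db response fields and divide_chunks mirror the diff, while the entry key is an assumption, since the loop that produces it is elided here:

import urllib.parse
from concurrent.futures import ThreadPoolExecutor

import requests


def divide_chunks(items, n):
    # Mirrors the helper used in the diff: yield successive n-sized chunks.
    for i in range(0, len(items), n):
        yield items[i : i + n]


def fetch_bulk(search_terms, url):
    # One bulk_search request; returns {entry: {"recording_url": ..., "speaker": ...}}.
    query_params = [("q", term) for term in search_terms]
    response = requests.get(url + "?" + urllib.parse.urlencode(query_params))
    matched = {}
    if response.status_code == 200:
        for recording in response.json()["matched_recordings"]:
            entry = recording["wordform"]  # assumed key; the diff elides this loop
            matched[entry] = {
                "recording_url": recording["recording_url"],
                "speaker": recording["speaker"],
            }
    return matched


def fetch_all_recordings(query_terms, sources, chunk_size=30):
    # Fan each (chunk, source) request out to the pool and merge the results;
    # pool.map preserves order, so no index arithmetic is needed.
    jobs = [
        (chunk, f"https://speech-db.altlab.app/{source}/api/bulk_search")
        for chunk in divide_chunks(list(query_terms), chunk_size)
        for source in sources
    ]
    matched_recordings = {}
    with ThreadPoolExecutor(max_workers=len(jobs) or 1) as pool:
        for result in pool.map(lambda job: fetch_bulk(*job), jobs):
            matched_recordings.update(result)
    return matched_recordings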
49 changes: 12 additions & 37 deletions src/views.py
@@ -3,7 +3,7 @@
 import json
 import logging
 from pathlib import Path
-
+import time
 from typing import Dict, Literal
 from nltk.corpus import wordnet as wn
 
@@ -119,14 +119,18 @@ def word_details_api(request, slug: str):
     )
     wordform = wordform_morphemes(wordform)
     wordform = wordform_orth(wordform)
 
     recordings = []
 
+    print("here we go")
+    print(lemma)
     for source in settings.SPEECH_DB_EQ:
         url = f"https://speech-db.altlab.app/{source}/api/bulk_search"
         matched_recs = get_recordings_from_url_with_speaker_info([lemma], url)
         if matched_recs:
             recordings.extend(matched_recs)
 
-    if paradigm is not None:
+    print("here we go")
+    if paradigm is not None:
         FST_DIR = settings.BASE_DIR / "res" / "fst"
         paradigm_manager = ParadigmManager(
             layout_directory=settings.LAYOUTS_DIR,
@@ -148,6 +152,7 @@ def word_details_api(request, slug: str):
         }
     }
 
+
     return Response(content)
 
 
@@ -187,15 +192,16 @@ def search_api(request):
     :param request:
     :return:
     """
 
+
     query_string = request.GET.get("name")
     rw_index = request.GET.get("rw_index")
     rw_domain = request.GET.get("rw_domain")
     wn_synset = request.GET.get("wn_synset")
     dict_source = get_dict_source(request)
     search_run = None
-    include_auto_definitions = request.user.is_authenticated
+    include_auto_definitions = should_include_auto_definitions(request)
     context = dict()
 
     if query_string or rw_index or rw_domain or wn_synset:
         search_run = search_with_affixes(
             query_string,
@@ -213,7 +219,6 @@
         query_string = ""
         search_results = []
         did_search = False
-
     context.update(
         word_search_form=request.data.get("name"),
         query_string=query_string,
@@ -226,11 +231,6 @@
         context["verbose_messages"] = json.dumps(
             search_run.verbose_messages, indent=2, ensure_ascii=False
         )
-
-    context["search_results"] = fetch_single_recording(
-        context["search_results"], request
-    )
-
     for result in context["search_results"]:
         result["wordform_text"] = wordform_orth_text(result["wordform_text"])
         result["lemma_wordform"]["wordform_text"] = wordform_orth_text(
@@ -246,10 +246,10 @@
         result["relabelled_fst_analysis"] = relabelFSTAnalysis(
             result["relabelled_fst_analysis"]
         )
-
     return Response(context)
 
 
+
 def make_wordnet_format(wn_class):
     """
     Accepts: wn_class of format (n) bear 1
@@ -299,31 +299,6 @@ def wordnet_api(request, classification):
 
     return Response(context)
 
 
-def fetch_single_recording(results, request):
-    query_terms = []
-    for result in results:
-        query_terms.append(result["wordform_text"])
-
-    speech_db_eq = settings.SPEECH_DB_EQ
-    matched_recordings = {}
-
-    for search_terms in divide_chunks(query_terms, 30):
-        for source in speech_db_eq:
-            url = f"https://speech-db.altlab.app/{source}/api/bulk_search"
-            matched_recordings.update(get_recordings_from_url(search_terms, url))
-
-    for result in results:
-        if result["wordform_text"] in matched_recordings:
-            result["recording"] = matched_recordings[result["wordform_text"]][
-                "recording_url"
-            ]
-        else:
-            result["recording"] = ""
-
-    return results
-
-
 def relabelInflectionalCategory(ic):
     with open(Path(settings.RESOURCES_DIR / "altlabel.tsv")) as f:
         labels = Relabelling.from_tsv(f)
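search_api now resolves include_auto_definitions through should_include_auto_definitions(request) instead of request.user.is_authenticated, and fetch_single_recording, which used to bolt a recording onto each search result, is removed outright. The new helper's body is not shown anywhere in this diff; a purely hypothetical sketch, consistent in style with the cookie-driven toggles in src/helpers.py (the cookie name is an assumption):

def should_include_auto_definitions(request):
    # Hypothetical sketch; the real helper is not part of this diff.
    # Like the paradigm_audio toggle, it would read a per-request cookie
    # rather than keying the behaviour to request.user.is_authenticated.
    return request.COOKIES.get("auto_definitions") != "no"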
