From 211c6c7f438eacbe2e1b14588fe3502e47fbe8c1 Mon Sep 17 00:00:00 2001
From: M1Al3x
Date: Wed, 1 Nov 2023 01:27:42 -0600
Subject: [PATCH] Fix semantic search

---
 src/API/search/__init__.py     |  6 ++---
 src/API/search/core.py         |  4 +--
 src/API/search/cvd_search.py   |  1 -
 src/API/search/presentation.py |  2 +-
 src/API/search/runner.py       |  4 +--
 src/API/views.py               |  2 +-
 src/helpers.py                 | 43 ++++++++++++++++++++++-------
 src/views.py                   | 49 +++++++++-------------------------
 8 files changed, 55 insertions(+), 56 deletions(-)

diff --git a/src/API/search/__init__.py b/src/API/search/__init__.py
index 11e0b84..e6ac15b 100644
--- a/src/API/search/__init__.py
+++ b/src/API/search/__init__.py
@@ -6,8 +6,8 @@ def search_with_affixes(
     rw_index: str,
     rw_domain: str,
     wn_synset: str,
-    include_auto_definitions=False,
-    inflect_english_phrases=False,
+    include_auto_definitions=True,
+    inflect_english_phrases=True,
 ):
     """
     Search for wordforms matching:
@@ -28,7 +28,7 @@
 
 
 def simple_search(
-    query: str, include_auto_definitions=False, inflect_english_phrases=False
+    query: str, include_auto_definitions=True, inflect_english_phrases=True
 ):
     """
     Search, trying to match full wordforms or keywords within definitions.
diff --git a/src/API/search/core.py b/src/API/search/core.py
index bf67d32..51db522 100644
--- a/src/API/search/core.py
+++ b/src/API/search/core.py
@@ -30,14 +30,14 @@ def __init__(
         rw_index: str,
         rw_domain: str,
         wn_synset: str,
-        include_auto_definitions=None,
+        include_auto_definitions=True,
     ):
         self.query = Query(query)
         self.rw_index = rw_index
         self.rw_domain = rw_domain
         self.wn_synset = wn_synset
         self.include_auto_definitions = first_non_none_value(
-            self.query.auto, include_auto_definitions, default=False
+            self.query.auto, include_auto_definitions, default=True
         )
         self._results = {}
         self._verbose_messages = []
diff --git a/src/API/search/cvd_search.py b/src/API/search/cvd_search.py
index 69454b9..ed6af1b 100644
--- a/src/API/search/cvd_search.py
+++ b/src/API/search/cvd_search.py
@@ -42,7 +42,6 @@ def do_cvd_search(search_run: SearchRun):
         cvd_key_to_wordform_query(similarity) for similarity, weight in closest
     ]
     similarities = [similarity for cvd_key, similarity in closest]
-
     # Get all possible wordforms in one big query. We will select more than we
     # need, then filter it down later, but this will have to do until we get
     # better homonym handling.
diff --git a/src/API/search/presentation.py b/src/API/search/presentation.py
index d874f86..afca071 100644
--- a/src/API/search/presentation.py
+++ b/src/API/search/presentation.py
@@ -360,7 +360,7 @@ def serialize_wordform(
 
 
 def serialize_definitions(
-    definitions, include_auto_definitions=False, dict_source=None
+    definitions, include_auto_definitions=True, dict_source=None
 ):
     ret = []
     for definition in definitions:
diff --git a/src/API/search/runner.py b/src/API/search/runner.py
index 5b0daaf..8aa7c2c 100644
--- a/src/API/search/runner.py
+++ b/src/API/search/runner.py
@@ -29,8 +29,8 @@ def search(
     rw_domain: str,
     wn_synset: str,
     include_affixes=True,
-    include_auto_definitions=False,
-    inflect_english_phrases=False
+    include_auto_definitions=True,
+    inflect_english_phrases=True
 ) -> SearchRun:
     """
     Perform an actual search, using the provided options.
diff --git a/src/API/views.py b/src/API/views.py
index 681bf5a..e3fd4c6 100644
--- a/src/API/views.py
+++ b/src/API/views.py
@@ -17,7 +17,7 @@ def click_in_text(request) -> HttpResponse:
     elif q == "":
        return HttpResponseBadRequest("query param q is an empty string")
 
-    results = simple_search(q, include_auto_definitions=False)
+    results = simple_search(q, include_auto_definitions=True)
 
     response = {"results": results}
 
diff --git a/src/helpers.py b/src/helpers.py
index 1f1f8fc..5103e34 100644
--- a/src/helpers.py
+++ b/src/helpers.py
@@ -3,7 +3,7 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 from hfst_optimized_lookup import Analysis
-
+import threading
 from analysis import RichAnalysis, rich_analyze_strict
 
 """
@@ -369,6 +369,9 @@ def should_inflect_phrases(request):
 
 
 def get_recordings_from_paradigm(paradigm, request):
+    threads = []
+    temp = []
     if request.COOKIES.get("paradigm_audio") == "no":
         return paradigm
 
@@ -386,21 +389,36 @@
     if request.COOKIES.get("synthesized_audio_in_paradigm") == "yes":
         speech_db_eq.insert(0, "synth")
 
+    # Pre-allocate one result slot per (chunk, source) pair so each worker
+    # thread can write its results without a lock.
+    for search_terms in divide_chunks(query_terms, 30):
+        for source in speech_db_eq:
+            temp.append(None)
+
+    index = 0
     for search_terms in divide_chunks(query_terms, 30):
         for source in speech_db_eq:
             url = f"https://speech-db.altlab.app/{source}/api/bulk_search"
-            matched_recordings.update(get_recordings_from_url(search_terms, url))
-
+            thread = threading.Thread(
+                target=get_recordings_from_url,
+                args=(search_terms, url, temp, index),
+            )
+            threads.append(thread)
+            thread.start()
+            index += 1
+
+    # Wait for every request to finish, then merge the per-thread results.
+    for thread in threads:
+        thread.join()
+
+    for item in temp:
+        if item:
+            matched_recordings.update(item)
+
     paradigm = paradigm.bulk_add_recordings(matched_recordings)
-
     return paradigm
 
 
-def get_recordings_from_url(search_terms, url):
+def get_recordings_from_url(search_terms, url, temp, index):
+    # Runs in a worker thread: results are written into temp[index]
+    # instead of being returned.
     matched_recordings = {}
     query_params = [("q", term) for term in search_terms]
     response = requests.get(url + "?" + urllib.parse.urlencode(query_params))
 
     if response.status_code == 200:
         recordings = response.json()
@@ -409,13 +427,20 @@
             matched_recordings[entry] = {}
             matched_recordings[entry]["recording_url"] = recording["recording_url"]
             matched_recordings[entry]["speaker"] = recording["speaker"]
 
-    return matched_recordings
+    # Fill the slot even when the request failed, so the caller can merge
+    # results without special-casing missing entries.
+    temp[index] = matched_recordings
 
 
 def get_recordings_from_url_with_speaker_info(search_terms, url):
     query_params = [("q", term) for term in search_terms]
     response = requests.get(url + "?"
                            + urllib.parse.urlencode(query_params))
 
     if response.status_code == 200:
         recordings = response.json()
         return recordings["matched_recordings"]
diff --git a/src/views.py b/src/views.py
index 373dfac..5d165f3 100644
--- a/src/views.py
+++ b/src/views.py
@@ -119,14 +119,18 @@ def word_details_api(request, slug: str):
     )
     wordform = wordform_morphemes(wordform)
     wordform = wordform_orth(wordform)
+    recordings = []
+
     for source in settings.SPEECH_DB_EQ:
         url = f"https://speech-db.altlab.app/{source}/api/bulk_search"
         matched_recs = get_recordings_from_url_with_speaker_info([lemma], url)
         if matched_recs:
             recordings.extend(matched_recs)
 
     if paradigm is not None:
         FST_DIR = settings.BASE_DIR / "res" / "fst"
         paradigm_manager = ParadigmManager(
             layout_directory=settings.LAYOUTS_DIR,
@@ -187,15 +192,16 @@
     :param request:
     :return:
     """
     query_string = request.GET.get("name")
     rw_index = request.GET.get("rw_index")
     rw_domain = request.GET.get("rw_domain")
     wn_synset = request.GET.get("wn_synset")
     dict_source = get_dict_source(request)
     search_run = None
-    include_auto_definitions = request.user.is_authenticated
+    include_auto_definitions = should_include_auto_definitions(request)
     context = dict()
 
     if query_string or rw_index or rw_domain or wn_synset:
         search_run = search_with_affixes(
             query_string,
@@ -226,11 +231,6 @@
         context["verbose_messages"] = json.dumps(
             search_run.verbose_messages, indent=2, ensure_ascii=False
         )
-
-    context["search_results"] = fetch_single_recording(
-        context["search_results"], request
-    )
-
     for result in context["search_results"]:
         result["wordform_text"] = wordform_orth_text(result["wordform_text"])
         result["lemma_wordform"]["wordform_text"] = wordform_orth_text(
@@ -299,31 +299,6 @@
     return Response(context)
 
-
-def fetch_single_recording(results, request):
-    query_terms = []
-    for result in results:
-        query_terms.append(result["wordform_text"])
-
-    speech_db_eq = settings.SPEECH_DB_EQ
-    matched_recordings = {}
-
-    for search_terms in divide_chunks(query_terms, 30):
-        for source in speech_db_eq:
-            url = f"https://speech-db.altlab.app/{source}/api/bulk_search"
-            matched_recordings.update(get_recordings_from_url(search_terms, url))
-
-    for result in results:
-        if result["wordform_text"] in matched_recordings:
-            result["recording"] = matched_recordings[result["wordform_text"]][
-                "recording_url"
-            ]
-        else:
-            result["recording"] = ""
-
-    return results
-
-
 def relabelInflectionalCategory(ic):
     with open(Path(settings.RESOURCES_DIR / "altlabel.tsv")) as f:
         labels = Relabelling.from_tsv(f)
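
Note (reviewer sketch, not part of the patch): the hand-rolled thread and
slot bookkeeping added to get_recordings_from_paradigm could also be written
with concurrent.futures, which handles joining and result collection. The
sketch below assumes get_recordings_from_url keeps its original returning
signature, (search_terms, url) -> dict; fetch_recordings and max_workers=8
are hypothetical names/values, while divide_chunks, the chunk size of 30,
and the bulk_search URLs are the ones used in this patch.

    from concurrent.futures import ThreadPoolExecutor, as_completed

    from helpers import divide_chunks, get_recordings_from_url  # existing helpers


    def fetch_recordings(query_terms, speech_db_eq):
        """Fan out one bulk_search request per (chunk, source) pair and merge results."""
        jobs = [
            (chunk, f"https://speech-db.altlab.app/{source}/api/bulk_search")
            for chunk in divide_chunks(query_terms, 30)
            for source in speech_db_eq
        ]
        matched_recordings = {}
        with ThreadPoolExecutor(max_workers=8) as pool:
            futures = [
                pool.submit(get_recordings_from_url, chunk, url)
                for chunk, url in jobs
            ]
            for future in as_completed(futures):
                matched_recordings.update(future.result())
        return matched_recordings

This avoids the pre-allocated temp list entirely: each future returns its own
dict, and a non-200 response simply contributes an empty one.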