Fix semantic search
M1Al3x committed Nov 1, 2023
1 parent 58663bb commit 211c6c7
Showing 8 changed files with 55 additions and 56 deletions.
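At a glance, the commit does four things: it flips the search defaults so auto-generated definitions and inflected English phrases are included unless a caller opts out; it resolves that choice per request via should_include_auto_definitions(request) instead of request.user.is_authenticated; it parallelizes the speech-db bulk-search requests in src/helpers.py across threads; and it deletes the now-unused fetch_single_recording from src/views.py.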
6 changes: 3 additions & 3 deletions src/API/search/__init__.py
@@ -6,8 +6,8 @@ def search_with_affixes(
     rw_index: str,
     rw_domain: str,
     wn_synset: str,
-    include_auto_definitions=False,
-    inflect_english_phrases=False,
+    include_auto_definitions=True,
+    inflect_english_phrases=True,
 ):
     """
     Search for wordforms matching:
@@ -28,7 +28,7 @@ def search_with_affixes(
 
 
 def simple_search(
-    query: str, include_auto_definitions=False, inflect_english_phrases=False
+    query: str, include_auto_definitions=True, inflect_english_phrases=True
 ):
     """
     Search, trying to match full wordforms or keywords within definitions.
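Both entry points now default include_auto_definitions and inflect_english_phrases to True. A caller's-eye sketch of the behaviour change, assuming src/ is on the import path; the query string is illustrative only:

from API.search import simple_search

# After this commit, a bare call includes auto-generated definitions and
# inflected English phrases by default:
results = simple_search("atim")

# Callers that relied on the old defaults must now opt out explicitly:
results = simple_search(
    "atim", include_auto_definitions=False, inflect_english_phrases=False
)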
4 changes: 2 additions & 2 deletions src/API/search/core.py
@@ -30,14 +30,14 @@ def __init__(
         rw_index: str,
         rw_domain: str,
         wn_synset: str,
-        include_auto_definitions=None,
+        include_auto_definitions=True,
     ):
         self.query = Query(query)
         self.rw_index = rw_index
         self.rw_domain = rw_domain
         self.wn_synset = wn_synset
         self.include_auto_definitions = first_non_none_value(
-            self.query.auto, include_auto_definitions, default=False
+            self.query.auto, include_auto_definitions, default=True
         )
         self._results = {}
         self._verbose_messages = []
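first_non_none_value is defined elsewhere in the repository; a minimal sketch of what such a helper presumably does, to make the resolution order explicit: an explicit auto: flag in the query overrides the include_auto_definitions argument, which in turn overrides the default, now True rather than False.

def first_non_none_value(*values, default=None):
    # Sketch of the helper used above; its real definition is not in this diff.
    # Returns the first argument that is not None, or `default` if all are None.
    for value in values:
        if value is not None:
            return value
    return default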
1 change: 0 additions & 1 deletion src/API/search/cvd_search.py
@@ -42,7 +42,6 @@ def do_cvd_search(search_run: SearchRun):
         cvd_key_to_wordform_query(similarity) for similarity, weight in closest
     ]
     similarities = [similarity for cvd_key, similarity in closest]
-
     # Get all possible wordforms in one big query. We will select more than we
     # need, then filter it down later, but this will have to do until we get
     # better homonym handling.
2 changes: 1 addition & 1 deletion src/API/search/presentation.py
@@ -360,7 +360,7 @@ def serialize_wordform(
 
 
 def serialize_definitions(
-    definitions, include_auto_definitions=False, dict_source=None
+    definitions, include_auto_definitions=True, dict_source=None
 ):
     ret = []
     for definition in definitions:
4 changes: 2 additions & 2 deletions src/API/search/runner.py
@@ -29,8 +29,8 @@ def search(
     rw_domain: str,
     wn_synset: str,
     include_affixes=True,
-    include_auto_definitions=False,
-    inflect_english_phrases=False
+    include_auto_definitions=True,
+    inflect_english_phrases=True
 ) -> SearchRun:
     """
     Perform an actual search, using the provided options.
2 changes: 1 addition & 1 deletion src/API/views.py
@@ -17,7 +17,7 @@ def click_in_text(request) -> HttpResponse:
     elif q == "":
         return HttpResponseBadRequest("query param q is an empty string")
 
-    results = simple_search(q, include_auto_definitions=False)
+    results = simple_search(q, include_auto_definitions=True)
 
     response = {"results": results}
 
43 changes: 34 additions & 9 deletions src/helpers.py
@@ -3,15 +3,15 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 from hfst_optimized_lookup import Analysis
-
+import threading
 from analysis import RichAnalysis, rich_analyze_strict
 
 """
 Helper functions for the views file.
 """
 
 from urllib.parse import ParseResult, urlencode, urlunparse
-
+import time
 import urllib
 import logging
 from typing import Optional
@@ -369,6 +369,9 @@ def should_inflect_phrases(request):
 
 
 def get_recordings_from_paradigm(paradigm, request):
+    start = time.time()
+    threads = []
+    temp = []
     if request.COOKIES.get("paradigm_audio") == "no":
         return paradigm
 
@@ -386,21 +389,36 @@ def get_recordings_from_paradigm(paradigm, request):
 
     if request.COOKIES.get("synthesized_audio_in_paradigm") == "yes":
         speech_db_eq.insert(0, "synth")
-
+    query_terms = [query_terms[0], query_terms[1], query_terms[2], query_terms[3], query_terms[4]]
+    for search_terms in divide_chunks(query_terms, 30):
+        for source in speech_db_eq:
+            temp.append(None)
+    index = 0
     for search_terms in divide_chunks(query_terms, 30):
         for source in speech_db_eq:
             url = f"https://speech-db.altlab.app/{source}/api/bulk_search"
-            matched_recordings.update(get_recordings_from_url(search_terms, url))
 
+            x = threading.Thread(target=get_recordings_from_url, args=(search_terms, url, temp, index,))
+            threads.append(x)
+            x.start()
+            index += 1
+
+    for i in range(len(threads)):
+        threads[i].join()
+
+    for item in temp:
+        matched_recordings.update(item)
+    end = time.time()
+    print(end - start)
     paradigm = paradigm.bulk_add_recordings(matched_recordings)
 
     return paradigm
 
 
-def get_recordings_from_url(search_terms, url):
+def get_recordings_from_url(search_terms, url, temp, index):
     matched_recordings = {}
     query_params = [("q", term) for term in search_terms]
+    print(url)
     response = requests.get(url + "?" + urllib.parse.urlencode(query_params))
+    print("whyyyyyy")
     if response.status_code == 200:
         recordings = response.json()
 
@@ -409,13 +427,20 @@ def get_recordings_from_url(search_terms, url):
             matched_recordings[entry] = {}
             matched_recordings[entry]["recording_url"] = recording["recording_url"]
             matched_recordings[entry]["speaker"] = recording["speaker"]
-
-    return matched_recordings
+    print("ahahahahahah")
+    print(matched_recordings)
+    temp[index] = matched_recordings
+    print("...............................................")
+    print(temp[index])
+    print("...............................................")
 
 
 def get_recordings_from_url_with_speaker_info(search_terms, url):
     query_params = [("q", term) for term in search_terms]
+    print(query_params)
+    print(url)
     response = requests.get(url + "?" + urllib.parse.urlencode(query_params))
+    print("DHJJJDJJDJDJDJDJ")
    if response.status_code == 200:
         recordings = response.json()
         return recordings["matched_recordings"]
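The rewritten get_recordings_from_paradigm pre-fills temp with one None slot per (chunk, source) request, starts a thread per request that writes its result into its own slot (so no lock is needed), joins them all, then merges the slots into matched_recordings. Note that the committed version also truncates query_terms to its first five entries and leaves the timing code and debug print calls in place. Below is a sketch of the same fan-out using concurrent.futures, which avoids the manual thread/slot/index bookkeeping; the speech-db response fields and divide_chunks mirror the diff, while the entry key is an assumption, since the loop that produces it is elided here:

import urllib.parse
from concurrent.futures import ThreadPoolExecutor

import requests


def divide_chunks(items, n):
    # Mirrors the helper used in the diff: yield successive n-sized chunks.
    for i in range(0, len(items), n):
        yield items[i : i + n]


def fetch_bulk(search_terms, url):
    # One bulk_search request; returns {entry: {"recording_url": ..., "speaker": ...}}.
    query_params = [("q", term) for term in search_terms]
    response = requests.get(url + "?" + urllib.parse.urlencode(query_params))
    matched = {}
    if response.status_code == 200:
        for recording in response.json()["matched_recordings"]:
            entry = recording["wordform"]  # assumed key; the diff elides this loop
            matched[entry] = {
                "recording_url": recording["recording_url"],
                "speaker": recording["speaker"],
            }
    return matched


def fetch_all_recordings(query_terms, sources, chunk_size=30):
    # Fan each (chunk, source) request out to the pool and merge the results;
    # pool.map preserves order, so no index arithmetic is needed.
    jobs = [
        (chunk, f"https://speech-db.altlab.app/{source}/api/bulk_search")
        for chunk in divide_chunks(list(query_terms), chunk_size)
        for source in sources
    ]
    matched_recordings = {}
    with ThreadPoolExecutor(max_workers=len(jobs) or 1) as pool:
        for result in pool.map(lambda job: fetch_bulk(*job), jobs):
            matched_recordings.update(result)
    return matched_recordings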
49 changes: 12 additions & 37 deletions src/views.py
@@ -3,7 +3,7 @@
 import json
 import logging
 from pathlib import Path
-
+import time
 from typing import Dict, Literal
 from nltk.corpus import wordnet as wn
 
@@ -119,14 +119,18 @@ def word_details_api(request, slug: str):
     )
     wordform = wordform_morphemes(wordform)
     wordform = wordform_orth(wordform)
 
     recordings = []
 
+    print("here we go")
+    print(lemma)
     for source in settings.SPEECH_DB_EQ:
         url = f"https://speech-db.altlab.app/{source}/api/bulk_search"
         matched_recs = get_recordings_from_url_with_speaker_info([lemma], url)
         if matched_recs:
             recordings.extend(matched_recs)
 
-    if paradigm is not None:
+    print("here we go")
+    if paradigm is not None:
         FST_DIR = settings.BASE_DIR / "res" / "fst"
         paradigm_manager = ParadigmManager(
             layout_directory=settings.LAYOUTS_DIR,
@@ -148,6 +152,7 @@ def word_details_api(request, slug: str):
         }
     }
 
+
     return Response(content)
 
 
@@ -187,15 +192,16 @@ def search_api(request):
     :param request:
     :return:
     """
 
+
     query_string = request.GET.get("name")
     rw_index = request.GET.get("rw_index")
     rw_domain = request.GET.get("rw_domain")
     wn_synset = request.GET.get("wn_synset")
     dict_source = get_dict_source(request)
     search_run = None
-    include_auto_definitions = request.user.is_authenticated
+    include_auto_definitions = should_include_auto_definitions(request)
     context = dict()
 
     if query_string or rw_index or rw_domain or wn_synset:
         search_run = search_with_affixes(
             query_string,
@@ -213,7 +219,6 @@
         query_string = ""
         search_results = []
         did_search = False
-
     context.update(
         word_search_form=request.data.get("name"),
         query_string=query_string,
@@ -226,11 +231,6 @@
         context["verbose_messages"] = json.dumps(
             search_run.verbose_messages, indent=2, ensure_ascii=False
         )
-
-    context["search_results"] = fetch_single_recording(
-        context["search_results"], request
-    )
-
     for result in context["search_results"]:
         result["wordform_text"] = wordform_orth_text(result["wordform_text"])
         result["lemma_wordform"]["wordform_text"] = wordform_orth_text(
@@ -246,10 +246,10 @@
         result["relabelled_fst_analysis"] = relabelFSTAnalysis(
             result["relabelled_fst_analysis"]
         )
-
     return Response(context)
 
 
+
 def make_wordnet_format(wn_class):
     """
     Accepts: wn_class of format (n) bear 1
@@ -299,31 +299,6 @@ def wordnet_api(request, classification):
 
     return Response(context)
 
 
-def fetch_single_recording(results, request):
-    query_terms = []
-    for result in results:
-        query_terms.append(result["wordform_text"])
-
-    speech_db_eq = settings.SPEECH_DB_EQ
-    matched_recordings = {}
-
-    for search_terms in divide_chunks(query_terms, 30):
-        for source in speech_db_eq:
-            url = f"https://speech-db.altlab.app/{source}/api/bulk_search"
-            matched_recordings.update(get_recordings_from_url(search_terms, url))
-
-    for result in results:
-        if result["wordform_text"] in matched_recordings:
-            result["recording"] = matched_recordings[result["wordform_text"]][
-                "recording_url"
-            ]
-        else:
-            result["recording"] = ""
-
-    return results
-
-
 def relabelInflectionalCategory(ic):
     with open(Path(settings.RESOURCES_DIR / "altlabel.tsv")) as f:
         labels = Relabelling.from_tsv(f)
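search_api now resolves include_auto_definitions through should_include_auto_definitions(request) instead of request.user.is_authenticated, and fetch_single_recording, which used to bolt a recording onto each search result, is removed outright. The new helper's body is not shown anywhere in this diff; a purely hypothetical sketch, consistent in style with the cookie-driven toggles in src/helpers.py (the cookie name is an assumption):

def should_include_auto_definitions(request):
    # Hypothetical sketch; the real helper is not part of this diff.
    # Like the paradigm_audio toggle, it would read a per-request cookie
    # rather than keying the behaviour to request.user.is_authenticated.
    return request.COOKIES.get("auto_definitions") != "no"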
