From 6320482fa0ceaeffdf9c883e5d27916c687b13ff Mon Sep 17 00:00:00 2001 From: Matti Lamppu Date: Thu, 29 Aug 2024 12:31:42 +0300 Subject: [PATCH] Refactor text search functions Simplify text search to a single function --- .../types/allocated_time_slot/filtersets.py | 18 ++---- api/graphql/types/application/filtersets.py | 11 ++-- .../types/application_section/filtersets.py | 11 ++-- .../types/rejected_occurrence/filtersets.py | 9 ++- api/graphql/types/reservation/filtersets.py | 16 ++---- common/db.py | 56 +++++++++++++++---- common/utils.py | 8 +++ .../test_application/test_filtering.py | 6 +- 8 files changed, 78 insertions(+), 57 deletions(-) diff --git a/api/graphql/types/allocated_time_slot/filtersets.py b/api/graphql/types/allocated_time_slot/filtersets.py index 75afef523..f6f36eec0 100644 --- a/api/graphql/types/allocated_time_slot/filtersets.py +++ b/api/graphql/types/allocated_time_slot/filtersets.py @@ -1,5 +1,4 @@ import django_filters -from django.contrib.postgres.search import SearchVector from django.db import models from graphene_django_extensions import ModelFilterSet from graphene_django_extensions.filters import EnumMultipleChoiceFilter, IntChoiceFilter, IntMultipleChoiceFilter @@ -8,7 +7,7 @@ from applications.enums import ApplicantTypeChoice, ApplicationSectionStatusChoice, Weekday from applications.models import AllocatedTimeSlot from applications.querysets.allocated_time_slot import AllocatedTimeSlotQuerySet -from common.db import raw_prefixed_query +from common.db import text_search __all__ = [ "AllocatedTimeSlotFilterSet", @@ -58,22 +57,15 @@ class Meta: def filter_by_section_status(qs: AllocatedTimeSlotQuerySet, name: str, value: list[str]) -> models.QuerySet: return qs.has_section_status_in(value) - @staticmethod - def filter_text_search(qs: AllocatedTimeSlotQuerySet, name: str, value: str) -> models.QuerySet: - # If this becomes slow, look into optimisation strategies here: - # https://docs.djangoproject.com/en/4.2/ref/contrib/postgres/search/#performance - vector = SearchVector( + def filter_text_search(self, qs: AllocatedTimeSlotQuerySet, name: str, value: str) -> models.QuerySet: + fields = ( "reservation_unit_option__application_section__id", "reservation_unit_option__application_section__name", "reservation_unit_option__application_section__application__id", "applicant", ) - query = raw_prefixed_query(value) - return ( - qs.alias(applicant=L("reservation_unit_option__application_section__application__applicant")) - .annotate(search=vector) - .filter(search=query) - ) + qs = qs.alias(applicant=L("reservation_unit_option__application_section__application__applicant")) + return text_search(qs=qs, fields=fields, text=value) @staticmethod def order_by_allocated_time_of_week(qs: AllocatedTimeSlotQuerySet, desc: bool) -> models.QuerySet: diff --git a/api/graphql/types/application/filtersets.py b/api/graphql/types/application/filtersets.py index 42c4bd2ac..da6b8e13a 100644 --- a/api/graphql/types/application/filtersets.py +++ b/api/graphql/types/application/filtersets.py @@ -1,5 +1,4 @@ import django_filters -from django.contrib.postgres.search import SearchVector from django.db import models from django.db.models import QuerySet from graphene_django_extensions import ModelFilterSet @@ -9,7 +8,7 @@ from applications.enums import ApplicantTypeChoice, ApplicationStatusChoice from applications.models import Application from applications.querysets.application import ApplicationQuerySet -from common.db import raw_prefixed_query +from common.db import text_search __all__ = [ "ApplicationFilterSet", @@ -39,11 +38,9 @@ class Meta: @staticmethod def filter_by_text_search(qs: ApplicationQuerySet, name: str, value: str) -> models.QuerySet: - # If this becomes slow, look into optimisation strategies here: - # https://docs.djangoproject.com/en/4.2/ref/contrib/postgres/search/#performance - vector = SearchVector("id", "application_sections__id", "application_sections__name", "applicant") - query = raw_prefixed_query(value) - return qs.alias(applicant=L("applicant")).annotate(search=vector).filter(search=query) + fields = ("id", "application_sections__id", "application_sections__name", "applicant") + qs = qs.alias(applicant=L("applicant")) + return text_search(qs=qs, fields=fields, text=value) @staticmethod def filter_by_status(qs: ApplicationQuerySet, name: str, value: list[str]) -> QuerySet: diff --git a/api/graphql/types/application_section/filtersets.py b/api/graphql/types/application_section/filtersets.py index 8af71f6b1..9eccee630 100644 --- a/api/graphql/types/application_section/filtersets.py +++ b/api/graphql/types/application_section/filtersets.py @@ -1,7 +1,6 @@ from typing import Any import django_filters -from django.contrib.postgres.search import SearchVector from django.db import models from django.db.models import QuerySet from django.db.models.functions import Lower @@ -16,7 +15,7 @@ from applications.enums import ApplicantTypeChoice, ApplicationSectionStatusChoice, ApplicationStatusChoice, Priority from applications.models import ApplicationSection from applications.querysets.application_section import ApplicationSectionQuerySet -from common.db import raw_prefixed_query +from common.db import text_search __all__ = [ "ApplicationSectionFilterSet", @@ -87,11 +86,9 @@ def filter_include_preferred_order(qs: ApplicationSectionQuerySet, name: str, va @staticmethod def filter_text_search(qs: ApplicationSectionQuerySet, name: str, value: str) -> QuerySet: - # If this becomes slow, look into optimisation strategies here: - # https://docs.djangoproject.com/en/4.2/ref/contrib/postgres/search/#performance - vector = SearchVector("application__id", "id", "name", "applicant") - query = raw_prefixed_query(value) - return qs.alias(applicant=L("application__applicant")).annotate(search=vector).filter(search=query) + fields = ("application__id", "id", "name", "applicant") + qs = qs.alias(applicant=L("application__applicant")) + return text_search(qs=qs, fields=fields, text=value) def filter_has_allocations(self, queryset: ApplicationSectionQuerySet, name: str, value: bool) -> QuerySet: if value: diff --git a/api/graphql/types/rejected_occurrence/filtersets.py b/api/graphql/types/rejected_occurrence/filtersets.py index 3d4e3dcf1..536160f4e 100644 --- a/api/graphql/types/rejected_occurrence/filtersets.py +++ b/api/graphql/types/rejected_occurrence/filtersets.py @@ -1,11 +1,10 @@ import django_filters -from django.contrib.postgres.search import SearchVector from django.db import models from graphene_django_extensions import ModelFilterSet from graphene_django_extensions.filters import IntChoiceFilter, IntMultipleChoiceFilter from lookup_property import L -from common.db import raw_prefixed_query +from common.db import text_search from reservations.models import RejectedOccurrence from reservations.querysets import RejectedOccurrenceQuerySet @@ -79,13 +78,12 @@ class Meta: def filter_text_search(qs: RejectedOccurrenceQuerySet, name: str, value: str) -> models.QuerySet: # If this becomes slow, look into optimisation strategies here: # https://docs.djangoproject.com/en/4.2/ref/contrib/postgres/search/#performance - vector = SearchVector( + fields = ( "recurring_reservation__allocated_time_slot__reservation_unit_option__application_section__id", "recurring_reservation__allocated_time_slot__reservation_unit_option__application_section__name", "recurring_reservation__allocated_time_slot__reservation_unit_option__application_section__application__id", "applicant", ) - query = raw_prefixed_query(value) applicant_ref = ( "recurring_reservation" "__allocated_time_slot" @@ -94,7 +92,8 @@ def filter_text_search(qs: RejectedOccurrenceQuerySet, name: str, value: str) -> "__application" "__applicant" ) - return qs.alias(applicant=L(applicant_ref)).annotate(search=vector).filter(search=query) + qs = qs.alias(applicant=L(applicant_ref)) + return text_search(qs=qs, fields=fields, text=value) @staticmethod def order_by_applicant(qs: RejectedOccurrenceQuerySet, desc: bool) -> models.QuerySet: diff --git a/api/graphql/types/reservation/filtersets.py b/api/graphql/types/reservation/filtersets.py index c46767085..33fbebbfc 100644 --- a/api/graphql/types/reservation/filtersets.py +++ b/api/graphql/types/reservation/filtersets.py @@ -2,7 +2,6 @@ from typing import TYPE_CHECKING import django_filters -from django.contrib.postgres.search import SearchRank, SearchVector from django.db import models from django.db.models import Case, CharField, F, Q, QuerySet, Value, When from django.db.models.functions import Concat @@ -10,7 +9,7 @@ from graphene_django_extensions.filters import EnumMultipleChoiceFilter, IntMultipleChoiceFilter from api.graphql.extensions.filters import TimezoneAwareDateFilter -from common.db import raw_prefixed_query +from common.db import text_search from merchants.enums import OrderStatusWithFree from permissions.helpers import has_general_permission from permissions.models import GeneralPermissionChoices, UnitPermissionChoices @@ -148,8 +147,7 @@ def filter_by_reservation_unit_name(qs: QuerySet, name: str, value: str) -> Quer return qs.filter(q).distinct() - @staticmethod - def filter_by_text_search(qs: QuerySet, name: str, value: str) -> QuerySet: + def filter_by_text_search(self, qs: QuerySet, name: str, value: str) -> QuerySet: value = value.strip() if not value: return qs @@ -159,7 +157,7 @@ def filter_by_text_search(qs: QuerySet, name: str, value: str) -> QuerySet: return qs.filter(Q(user__email__icontains=value) | Q(reservee_email__icontains=value)) if len(value) >= 3: - vector = SearchVector( + fields = ( "pk", "name", "reservee_id", @@ -172,13 +170,7 @@ def filter_by_text_search(qs: QuerySet, name: str, value: str) -> QuerySet: "user__last_name", "recurring_reservation__name", ) - query = raw_prefixed_query(value) - text_search_rank = SearchRank(vector, query) - return ( - qs.annotate(search=vector, text_search_rank=text_search_rank) - .filter(search=query) - .order_by("-text_search_rank") # most relevant first - ) + return text_search(qs=qs, fields=fields, text=value) if value.isnumeric(): return qs.filter(pk=int(value)) diff --git a/common/db.py b/common/db.py index cc52a3ef6..9efb273f1 100644 --- a/common/db.py +++ b/common/db.py @@ -1,7 +1,8 @@ -from typing import Any +from collections.abc import Iterable +from typing import Any, Literal, TypeVar from django.contrib.postgres.fields import ArrayField -from django.contrib.postgres.search import SearchQuery +from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector from django.db import models __all__ = [ @@ -10,7 +11,7 @@ "SubqueryArray", "SubqueryCount", "SubquerySum", - "raw_prefixed_query", + "text_search", ] @@ -161,13 +162,48 @@ class ArrayUnnest(models.Func): arity = 1 -def raw_prefixed_query(text: str) -> SearchQuery: +TQuerySet = TypeVar("TQuerySet") + + +def text_search( + qs: TQuerySet, + fields: Iterable[str], + text: str, + language: Literal["finnish", "english", "swedish"] = "finnish", +) -> TQuerySet: """ - Create a query that searches for each word separately and matched partial words if match is a prefix. - Remove any whitespace and replace any single quote mark with two quote marks. + Query with postgres full text search. https://www.postgresql.org/docs/current/datatype-textsearch.html#DATATYPE-TSQUERY + + :param qs: QuerySet to filter. + :param fields: Fields to search. + :param text: Text to search for. + :param language: Language to search in. + """ + # If this becomes slow, look into optimisation strategies here: + # https://docs.djangoproject.com/en/5.1/ref/contrib/postgres/search/#performance + vector = SearchVector(*fields, config=language) + + search = build_search(text) + query = SearchQuery(value=search, config=language, search_type="raw") + rank = SearchRank(vector, query) + return qs.annotate(ts_vector=vector, ts_rank=rank).filter(ts_vector=query) + + +def build_search(text: str) -> str: + """ + Build raw postgres full text search query from text. + + Replace single quotes with two single quotes so that they are treated as whitespace in the search. + Quote search terms, and support prefix matching. + Match all search terms with the OR operator. + + Ref. https://www.postgresql.org/docs/current/datatype-textsearch.html#DATATYPE-TSQUERY """ - return SearchQuery( - " | ".join(f"""'{value.replace("'", "''")}':*""" for value in text.split(" ") if value != ""), - search_type="raw", - ) + search_terms: list[str] = [] + for value in text.split(" "): + if value: + value = value.replace("'", "''") + value = f"'{value}':*" + search_terms.append(value) + return " | ".join(search_terms) diff --git a/common/utils.py b/common/utils.py index cae6d4800..658c22c9f 100644 --- a/common/utils.py +++ b/common/utils.py @@ -4,8 +4,10 @@ from django.conf import settings from django.db import models +from django.http import HttpRequest from django.utils import translation from django.utils.functional import Promise +from django.utils.translation import get_language_from_request from modeltranslation.manager import get_translatable_fields_for_model from users.models import User @@ -14,6 +16,7 @@ "comma_sep_str", "get_attr_by_language", "get_field_to_related_field_mapping", + "get_text_search_language", "get_translation_fields", "with_indices", ] @@ -148,6 +151,11 @@ def translate_for_user(text: Promise, user: User) -> str: return str(text) +def get_text_search_language(request: HttpRequest) -> Literal["finnish", "english", "swedish"]: + lang_code: Literal["fi", "en", "sv"] = get_language_from_request(request) + return "swedish" if lang_code == "sv" else "english" if lang_code == "en" else "finnish" + + def safe_getattr(obj: object, dotted_path: str, default: Any = None) -> Any: """ Examples: diff --git a/tests/test_graphql_api/test_application/test_filtering.py b/tests/test_graphql_api/test_application/test_filtering.py index e0635c640..2ea36f5cc 100644 --- a/tests/test_graphql_api/test_application/test_filtering.py +++ b/tests/test_graphql_api/test_application/test_filtering.py @@ -261,19 +261,19 @@ def test_application__filter__by_text_search__section_name__prefix(graphql): organisation=None, contact_person=None, user=None, - application_sections__name="foo", + application_sections__name="kirjastoryhmä", ) ApplicationFactory.create_in_status_draft( organisation=None, contact_person=None, user=None, - application_sections__name="bar", + application_sections__name="suunnistusryhmä", ) graphql.login_user_based_on_type(UserType.SUPERUSER) # when: # - User tries to filter applications with a text search, which is only a prefix match - query = applications_query(text_search="fo") + query = applications_query(text_search="kirjasto") response = graphql(query) # then: