From 6dc5273a100b4bdfd16e7ec682d77047043938d6 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Mon, 23 Dec 2024 10:07:07 -0600 Subject: [PATCH] fix(search): Remove function_score from count queries - Removed redundant test for RECAP Search --- cl/lib/elasticsearch_utils.py | 45 ++++++-------- cl/search/api_utils.py | 21 ++++--- cl/search/constants.py | 25 ++++++++ cl/search/tests/tests_es_opinion.py | 32 +++++----- cl/search/tests/tests_es_oral_arguments.py | 70 +++++++++++----------- cl/search/tests/tests_es_recap.py | 66 +++++++------------- 6 files changed, 129 insertions(+), 130 deletions(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 0e55027b4d..af76c5e220 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -72,6 +72,7 @@ SEARCH_RECAP_PARENT_QUERY_FIELDS, api_child_highlight_map, cardinality_query_unique_ids, + date_decay_relevance_types, recap_boosts_es, ) from cl.search.exception import ( @@ -2127,15 +2128,27 @@ def merge_unavailable_fields_on_parent_document( def clean_count_query(search_query: Search) -> SearchDSL: """Cleans a given ES Search object for a count query. - Modifies the input Search object by removing 'inner_hits' from - any 'has_child' queries within the 'should' clause of the boolean query. + Modifies the input Search object by removing 'function_score' from the main + query if present and/or 'inner_hits' from any 'has_child' queries within + the 'should' clause of the boolean query. It then creates a new Search object with the modified query. :param search_query: The ES Search object. :return: A new ES Search object with the count query. """ - parent_total_query_dict = search_query.to_dict() + parent_total_query_dict = search_query.to_dict(count=True) + try: + # Clean function_score in queries that contain it + parent_total_query_dict = parent_total_query_dict["query"][ + "function_score" + ] + del parent_total_query_dict["boost_mode"] + del parent_total_query_dict["functions"] + except KeyError: + # Omit queries that don't contain it. + pass + try: # Clean the has_child query in queries that contain it. for query in parent_total_query_dict["query"]["bool"]["should"]: @@ -2571,29 +2584,9 @@ def apply_custom_score_to_main_query( else False ) - valid_decay_relevance_types: dict[str, dict[str, str | int | float]] = { - SEARCH_TYPES.OPINION: { - "field": "dateFiled", - "scale": 50, - "decay": 0.5, - }, - SEARCH_TYPES.RECAP: {"field": "dateFiled", "scale": 50, "decay": 0.5}, - SEARCH_TYPES.DOCKETS: { - "field": "dateFiled", - "scale": 50, - "decay": 0.5, - }, - SEARCH_TYPES.RECAP_DOCUMENT: { - "field": "dateFiled", - "scale": 50, - "decay": 0.5, - }, - SEARCH_TYPES.ORAL_ARGUMENT: { - "field": "dateArgued", - "scale": 50, - "decay": 0.5, - }, - } + valid_decay_relevance_types: dict[str, dict[str, str | int | float]] = ( + date_decay_relevance_types + ) main_order_by = cd.get("order_by", "") if is_valid_custom_score_field and api_version == "v4": # Applies a custom function score to sort Documents based on diff --git a/cl/search/api_utils.py b/cl/search/api_utils.py index 9c853af182..23ff86cdfa 100644 --- a/cl/search/api_utils.py +++ b/cl/search/api_utils.py @@ -12,6 +12,7 @@ build_cardinality_count, build_es_main_query, build_sort_results, + clean_count_query, do_collapse_count_query, do_count_query, do_es_api_query, @@ -21,7 +22,6 @@ set_results_highlights, ) from cl.lib.search_utils import store_search_api_query -from cl.lib.utils import map_to_docket_entry_sorting from cl.search.constants import SEARCH_HL_TAG, cardinality_query_unique_ids from cl.search.documents import ( AudioDocument, @@ -260,12 +260,8 @@ def get_paginated_results( self.main_query = self.main_query.sort(default_sorting, unique_sorting) # Cardinality query parameters - query = Q(self.main_query.to_dict(count=True)["query"]) + main_count_query = clean_count_query(self.main_query) unique_field = cardinality_query_unique_ids[self.clean_data["type"]] - search_document = self.cardinality_base_document[ - self.clean_data["type"] - ] - main_count_query = search_document.search().query(query) cardinality_query = build_cardinality_count( main_count_query, unique_field ) @@ -273,10 +269,16 @@ def get_paginated_results( # Build a cardinality query to count child documents. child_cardinality_query = None child_cardinality_count_response = None - if self.child_docs_query: + if ( + self.child_docs_query + and self.clean_data["type"] == SEARCH_TYPES.RECAP + ): child_unique_field = cardinality_query_unique_ids[ SEARCH_TYPES.RECAP_DOCUMENT ] + search_document = self.cardinality_base_document[ + self.clean_data["type"] + ] child_count_query = search_document.search().query( self.child_docs_query ) @@ -292,7 +294,10 @@ def get_paginated_results( ) # If a cardinality query is available for the search_type, add it # to the multi-search query. - if child_cardinality_query: + if ( + child_cardinality_query + and self.clean_data["type"] == SEARCH_TYPES.RECAP + ): multi_search = multi_search.add(child_cardinality_query) responses = multi_search.execute() diff --git a/cl/search/constants.py b/cl/search/constants.py index f9d8b610f3..78b02bcf3c 100644 --- a/cl/search/constants.py +++ b/cl/search/constants.py @@ -275,3 +275,28 @@ SEARCH_TYPES.ORAL_ARGUMENT: "id", SEARCH_TYPES.PARENTHETICAL: "id", } + + +date_decay_relevance_types = { + SEARCH_TYPES.OPINION: { + "field": "dateFiled", + "scale": 50, + "decay": 0.5, + }, + SEARCH_TYPES.RECAP: {"field": "dateFiled", "scale": 50, "decay": 0.5}, + SEARCH_TYPES.DOCKETS: { + "field": "dateFiled", + "scale": 50, + "decay": 0.5, + }, + SEARCH_TYPES.RECAP_DOCUMENT: { + "field": "dateFiled", + "scale": 50, + "decay": 0.5, + }, + SEARCH_TYPES.ORAL_ARGUMENT: { + "field": "dateArgued", + "scale": 50, + "decay": 0.5, + }, +} diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py index 08eb9c6a3b..936ded9b6d 100644 --- a/cl/search/tests/tests_es_opinion.py +++ b/cl/search/tests/tests_es_opinion.py @@ -2281,7 +2281,7 @@ def setUpTestData(cls): # Rebuild the Opinion index cls.rebuild_index("search.OpinionCluster") - # Same keywords but different date_filed + # Same keywords but different dateFiled cls.opinion_old = OpinionClusterFactory.create( case_name="Keyword Match", case_name_full="", @@ -2425,7 +2425,7 @@ def setUpTestData(cls): cls.test_cases = [ { - "name": "Same keywords, order by score desc", + "name": "Same keywords, different dateFiled", "search_params": { "q": "Keyword Match", "order_by": "score desc", @@ -2435,13 +2435,13 @@ def setUpTestData(cls): cls.opinion_recent.docket.docket_number, # Most recent dateFiled cls.opinion_old.docket.docket_number, # Oldest dateFiled ], - "expected_order": [ - cls.opinion_recent.pk, # Most recent dateFiled - cls.opinion_old.pk, # Oldest dateFiled + "expected_order": [ # API + cls.opinion_recent.pk, + cls.opinion_old.pk, ], }, { - "name": "Different relevancy same dateFiled, order by score desc", + "name": "Different relevancy same dateFiled", "search_params": { "q": "Highly Relevant Keywords", "order_by": "score desc", @@ -2451,23 +2451,23 @@ def setUpTestData(cls): cls.opinion_high_relevance.docket.docket_number, # Most relevant by keywords cls.opinion_low_relevance.docket.docket_number, # Less relevant by keywords ], - "expected_order": [ + "expected_order": [ # API cls.opinion_high_relevance.pk, # Most relevant by keywords cls.opinion_low_relevance.pk, # Less relevant by keywords ], }, { - "name": "Different relevancy different dateFiled, order by score desc", + "name": "Different relevancy and different dateFiled", "search_params": { "q": "Ipsum Dolor Terms", "order_by": "score desc", "type": SEARCH_TYPES.OPINION, }, "expected_order_frontend": [ - cls.opinion_low_relevance_new_date.docket.docket_number, + cls.opinion_low_relevance_new_date.docket.docket_number, # Combination of relevance and date rank it first. cls.opinion_high_relevance_old_date.docket.docket_number, ], - "expected_order": [ + "expected_order": [ # API cls.opinion_low_relevance_new_date.pk, cls.opinion_high_relevance_old_date.pk, ], @@ -2483,23 +2483,23 @@ def setUpTestData(cls): "expected_order_frontend": [ cls.opinion_low_relevance_new_date.docket.docket_number, # 2024-12-23 1:21-bk-1241 cls.opinion_recent.docket.docket_number, # 2024-02-23 1:21-bk-1236 - cls.opinion_high_relevance.docket.docket_number, # 2022-02-23 1:21-bk-1237 + cls.opinion_high_relevance.docket.docket_number, # 2022-02-23 1:21-bk-1237 Indexed first, displayed first. cls.opinion_low_relevance.docket.docket_number, # 2022-02-23 1:21-bk-1238 cls.opinion_high_relevance_old_date.docket.docket_number, # 1800-02-23 1:21-bk-1239 cls.opinion_old.docket.docket_number, # 1732-02-23 1:21-bk-1235 ], - "expected_order": [ + "expected_order": [ # V4 API cls.opinion_low_relevance_new_date.pk, # 2024-12-23 cls.opinion_recent.pk, # 2024-02-23 - cls.opinion_low_relevance.pk, # 2022-02-23 Higher PK - cls.opinion_high_relevance.pk, # 2022-02-23 More relevant Lower PK + cls.opinion_low_relevance.pk, # 2022-02-23 Higher PK in V4, API pk is a secondary sorting key. + cls.opinion_high_relevance.pk, # 2022-02-23 Lower PK cls.opinion_high_relevance_old_date.pk, # 1800-02-23 cls.opinion_old.pk, # 1732-02-23 ], - "expected_order_v3": [ + "expected_order_v3": [ # V3 API cls.opinion_low_relevance_new_date.pk, # 2024-12-23 cls.opinion_recent.pk, # 2024-02-23 - cls.opinion_high_relevance.pk, # 2022-02-23 Indexed first + cls.opinion_high_relevance.pk, # 2022-02-23 Indexed first, displayed first. cls.opinion_low_relevance.pk, # 2022-02-23 cls.opinion_high_relevance_old_date.pk, # 1800-02-23 cls.opinion_old.pk, # 1732-02-23 diff --git a/cl/search/tests/tests_es_oral_arguments.py b/cl/search/tests/tests_es_oral_arguments.py index 6144cbf9ea..7dd4b4647f 100644 --- a/cl/search/tests/tests_es_oral_arguments.py +++ b/cl/search/tests/tests_es_oral_arguments.py @@ -2499,7 +2499,8 @@ class OralArgumentsSearchDecayRelevancyTest( @classmethod def setUpTestData(cls): - # Same keywords but different date_argued + + # Same keywords but different dateArgued with cls.captureOnCommitCallbacks(execute=True): cls.docket_old = DocketFactory.create( docket_number="1:21-bk-1235", @@ -2539,7 +2540,7 @@ def setUpTestData(cls): stt_transcript="Transcript for recent audio", ) - # Different relevance with same date_argued + # Different relevance with same dateArgued cls.docket_low_relevance = DocketFactory.create( case_name="Highly Relevant Keywords", docket_number="1:21-bk-1238", @@ -2577,11 +2578,10 @@ def setUpTestData(cls): blocked=False, sha1="high_rel_sha1", stt_status=Audio.STT_COMPLETE, - # More relevancy can be indicated by adding more relevant keywords in transcript stt_transcript="More Highly Relevant Keywords in the transcript", ) - # Different relevance with different date_argued + # Different relevance with different dateArgued cls.docket_high_relevance_old_date = DocketFactory.create( case_name="Ipsum Dolor Terms", docket_number="1:21-bk-1239", @@ -2644,23 +2644,23 @@ def setUpTestData(cls): cls.test_cases = [ { - "name": "Same keywords, order by score desc", + "name": "Same keywords different dateArgued", "search_params": { "q": "Keyword Match", "order_by": "score desc", "type": SEARCH_TYPES.ORAL_ARGUMENT, }, "expected_order_frontend": [ - cls.docket_recent.docket_number, # Most recent date_argued - cls.docket_old.docket_number, # Oldest date_argued + cls.docket_recent.docket_number, # Most recent dateArgued + cls.docket_old.docket_number, # Oldest dateArgued ], - "expected_order": [ + "expected_order": [ # API cls.audio_recent.pk, cls.audio_old.pk, ], }, { - "name": "Different relevancy same dateArgued, order by score desc", + "name": "Different relevancy same dateArgued", "search_params": { "q": "Highly Relevant Keywords", "order_by": "score desc", @@ -2676,36 +2676,36 @@ def setUpTestData(cls): ], }, { - "name": "Different relevancy different dateArgued, order by score desc", + "name": "Different relevancy different dateArgued", "search_params": { "q": "Ipsum Dolor Terms", "order_by": "score desc", "type": SEARCH_TYPES.ORAL_ARGUMENT, }, "expected_order_frontend": [ - cls.docket_low_relevance_new_date.docket_number, + cls.docket_low_relevance_new_date.docket_number, # Combination of relevance and date rank it first. cls.docket_high_relevance_old_date.docket_number, - cls.docket_high_relevance_null_date.docket_number, + cls.docket_high_relevance_null_date.docket_number, # docs with a null dateFiled are ranked lower. ], - "expected_order": [ + "expected_order": [ # API cls.audio_low_relevance_new_date.pk, cls.audio_high_relevance_old_date.pk, cls.audio_high_relevance_null_date.pk, ], }, { - "name": "Fixed main score (Filtering) different dateArgued, order by score desc", + "name": "Fixed main score for all (0 or 1) (using filters) and different dateArgued", "search_params": { "case_name": "Ipsum Dolor Terms", "order_by": "score desc", "type": SEARCH_TYPES.ORAL_ARGUMENT, }, "expected_order_frontend": [ - cls.docket_low_relevance_new_date.docket_number, + cls.docket_low_relevance_new_date.docket_number, # Most recent dateFiled cls.docket_high_relevance_old_date.docket_number, - cls.docket_high_relevance_null_date.docket_number, + cls.docket_high_relevance_null_date.docket_number, # docs with a null dateFiled are ranked lower. ], - "expected_order": [ + "expected_order": [ # API cls.audio_low_relevance_new_date.pk, cls.audio_high_relevance_old_date.pk, cls.audio_high_relevance_null_date.pk, @@ -2721,29 +2721,29 @@ def setUpTestData(cls): "expected_order_frontend": [ cls.docket_low_relevance_new_date.docket_number, # 2024-12-23 1:21-bk-1241 cls.docket_recent.docket_number, # 2024-02-23 1:21-bk-1236 - cls.docket_low_relevance.docket_number, # 2022-02-23 1:21-bk-1238 + cls.docket_low_relevance.docket_number, # 2022-02-23 1:21-bk-1238 Indexed first, displayed first. cls.docket_high_relevance.docket_number, # 2022-02-23 1:21-bk-1237 cls.docket_high_relevance_old_date.docket_number, # 1800-02-23 1:21-bk-1239 cls.docket_old.docket_number, # 1732-02-23 1:21-bk-1235 - cls.docket_high_relevance_null_date.docket_number, # Null date 1:21-bk-1240 + cls.docket_high_relevance_null_date.docket_number, # Null dateArgued 1:21-bk-1240 ], - "expected_order": [ - cls.audio_low_relevance_new_date.pk, - cls.audio_recent.pk, - cls.audio_high_relevance.pk, - cls.audio_low_relevance.pk, - cls.audio_high_relevance_old_date.pk, - cls.audio_old.pk, - cls.audio_high_relevance_null_date.pk, + "expected_order": [ # V4 API + cls.audio_low_relevance_new_date.pk, # 2024-12-23 + cls.audio_recent.pk, # 2024-02-23 + cls.audio_high_relevance.pk, # 2022-02-23 Higher PK in V4 API, pk is a secondary sorting key. + cls.audio_low_relevance.pk, # 2022-02-23 + cls.audio_high_relevance_old_date.pk, # 1800-02-23 + cls.audio_old.pk, # 1732-02-23 + cls.audio_high_relevance_null_date.pk, # Null dateArgued ], - "expected_order_v3": [ - cls.audio_low_relevance_new_date.pk, - cls.audio_recent.pk, - cls.audio_low_relevance.pk, - cls.audio_high_relevance.pk, - cls.audio_high_relevance_old_date.pk, - cls.audio_old.pk, - cls.audio_high_relevance_null_date.pk, + "expected_order_v3": [ # V3 API + cls.audio_low_relevance_new_date.pk, # 2024-12-23 + cls.audio_recent.pk, # 2024-02-23 + cls.audio_low_relevance.pk, # 2022-02-23 Indexed first, displayed first. + cls.audio_high_relevance.pk, # 2022-02-23 + cls.audio_high_relevance_old_date.pk, # 1800-02-23 + cls.audio_old.pk, # 1732-02-23 + cls.audio_high_relevance_null_date.pk, # Null dateArgued ], }, ] diff --git a/cl/search/tests/tests_es_recap.py b/cl/search/tests/tests_es_recap.py index 4b504ede84..3cd5a008dd 100644 --- a/cl/search/tests/tests_es_recap.py +++ b/cl/search/tests/tests_es_recap.py @@ -3020,7 +3020,7 @@ def setUpTestData(cls): cls.test_cases = [ { - "name": "Same keywords, order by score desc", + "name": "Same keywords, different dateFiled", "search_params": { "q": "Keyword Match", "order_by": "score desc", @@ -3030,13 +3030,13 @@ def setUpTestData(cls): cls.docket_recent.docket_number, # Most recent dateFiled cls.docket_old.docket_number, # Oldest dateFiled ], - "expected_order": [ - cls.docket_recent.pk, # Most recent dateFiled - cls.docket_old.pk, # Oldest dateFiled + "expected_order": [ # API + cls.docket_recent.pk, + cls.docket_old.pk, ], }, { - "name": "Different relevancy same dateFiled, order by score desc", + "name": "Different relevancy same dateFiled", "search_params": { "q": "Highly Relevant Keywords", "order_by": "score desc", @@ -3048,44 +3048,42 @@ def setUpTestData(cls): cls.docket_low_relevance.docket_number, # Less relevant by keywords ], - "expected_order": [ + "expected_order": [ # API cls.docket_high_relevance.pk, - # Most relevant by keywords cls.docket_low_relevance.pk, - # Less relevant by keywords ], }, { - "name": "Different relevancy different dateFiled, order by score desc", + "name": "Different relevancy different dateFiled", "search_params": { "q": "Ipsum Dolor Terms", "order_by": "score desc", "type": SEARCH_TYPES.RECAP, }, "expected_order_frontend": [ - cls.docket_low_relevance_new_date.docket_number, + cls.docket_low_relevance_new_date.docket_number, # Combination of relevance and date rank it first. cls.docket_high_relevance_old_date.docket_number, - cls.docket_high_relevance_null_date.docket_number, + cls.docket_high_relevance_null_date.docket_number, # docs with a null dateFiled are ranked lower. ], - "expected_order": [ + "expected_order": [ # API cls.docket_low_relevance_new_date.pk, cls.docket_high_relevance_old_date.pk, cls.docket_high_relevance_null_date.pk, ], }, { - "name": "Fixed main score (Filtering) different dateFiled, order by score desc", + "name": "Fixed main score for all (0 or 1) (using filters) and different dateFiled", "search_params": { "case_name": "Ipsum Dolor Terms", "order_by": "score desc", "type": SEARCH_TYPES.RECAP, }, "expected_order_frontend": [ - cls.docket_low_relevance_new_date.docket_number, + cls.docket_low_relevance_new_date.docket_number, # Most recent dateFiled cls.docket_high_relevance_old_date.docket_number, - cls.docket_high_relevance_null_date.docket_number, + cls.docket_high_relevance_null_date.docket_number, # docs with a null dateFiled are ranked lower. ], - "expected_order": [ + "expected_order": [ # API cls.docket_low_relevance_new_date.pk, cls.docket_high_relevance_old_date.pk, cls.docket_high_relevance_null_date.pk, @@ -3104,37 +3102,37 @@ def setUpTestData(cls): cls.docket_recent.docket_number, # 2024, 2, 23 1:21-bk-1236 cls.docket_low_relevance.docket_number, - # 2022, 2, 23 1:21-bk-1238 + # 2022, 2, 23 1:21-bk-1238 Indexed first, displayed first. cls.docket_high_relevance.docket_number, # 2022, 2, 23 1:21-bk-1237 cls.docket_high_relevance_old_date.docket_number, # 1800, 2, 23 1:21-bk-1239 cls.docket_old.docket_number, # 1732, 2, 23 1:21-bk-1235 cls.docket_high_relevance_null_date.docket_number, - # Null 1:21-bk-1240 + # Null dateFiled 1:21-bk-1240 ], - "expected_order": [ + "expected_order": [ # V4 API cls.docket_low_relevance_new_date.pk, # 2024, 12, 23 1:21-bk-1241 cls.docket_recent.pk, # 2024, 2, 23 1:21-bk-1236 cls.docket_high_relevance.pk, - # 2022, 2, 23 1:21-bk-1237 Greater PK + # 2022, 2, 23 1:21-bk-1237 Higher PK in V4, API pk is a secondary sorting key. cls.docket_low_relevance.pk, - # 2022, 2, 23 1:21-bk-1238 + # 2022, 2, 23 1:21-bk-1238 Lower PK cls.docket_high_relevance_old_date.pk, # 1800, 2, 23 1:21-bk-1239 cls.docket_old.pk, # 1732, 2, 23 1:21-bk-1235 cls.docket_high_relevance_null_date.pk, # Null 1:21-bk-1240 ], - "expected_order_v3": [ + "expected_order_v3": [ # V3 API cls.docket_low_relevance_new_date.pk, # 2024, 12, 23 1:21-bk-1241 cls.docket_recent.pk, # 2024, 2, 23 1:21-bk-1236 cls.docket_low_relevance.pk, - # 2022, 2, 23 1:21-bk-1238 Indexed first than 1:21-bk-1237 + # 2022, 2, 23 1:21-bk-1238 Indexed first, displayed first. cls.docket_high_relevance.pk, # 2022, 2, 23 1:21-bk-1237 cls.docket_high_relevance_old_date.pk, @@ -3718,28 +3716,6 @@ async def test_results_ordering(self) -> None: # API await self._test_api_results_count(params, 3, "order random") - # Order by score desc (relevance). - params = { - "type": SEARCH_TYPES.RECAP, - "q": "SUBPOENAS SERVED", - "order_by": "score desc", - } - # API - r = await self._test_api_results_count(params, 3, "order score desc") - self.assertTrue( - r.content.decode().index("12-1235") # 2016, 8, 16 - < r.content.decode().index("1:21-bk-1234"), # 2015, 8, 16 - msg="'12-1235' should come BEFORE '1:21-bk-1234' when order_by score desc.", - ) - - params["type"] = SEARCH_TYPES.DOCKETS - r = await self._test_api_results_count(params, 2, "order") - self.assertTrue( - r.content.decode().index("12-1235") # 2016, 8, 16 - < r.content.decode().index("1:21-bk-1234"), # 2015, 8, 16 - msg="'12-1235' should come BEFORE '1:21-bk-1234' when order_by score desc.", - ) - # Order by entry_date_filed desc params = { "type": SEARCH_TYPES.RECAP,