From 5cf4e510890ba69a27d0b2cc26e2332a211cab7a Mon Sep 17 00:00:00 2001 From: Gcolon021 <34667267+Gcolon021@users.noreply.github.com> Date: Tue, 19 Nov 2024 17:06:50 -0500 Subject: [PATCH] Refactor legacy search query processing (#57) * Refactor legacy search query processing Refactor LegacySearchQueryMapper to handle logical OR and punctuation in search terms, removing dependency on FilterProcessor. Update ConceptFilterQueryGenerator with generateLegacyFilterQuery and simplify query construction. Adjust LegacySearchRepository to align with new query generation logic. * Add test for handling OR logic in legacy search This commit introduces a new test, `shouldHandleORRequest`, to verify the OR logic within the legacy search functionality. It ensures that the query with an OR condition is processed correctly, and the results are returned as expected without errors. * Refactor search term handling in legacy search. Refactored formatting for compactness in legacySearch method. Added logging in LegacySearchQueryMapper to debug search term construction. Enhanced integration test to verify expanded search results with OR queries. --- dictonaryReqeust.http | 3 +- .../concept/ConceptFilterQueryGenerator.java | 106 +++++++++++++----- .../legacysearch/LegacySearchQueryMapper.java | 41 +++++-- .../legacysearch/LegacySearchRepository.java | 2 +- ...LegacySearchControllerIntegrationTest.java | 34 ++++++ .../LegacySearchQueryMapperTest.java | 34 +++++- 6 files changed, 177 insertions(+), 43 deletions(-) diff --git a/dictonaryReqeust.http b/dictonaryReqeust.http index e474278..ae53188 100644 --- a/dictonaryReqeust.http +++ b/dictonaryReqeust.http @@ -12,4 +12,5 @@ Content-Type: application/json POST http://localhost:80/search Content-Type: application/json -{"@type":"GeneralQueryRequest","resourceCredentials":{},"query":{"searchTerm":"breast","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":10000000},"resourceUUID":null} \ No newline at end of file +{"@type":"GeneralQueryRequest","resourceCredentials":{},"query":{"searchTerm":"throat sore acute #8","includedTags":[], + "excludedTags":[],"returnTags":"true","offset":0,"limit":100000},"resourceUUID":null} diff --git a/src/main/java/edu/harvard/dbmi/avillach/dictionary/concept/ConceptFilterQueryGenerator.java b/src/main/java/edu/harvard/dbmi/avillach/dictionary/concept/ConceptFilterQueryGenerator.java index 5388c40..fd35437 100644 --- a/src/main/java/edu/harvard/dbmi/avillach/dictionary/concept/ConceptFilterQueryGenerator.java +++ b/src/main/java/edu/harvard/dbmi/avillach/dictionary/concept/ConceptFilterQueryGenerator.java @@ -82,37 +82,7 @@ public QueryParamPair generateFilterQuery(Filter filter, Pageable pageable) { params.addValue("consents", filter.consents()); } clauses.add(createValuelessNodeFilter(filter.search(), filter.consents())); - - - String query = "(\n" + String.join("\n\tINTERSECT\n", clauses) + "\n)"; - String superQuery = """ - WITH q AS ( - %s - ) - %s - SELECT q.concept_node_id AS concept_node_id, max((1 + rank) * coalesce(rank_adjustment, 1)) AS rank - FROM q - LEFT JOIN allow_filtering ON allow_filtering.concept_node_id = q.concept_node_id - GROUP BY q.concept_node_id - ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC, q.concept_node_id ASC - """.formatted(query, RANK_ADJUSTMENTS); - // explanation of ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC - // you want to sort the best matches first, BUT anything that is marked as unfilterable should be put last - // coalesce will return the 
first non null value; this solves rows that aren't marked as filterable or not - // I then multiply that by 1 + rank instead of just rank so that a rank value of 0 for an unfilterable var - // is placed below a rank value of 0 for a filterable var - // Finally, I add the concept node id to the sort to keep it stable for ties, otherwise pagination gets weird - - if (pageable.isPaged()) { - superQuery = superQuery + """ - LIMIT :limit - OFFSET :offset - """; - params.addValue("limit", pageable.getPageSize()).addValue("offset", pageable.getOffset()); - } - - superQuery = " concepts_filtered_sorted AS (\n" + superQuery + "\n)"; - + String superQuery = getSuperQuery(pageable, clauses, params); return new QueryParamPair(superQuery, params); } @@ -184,4 +154,78 @@ facet.name IN (:facets_for_category_%s ) AND facet_category.name = :category_%s }).toList(); } + public QueryParamPair generateLegacyFilterQuery(Filter filter, Pageable pageable) { + MapSqlParameterSource params = new MapSqlParameterSource(); + params.addValue("disallowed_meta_keys", disallowedMetaFields); + List clauses = new java.util.ArrayList<>(List.of()); + if (StringUtils.hasLength(filter.search())) { + params.addValue("dynamic_tsquery", filter.search().trim()); + } + clauses.add(createDynamicValuelessNodeFilter(filter.search())); + String superQuery = getSuperQuery(pageable, clauses, params); + + return new QueryParamPair(superQuery, params); + } + + private String createDynamicValuelessNodeFilter(String search) { + String rankQuery = "0 as rank"; + String rankWhere = ""; + if (StringUtils.hasLength(search)) { + rankQuery = "ts_rank(searchable_fields, to_tsquery(:dynamic_tsquery)) AS rank"; + rankWhere = "concept_node.searchable_fields @@ to_tsquery(:dynamic_tsquery) AND"; + } + return """ + SELECT + concept_node.concept_node_id, + %s + FROM + concept_node + LEFT JOIN dataset ON concept_node.dataset_id = dataset.dataset_id + LEFT JOIN concept_node_meta AS continuous_min ON concept_node.concept_node_id = continuous_min.concept_node_id AND continuous_min.KEY = 'min' + LEFT JOIN concept_node_meta AS continuous_max ON concept_node.concept_node_id = continuous_max.concept_node_id AND continuous_max.KEY = 'max' + LEFT JOIN concept_node_meta AS categorical_values ON concept_node.concept_node_id = categorical_values.concept_node_id AND categorical_values.KEY = 'values' + WHERE + %s + %s + ( + continuous_min.value <> '' OR + continuous_max.value <> '' OR + categorical_values.value <> '' + ) + """ + .formatted(rankQuery, rankWhere, ""); + } + + private static String getSuperQuery(Pageable pageable, List clauses, MapSqlParameterSource params) { + String query = "(\n" + String.join("\n\tINTERSECT\n", clauses) + "\n)"; + String superQuery = """ + WITH q AS ( + %s + ) + %s + SELECT q.concept_node_id AS concept_node_id, max((1 + rank) * coalesce(rank_adjustment, 1)) AS rank + FROM q + LEFT JOIN allow_filtering ON allow_filtering.concept_node_id = q.concept_node_id + GROUP BY q.concept_node_id + ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC, q.concept_node_id ASC + """.formatted(query, RANK_ADJUSTMENTS); + // explanation of ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC + // you want to sort the best matches first, BUT anything that is marked as unfilterable should be put last + // coalesce will return the first non null value; this solves rows that aren't marked as filterable or not + // I then multiply that by 1 + rank instead of just rank so that a rank value of 0 for an unfilterable var + // is placed below a 
rank value of 0 for a filterable var + // Finally, I add the concept node id to the sort to keep it stable for ties, otherwise pagination gets weird + + if (pageable.isPaged()) { + superQuery = superQuery + """ + LIMIT :limit + OFFSET :offset + """; + params.addValue("limit", pageable.getPageSize()).addValue("offset", pageable.getOffset()); + } + + superQuery = " concepts_filtered_sorted AS (\n" + superQuery + "\n)"; + return superQuery; + } + } diff --git a/src/main/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchQueryMapper.java b/src/main/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchQueryMapper.java index 001436c..8a23d3c 100644 --- a/src/main/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchQueryMapper.java +++ b/src/main/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchQueryMapper.java @@ -3,23 +3,25 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import edu.harvard.dbmi.avillach.dictionary.filter.Filter; -import edu.harvard.dbmi.avillach.dictionary.filter.FilterProcessor; import edu.harvard.dbmi.avillach.dictionary.legacysearch.model.LegacySearchQuery; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.springframework.data.domain.PageRequest; import org.springframework.stereotype.Component; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; import java.util.List; +import java.util.stream.Collectors; @Component public class LegacySearchQueryMapper { private static final ObjectMapper objectMapper = new ObjectMapper(); - private final FilterProcessor filterProcessor; + private static final Logger log = LoggerFactory.getLogger(LegacySearchQueryMapper.class); - public LegacySearchQueryMapper(FilterProcessor filterProcessor) { - this.filterProcessor = filterProcessor; - } + public LegacySearchQueryMapper() {} public LegacySearchQuery mapFromJson(String jsonString) throws IOException { JsonNode rootNode = objectMapper.readTree(jsonString); @@ -27,8 +29,33 @@ public LegacySearchQuery mapFromJson(String jsonString) throws IOException { String searchTerm = queryNode.get("searchTerm").asText(); int limit = queryNode.get("limit").asInt(); - Filter filter = filterProcessor.processsFilter(new Filter(List.of(), searchTerm, List.of())); - return new LegacySearchQuery(filter, PageRequest.of(0, limit)); + searchTerm = constructTsQuery(searchTerm); + log.debug("Constructed Search Term: {}", searchTerm); + return new LegacySearchQuery(new Filter(List.of(), searchTerm, List.of()), PageRequest.of(0, limit)); + } + + // An attempt to provide OR search that will produce similar results to legacy search-prototype + private String constructTsQuery(String searchTerm) { + // Split on the | to enable or queries + String[] orGroups = searchTerm.split("\\|"); + List orClauses = new ArrayList<>(); + + for (String group : orGroups) { + // To replicate legacy search we will split using its regex [\\s\\p{Punct}]+ + String[] tokens = group.trim().split("[\\s\\p{Punct}]+"); + + // Now we will combine the tokens in this group and '&' them together. + String andClause = Arrays.stream(tokens).filter(token -> !token.isBlank()) // remove empty tokens. 
+ .map(token -> token + ":*") // add the wild card for search + .collect(Collectors.joining(" & ")); + + if (!andClause.isBlank()) { + orClauses.add(andClause); + } + } + + + return String.join(" | ", orClauses); } } diff --git a/src/main/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchRepository.java b/src/main/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchRepository.java index 9ea09c4..4ecdea3 100644 --- a/src/main/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchRepository.java +++ b/src/main/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchRepository.java @@ -35,7 +35,7 @@ public LegacySearchRepository( } public List getLegacySearchResults(Filter filter, Pageable pageable) { - QueryParamPair filterQ = filterGen.generateFilterQuery(filter, pageable); + QueryParamPair filterQ = filterGen.generateLegacyFilterQuery(filter, pageable); String sql = ALLOW_FILTERING_Q + ", " + filterQ.query() + """ SELECT concept_node.concept_path AS conceptPath, concept_node.display AS display, diff --git a/src/test/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchControllerIntegrationTest.java b/src/test/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchControllerIntegrationTest.java index d68a4f4..b939684 100644 --- a/src/test/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchControllerIntegrationTest.java +++ b/src/test/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchControllerIntegrationTest.java @@ -54,4 +54,38 @@ void shouldGetLegacyResponseByStudyID() throws IOException { searchResults.forEach(searchResult -> Assertions.assertEquals("phs000007", searchResult.result().studyId())); } + @Test + void shouldHandleORRequest() throws IOException { + String jsonString = """ + {"query":{"searchTerm":"age","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}} + """; + + ResponseEntity legacyResponseResponseEntity = legacySearchController.legacySearch(jsonString); + Assertions.assertEquals(HttpStatus.OK, legacyResponseResponseEntity.getStatusCode()); + LegacyResponse legacyResponseBody = legacyResponseResponseEntity.getBody(); + Assertions.assertNotNull(legacyResponseBody); + Results results = legacyResponseBody.results(); + List ageSearchResults = results.searchResults(); + Assertions.assertEquals(4, ageSearchResults.size()); + + jsonString = """ + {"query":{"searchTerm":"physical|age","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}} + """; + + legacyResponseResponseEntity = legacySearchController.legacySearch(jsonString); + Assertions.assertEquals(HttpStatus.OK, legacyResponseResponseEntity.getStatusCode()); + legacyResponseBody = legacyResponseResponseEntity.getBody(); + Assertions.assertNotNull(legacyResponseBody); + results = legacyResponseBody.results(); + List physicalORAgeSearchResults = results.searchResults(); + Assertions.assertEquals(5, physicalORAgeSearchResults.size()); + + // Verify that age|physical has expanded the search results + Assertions.assertNotEquals(ageSearchResults.size(), physicalORAgeSearchResults.size()); + + // Verify the OR statement has more results + Assertions.assertTrue(ageSearchResults.size() < physicalORAgeSearchResults.size()); + } + + } diff --git a/src/test/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchQueryMapperTest.java b/src/test/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchQueryMapperTest.java index e0661ab..6c8cfa2 100644 
--- a/src/test/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchQueryMapperTest.java +++ b/src/test/java/edu/harvard/dbmi/avillach/dictionary/legacysearch/LegacySearchQueryMapperTest.java @@ -28,12 +28,12 @@ void shouldParseSearchRequest() throws IOException { Filter filter = legacySearchQuery.filter(); Pageable pageable = legacySearchQuery.pageable(); - Assertions.assertEquals("age", filter.search()); + Assertions.assertEquals("age:*", filter.search()); Assertions.assertEquals(100, pageable.getPageSize()); } @Test - void shouldReplaceUnderscore() throws IOException { + void shouldHandlePunct() throws IOException { String jsonString = """ {"query":{"searchTerm":"tutorial-biolincc_digitalis","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}} @@ -43,7 +43,35 @@ void shouldReplaceUnderscore() throws IOException { Filter filter = legacySearchQuery.filter(); Pageable pageable = legacySearchQuery.pageable(); - Assertions.assertEquals("tutorial-biolincc/digitalis", filter.search()); + Assertions.assertEquals("tutorial:* & biolincc:* & digitalis:*", filter.search()); + Assertions.assertEquals(100, pageable.getPageSize()); + } + + @Test + void shouldHandleOR() throws IOException { + String jsonString = """ + {"query":{"searchTerm":"sex|gender","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}} + """; + + LegacySearchQuery legacySearchQuery = legacySearchQueryMapper.mapFromJson(jsonString); + Filter filter = legacySearchQuery.filter(); + Pageable pageable = legacySearchQuery.pageable(); + + Assertions.assertEquals("sex:* | gender:*", filter.search()); + Assertions.assertEquals(100, pageable.getPageSize()); + } + + @Test + void shouldHandleORAndPunct() throws IOException { + String jsonString = """ + {"query":{"searchTerm":"sex|gender age","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}} + """; + + LegacySearchQuery legacySearchQuery = legacySearchQueryMapper.mapFromJson(jsonString); + Filter filter = legacySearchQuery.filter(); + Pageable pageable = legacySearchQuery.pageable(); + + Assertions.assertEquals("sex:* | gender:* & age:*", filter.search()); Assertions.assertEquals(100, pageable.getPageSize()); }
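A quick worked example of the ranking comment in getSuperQuery, i.e. ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC. Assuming rank_adjustment is 0 for concepts flagged as unfilterable (the real values live in RANK_ADJUSTMENTS, which is not part of this diff, so treat that number as an assumption): a filterable concept with rank 0 scores (1 + 0) * 1 = 1, while an unfilterable concept with rank 0 scores (1 + 0) * 0 = 0, so the unfilterable row sorts last. With plain rank instead of 1 + rank, both rows would score 0 and tie, which is exactly the situation the comment says the +1 is there to avoid.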
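For reference, the tsquery strings produced by the new LegacySearchQueryMapper#constructTsQuery can be reproduced outside the service. The sketch below restates the token handling as a standalone class (TsQuerySketch and its main method are illustrative only, not part of this change); the expected outputs in the comments match the assertions in LegacySearchQueryMapperTest. Note that in PostgreSQL to_tsquery syntax, & binds more tightly than |, so "sex:* | gender:* & age:*" reads as sex OR (gender AND age).

    // Illustrative sketch only: mirrors the OR/AND expansion added in
    // LegacySearchQueryMapper#constructTsQuery so the output can be inspected
    // without running the dictionary service.
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    import java.util.stream.Collectors;

    public class TsQuerySketch {

        static String constructTsQuery(String searchTerm) {
            // '|' separates OR groups, matching the legacy search behavior
            String[] orGroups = searchTerm.split("\\|");
            List<String> orClauses = new ArrayList<>();
            for (String group : orGroups) {
                // Same tokenizer regex as the legacy search-prototype
                String[] tokens = group.trim().split("[\\s\\p{Punct}]+");
                // AND the tokens of a group together, each with a prefix-match wildcard
                String andClause = Arrays.stream(tokens)
                    .filter(token -> !token.isBlank())
                    .map(token -> token + ":*")
                    .collect(Collectors.joining(" & "));
                if (!andClause.isBlank()) {
                    orClauses.add(andClause);
                }
            }
            return String.join(" | ", orClauses);
        }

        public static void main(String[] args) {
            System.out.println(constructTsQuery("age"));                         // age:*
            System.out.println(constructTsQuery("tutorial-biolincc_digitalis")); // tutorial:* & biolincc:* & digitalis:*
            System.out.println(constructTsQuery("sex|gender age"));              // sex:* | gender:* & age:*
        }
    }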