Skip to content

Commit

Permalink
Refactor legacy search query processing (#57)
Browse files Browse the repository at this point in the history
* Refactor legacy search query processing

Refactor LegacySearchQueryMapper to handle logical OR and punctuation in search terms, removing dependency on FilterProcessor. Update ConceptFilterQueryGenerator with generateLegacyFilterQuery and simplify query construction. Adjust LegacySearchRepository to align with new query generation logic.

* Add test for handling OR logic in legacy search

This commit introduces a new test, `shouldHandleORRequest`, to verify the OR logic within the legacy search functionality. It ensures that the query with an OR condition is processed correctly, and the results are returned as expected without errors.

* Refactor search term handling in legacy search.

Refactored formatting for compactness in legacySearch method. Added logging in LegacySearchQueryMapper to debug search term construction. Enhanced integration test to verify expanded search results with OR queries.
  • Loading branch information
Gcolon021 authored Nov 19, 2024
1 parent 89d5728 commit 5cf4e51
Show file tree
Hide file tree
Showing 6 changed files with 177 additions and 43 deletions.
3 changes: 2 additions & 1 deletion dictonaryReqeust.http
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ Content-Type: application/json
POST http://localhost:80/search
Content-Type: application/json

{"@type":"GeneralQueryRequest","resourceCredentials":{},"query":{"searchTerm":"breast","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":10000000},"resourceUUID":null}
{"@type":"GeneralQueryRequest","resourceCredentials":{},"query":{"searchTerm":"throat sore acute #8","includedTags":[],
"excludedTags":[],"returnTags":"true","offset":0,"limit":100000},"resourceUUID":null}
Original file line number Diff line number Diff line change
Expand Up @@ -82,37 +82,7 @@ public QueryParamPair generateFilterQuery(Filter filter, Pageable pageable) {
params.addValue("consents", filter.consents());
}
clauses.add(createValuelessNodeFilter(filter.search(), filter.consents()));


String query = "(\n" + String.join("\n\tINTERSECT\n", clauses) + "\n)";
String superQuery = """
WITH q AS (
%s
)
%s
SELECT q.concept_node_id AS concept_node_id, max((1 + rank) * coalesce(rank_adjustment, 1)) AS rank
FROM q
LEFT JOIN allow_filtering ON allow_filtering.concept_node_id = q.concept_node_id
GROUP BY q.concept_node_id
ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC, q.concept_node_id ASC
""".formatted(query, RANK_ADJUSTMENTS);
// explanation of ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC
// you want to sort the best matches first, BUT anything that is marked as unfilterable should be put last
// coalesce will return the first non null value; this solves rows that aren't marked as filterable or not
// I then multiply that by 1 + rank instead of just rank so that a rank value of 0 for an unfilterable var
// is placed below a rank value of 0 for a filterable var
// Finally, I add the concept node id to the sort to keep it stable for ties, otherwise pagination gets weird

if (pageable.isPaged()) {
superQuery = superQuery + """
LIMIT :limit
OFFSET :offset
""";
params.addValue("limit", pageable.getPageSize()).addValue("offset", pageable.getOffset());
}

superQuery = " concepts_filtered_sorted AS (\n" + superQuery + "\n)";

String superQuery = getSuperQuery(pageable, clauses, params);

return new QueryParamPair(superQuery, params);
}
Expand Down Expand Up @@ -184,4 +154,78 @@ facet.name IN (:facets_for_category_%s ) AND facet_category.name = :category_%s
}).toList();
}

public QueryParamPair generateLegacyFilterQuery(Filter filter, Pageable pageable) {
MapSqlParameterSource params = new MapSqlParameterSource();
params.addValue("disallowed_meta_keys", disallowedMetaFields);
List<String> clauses = new java.util.ArrayList<>(List.of());
if (StringUtils.hasLength(filter.search())) {
params.addValue("dynamic_tsquery", filter.search().trim());
}
clauses.add(createDynamicValuelessNodeFilter(filter.search()));
String superQuery = getSuperQuery(pageable, clauses, params);

return new QueryParamPair(superQuery, params);
}

private String createDynamicValuelessNodeFilter(String search) {
String rankQuery = "0 as rank";
String rankWhere = "";
if (StringUtils.hasLength(search)) {
rankQuery = "ts_rank(searchable_fields, to_tsquery(:dynamic_tsquery)) AS rank";
rankWhere = "concept_node.searchable_fields @@ to_tsquery(:dynamic_tsquery) AND";
}
return """
SELECT
concept_node.concept_node_id,
%s
FROM
concept_node
LEFT JOIN dataset ON concept_node.dataset_id = dataset.dataset_id
LEFT JOIN concept_node_meta AS continuous_min ON concept_node.concept_node_id = continuous_min.concept_node_id AND continuous_min.KEY = 'min'
LEFT JOIN concept_node_meta AS continuous_max ON concept_node.concept_node_id = continuous_max.concept_node_id AND continuous_max.KEY = 'max'
LEFT JOIN concept_node_meta AS categorical_values ON concept_node.concept_node_id = categorical_values.concept_node_id AND categorical_values.KEY = 'values'
WHERE
%s
%s
(
continuous_min.value <> '' OR
continuous_max.value <> '' OR
categorical_values.value <> ''
)
"""
.formatted(rankQuery, rankWhere, "");
}

private static String getSuperQuery(Pageable pageable, List<String> clauses, MapSqlParameterSource params) {
String query = "(\n" + String.join("\n\tINTERSECT\n", clauses) + "\n)";
String superQuery = """
WITH q AS (
%s
)
%s
SELECT q.concept_node_id AS concept_node_id, max((1 + rank) * coalesce(rank_adjustment, 1)) AS rank
FROM q
LEFT JOIN allow_filtering ON allow_filtering.concept_node_id = q.concept_node_id
GROUP BY q.concept_node_id
ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC, q.concept_node_id ASC
""".formatted(query, RANK_ADJUSTMENTS);
// explanation of ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC
// you want to sort the best matches first, BUT anything that is marked as unfilterable should be put last
// coalesce will return the first non null value; this solves rows that aren't marked as filterable or not
// I then multiply that by 1 + rank instead of just rank so that a rank value of 0 for an unfilterable var
// is placed below a rank value of 0 for a filterable var
// Finally, I add the concept node id to the sort to keep it stable for ties, otherwise pagination gets weird

if (pageable.isPaged()) {
superQuery = superQuery + """
LIMIT :limit
OFFSET :offset
""";
params.addValue("limit", pageable.getPageSize()).addValue("offset", pageable.getOffset());
}

superQuery = " concepts_filtered_sorted AS (\n" + superQuery + "\n)";
return superQuery;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,59 @@
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import edu.harvard.dbmi.avillach.dictionary.filter.Filter;
import edu.harvard.dbmi.avillach.dictionary.filter.FilterProcessor;
import edu.harvard.dbmi.avillach.dictionary.legacysearch.model.LegacySearchQuery;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.data.domain.PageRequest;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

@Component
public class LegacySearchQueryMapper {

private static final ObjectMapper objectMapper = new ObjectMapper();
private final FilterProcessor filterProcessor;
private static final Logger log = LoggerFactory.getLogger(LegacySearchQueryMapper.class);

public LegacySearchQueryMapper(FilterProcessor filterProcessor) {
this.filterProcessor = filterProcessor;
}
public LegacySearchQueryMapper() {}

public LegacySearchQuery mapFromJson(String jsonString) throws IOException {
JsonNode rootNode = objectMapper.readTree(jsonString);
JsonNode queryNode = rootNode.get("query");

String searchTerm = queryNode.get("searchTerm").asText();
int limit = queryNode.get("limit").asInt();
Filter filter = filterProcessor.processsFilter(new Filter(List.of(), searchTerm, List.of()));
return new LegacySearchQuery(filter, PageRequest.of(0, limit));
searchTerm = constructTsQuery(searchTerm);
log.debug("Constructed Search Term: {}", searchTerm);
return new LegacySearchQuery(new Filter(List.of(), searchTerm, List.of()), PageRequest.of(0, limit));
}

// An attempt to provide OR search that will produce similar results to legacy search-prototype
private String constructTsQuery(String searchTerm) {
// Split on the | to enable or queries
String[] orGroups = searchTerm.split("\\|");
List<String> orClauses = new ArrayList<>();

for (String group : orGroups) {
// To replicate legacy search we will split using its regex [\\s\\p{Punct}]+
String[] tokens = group.trim().split("[\\s\\p{Punct}]+");

// Now we will combine the tokens in this group and '&' them together.
String andClause = Arrays.stream(tokens).filter(token -> !token.isBlank()) // remove empty tokens.
.map(token -> token + ":*") // add the wild card for search
.collect(Collectors.joining(" & "));

if (!andClause.isBlank()) {
orClauses.add(andClause);
}
}


return String.join(" | ", orClauses);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public LegacySearchRepository(
}

public List<SearchResult> getLegacySearchResults(Filter filter, Pageable pageable) {
QueryParamPair filterQ = filterGen.generateFilterQuery(filter, pageable);
QueryParamPair filterQ = filterGen.generateLegacyFilterQuery(filter, pageable);
String sql = ALLOW_FILTERING_Q + ", " + filterQ.query() + """
SELECT concept_node.concept_path AS conceptPath,
concept_node.display AS display,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,38 @@ void shouldGetLegacyResponseByStudyID() throws IOException {
searchResults.forEach(searchResult -> Assertions.assertEquals("phs000007", searchResult.result().studyId()));
}

@Test
void shouldHandleORRequest() throws IOException {
String jsonString = """
{"query":{"searchTerm":"age","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}}
""";

ResponseEntity<LegacyResponse> legacyResponseResponseEntity = legacySearchController.legacySearch(jsonString);
Assertions.assertEquals(HttpStatus.OK, legacyResponseResponseEntity.getStatusCode());
LegacyResponse legacyResponseBody = legacyResponseResponseEntity.getBody();
Assertions.assertNotNull(legacyResponseBody);
Results results = legacyResponseBody.results();
List<SearchResult> ageSearchResults = results.searchResults();
Assertions.assertEquals(4, ageSearchResults.size());

jsonString = """
{"query":{"searchTerm":"physical|age","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}}
""";

legacyResponseResponseEntity = legacySearchController.legacySearch(jsonString);
Assertions.assertEquals(HttpStatus.OK, legacyResponseResponseEntity.getStatusCode());
legacyResponseBody = legacyResponseResponseEntity.getBody();
Assertions.assertNotNull(legacyResponseBody);
results = legacyResponseBody.results();
List<SearchResult> physicalORAgeSearchResults = results.searchResults();
Assertions.assertEquals(5, physicalORAgeSearchResults.size());

// Verify that age|physical has expanded the search results
Assertions.assertNotEquals(ageSearchResults.size(), physicalORAgeSearchResults.size());

// Verify the OR statement has more results
Assertions.assertTrue(ageSearchResults.size() < physicalORAgeSearchResults.size());
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@ void shouldParseSearchRequest() throws IOException {
Filter filter = legacySearchQuery.filter();
Pageable pageable = legacySearchQuery.pageable();

Assertions.assertEquals("age", filter.search());
Assertions.assertEquals("age:*", filter.search());
Assertions.assertEquals(100, pageable.getPageSize());
}

@Test
void shouldReplaceUnderscore() throws IOException {
void shouldHandlePunct() throws IOException {
String jsonString =
"""
{"query":{"searchTerm":"tutorial-biolincc_digitalis","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}}
Expand All @@ -43,7 +43,35 @@ void shouldReplaceUnderscore() throws IOException {
Filter filter = legacySearchQuery.filter();
Pageable pageable = legacySearchQuery.pageable();

Assertions.assertEquals("tutorial-biolincc/digitalis", filter.search());
Assertions.assertEquals("tutorial:* & biolincc:* & digitalis:*", filter.search());
Assertions.assertEquals(100, pageable.getPageSize());
}

@Test
void shouldHandleOR() throws IOException {
String jsonString = """
{"query":{"searchTerm":"sex|gender","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}}
""";

LegacySearchQuery legacySearchQuery = legacySearchQueryMapper.mapFromJson(jsonString);
Filter filter = legacySearchQuery.filter();
Pageable pageable = legacySearchQuery.pageable();

Assertions.assertEquals("sex:* | gender:*", filter.search());
Assertions.assertEquals(100, pageable.getPageSize());
}

@Test
void shouldHandleORAndPunct() throws IOException {
String jsonString = """
{"query":{"searchTerm":"sex|gender age","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}}
""";

LegacySearchQuery legacySearchQuery = legacySearchQueryMapper.mapFromJson(jsonString);
Filter filter = legacySearchQuery.filter();
Pageable pageable = legacySearchQuery.pageable();

Assertions.assertEquals("sex:* | gender:* & age:*", filter.search());
Assertions.assertEquals(100, pageable.getPageSize());
}

Expand Down

0 comments on commit 5cf4e51

Please sign in to comment.