Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor legacy search query processing #57

Merged
merged 4 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion dictonaryReqeust.http
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ Content-Type: application/json
POST http://localhost:80/search
Content-Type: application/json

{"@type":"GeneralQueryRequest","resourceCredentials":{},"query":{"searchTerm":"breast","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":10000000},"resourceUUID":null}
{"@type":"GeneralQueryRequest","resourceCredentials":{},"query":{"searchTerm":"throat sore acute #8","includedTags":[],
"excludedTags":[],"returnTags":"true","offset":0,"limit":100000},"resourceUUID":null}
Original file line number Diff line number Diff line change
Expand Up @@ -82,37 +82,7 @@ public QueryParamPair generateFilterQuery(Filter filter, Pageable pageable) {
params.addValue("consents", filter.consents());
}
clauses.add(createValuelessNodeFilter(filter.search(), filter.consents()));


String query = "(\n" + String.join("\n\tINTERSECT\n", clauses) + "\n)";
String superQuery = """
WITH q AS (
%s
)
%s
SELECT q.concept_node_id AS concept_node_id, max((1 + rank) * coalesce(rank_adjustment, 1)) AS rank
FROM q
LEFT JOIN allow_filtering ON allow_filtering.concept_node_id = q.concept_node_id
GROUP BY q.concept_node_id
ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC, q.concept_node_id ASC
""".formatted(query, RANK_ADJUSTMENTS);
// explanation of ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC
// you want to sort the best matches first, BUT anything that is marked as unfilterable should be put last
// coalesce will return the first non null value; this solves rows that aren't marked as filterable or not
// I then multiply that by 1 + rank instead of just rank so that a rank value of 0 for an unfilterable var
// is placed below a rank value of 0 for a filterable var
// Finally, I add the concept node id to the sort to keep it stable for ties, otherwise pagination gets weird

if (pageable.isPaged()) {
superQuery = superQuery + """
LIMIT :limit
OFFSET :offset
""";
params.addValue("limit", pageable.getPageSize()).addValue("offset", pageable.getOffset());
}

superQuery = " concepts_filtered_sorted AS (\n" + superQuery + "\n)";

String superQuery = getSuperQuery(pageable, clauses, params);

return new QueryParamPair(superQuery, params);
}
Expand Down Expand Up @@ -184,4 +154,78 @@ facet.name IN (:facets_for_category_%s ) AND facet_category.name = :category_%s
}).toList();
}

public QueryParamPair generateLegacyFilterQuery(Filter filter, Pageable pageable) {
MapSqlParameterSource params = new MapSqlParameterSource();
params.addValue("disallowed_meta_keys", disallowedMetaFields);
List<String> clauses = new java.util.ArrayList<>(List.of());
if (StringUtils.hasLength(filter.search())) {
params.addValue("dynamic_tsquery", filter.search().trim());
}
clauses.add(createDynamicValuelessNodeFilter(filter.search()));
String superQuery = getSuperQuery(pageable, clauses, params);

return new QueryParamPair(superQuery, params);
}

private String createDynamicValuelessNodeFilter(String search) {
String rankQuery = "0 as rank";
String rankWhere = "";
if (StringUtils.hasLength(search)) {
rankQuery = "ts_rank(searchable_fields, to_tsquery(:dynamic_tsquery)) AS rank";
rankWhere = "concept_node.searchable_fields @@ to_tsquery(:dynamic_tsquery) AND";
}
return """
SELECT
concept_node.concept_node_id,
%s
FROM
concept_node
LEFT JOIN dataset ON concept_node.dataset_id = dataset.dataset_id
LEFT JOIN concept_node_meta AS continuous_min ON concept_node.concept_node_id = continuous_min.concept_node_id AND continuous_min.KEY = 'min'
LEFT JOIN concept_node_meta AS continuous_max ON concept_node.concept_node_id = continuous_max.concept_node_id AND continuous_max.KEY = 'max'
LEFT JOIN concept_node_meta AS categorical_values ON concept_node.concept_node_id = categorical_values.concept_node_id AND categorical_values.KEY = 'values'
WHERE
%s
%s
(
continuous_min.value <> '' OR
continuous_max.value <> '' OR
categorical_values.value <> ''
)
"""
.formatted(rankQuery, rankWhere, "");
}

private static String getSuperQuery(Pageable pageable, List<String> clauses, MapSqlParameterSource params) {
String query = "(\n" + String.join("\n\tINTERSECT\n", clauses) + "\n)";
String superQuery = """
WITH q AS (
%s
)
%s
SELECT q.concept_node_id AS concept_node_id, max((1 + rank) * coalesce(rank_adjustment, 1)) AS rank
FROM q
LEFT JOIN allow_filtering ON allow_filtering.concept_node_id = q.concept_node_id
GROUP BY q.concept_node_id
ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC, q.concept_node_id ASC
""".formatted(query, RANK_ADJUSTMENTS);
// explanation of ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC
// you want to sort the best matches first, BUT anything that is marked as unfilterable should be put last
// coalesce will return the first non null value; this solves rows that aren't marked as filterable or not
// I then multiply that by 1 + rank instead of just rank so that a rank value of 0 for an unfilterable var
// is placed below a rank value of 0 for a filterable var
// Finally, I add the concept node id to the sort to keep it stable for ties, otherwise pagination gets weird

if (pageable.isPaged()) {
superQuery = superQuery + """
LIMIT :limit
OFFSET :offset
""";
params.addValue("limit", pageable.getPageSize()).addValue("offset", pageable.getOffset());
}

superQuery = " concepts_filtered_sorted AS (\n" + superQuery + "\n)";
return superQuery;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,59 @@
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import edu.harvard.dbmi.avillach.dictionary.filter.Filter;
import edu.harvard.dbmi.avillach.dictionary.filter.FilterProcessor;
import edu.harvard.dbmi.avillach.dictionary.legacysearch.model.LegacySearchQuery;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.data.domain.PageRequest;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

@Component
public class LegacySearchQueryMapper {

private static final ObjectMapper objectMapper = new ObjectMapper();
private final FilterProcessor filterProcessor;
private static final Logger log = LoggerFactory.getLogger(LegacySearchQueryMapper.class);

public LegacySearchQueryMapper(FilterProcessor filterProcessor) {
this.filterProcessor = filterProcessor;
}
public LegacySearchQueryMapper() {}

public LegacySearchQuery mapFromJson(String jsonString) throws IOException {
JsonNode rootNode = objectMapper.readTree(jsonString);
JsonNode queryNode = rootNode.get("query");

String searchTerm = queryNode.get("searchTerm").asText();
int limit = queryNode.get("limit").asInt();
Filter filter = filterProcessor.processsFilter(new Filter(List.of(), searchTerm, List.of()));
return new LegacySearchQuery(filter, PageRequest.of(0, limit));
searchTerm = constructTsQuery(searchTerm);
log.debug("Constructed Search Term: {}", searchTerm);
return new LegacySearchQuery(new Filter(List.of(), searchTerm, List.of()), PageRequest.of(0, limit));
}

// An attempt to provide OR search that will produce similar results to legacy search-prototype
private String constructTsQuery(String searchTerm) {
// Split on the | to enable or queries
String[] orGroups = searchTerm.split("\\|");
List<String> orClauses = new ArrayList<>();

for (String group : orGroups) {
// To replicate legacy search we will split using its regex [\\s\\p{Punct}]+
String[] tokens = group.trim().split("[\\s\\p{Punct}]+");

// Now we will combine the tokens in this group and '&' them together.
String andClause = Arrays.stream(tokens).filter(token -> !token.isBlank()) // remove empty tokens.
.map(token -> token + ":*") // add the wild card for search
.collect(Collectors.joining(" & "));

if (!andClause.isBlank()) {
orClauses.add(andClause);
}
}


return String.join(" | ", orClauses);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public LegacySearchRepository(
}

public List<SearchResult> getLegacySearchResults(Filter filter, Pageable pageable) {
QueryParamPair filterQ = filterGen.generateFilterQuery(filter, pageable);
QueryParamPair filterQ = filterGen.generateLegacyFilterQuery(filter, pageable);
String sql = ALLOW_FILTERING_Q + ", " + filterQ.query() + """
SELECT concept_node.concept_path AS conceptPath,
concept_node.display AS display,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,38 @@ void shouldGetLegacyResponseByStudyID() throws IOException {
searchResults.forEach(searchResult -> Assertions.assertEquals("phs000007", searchResult.result().studyId()));
}

@Test
void shouldHandleORRequest() throws IOException {
String jsonString = """
{"query":{"searchTerm":"age","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}}
""";

ResponseEntity<LegacyResponse> legacyResponseResponseEntity = legacySearchController.legacySearch(jsonString);
Assertions.assertEquals(HttpStatus.OK, legacyResponseResponseEntity.getStatusCode());
LegacyResponse legacyResponseBody = legacyResponseResponseEntity.getBody();
Assertions.assertNotNull(legacyResponseBody);
Results results = legacyResponseBody.results();
List<SearchResult> ageSearchResults = results.searchResults();
Assertions.assertEquals(4, ageSearchResults.size());

jsonString = """
{"query":{"searchTerm":"physical|age","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}}
""";

legacyResponseResponseEntity = legacySearchController.legacySearch(jsonString);
Assertions.assertEquals(HttpStatus.OK, legacyResponseResponseEntity.getStatusCode());
legacyResponseBody = legacyResponseResponseEntity.getBody();
Assertions.assertNotNull(legacyResponseBody);
results = legacyResponseBody.results();
List<SearchResult> physicalORAgeSearchResults = results.searchResults();
Assertions.assertEquals(5, physicalORAgeSearchResults.size());

// Verify that age|physical has expanded the search results
Assertions.assertNotEquals(ageSearchResults.size(), physicalORAgeSearchResults.size());

// Verify the OR statement has more results
Assertions.assertTrue(ageSearchResults.size() < physicalORAgeSearchResults.size());
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@ void shouldParseSearchRequest() throws IOException {
Filter filter = legacySearchQuery.filter();
Pageable pageable = legacySearchQuery.pageable();

Assertions.assertEquals("age", filter.search());
Assertions.assertEquals("age:*", filter.search());
Assertions.assertEquals(100, pageable.getPageSize());
}

@Test
void shouldReplaceUnderscore() throws IOException {
void shouldHandlePunct() throws IOException {
String jsonString =
"""
{"query":{"searchTerm":"tutorial-biolincc_digitalis","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}}
Expand All @@ -43,7 +43,35 @@ void shouldReplaceUnderscore() throws IOException {
Filter filter = legacySearchQuery.filter();
Pageable pageable = legacySearchQuery.pageable();

Assertions.assertEquals("tutorial-biolincc/digitalis", filter.search());
Assertions.assertEquals("tutorial:* & biolincc:* & digitalis:*", filter.search());
Assertions.assertEquals(100, pageable.getPageSize());
}

@Test
void shouldHandleOR() throws IOException {
String jsonString = """
{"query":{"searchTerm":"sex|gender","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}}
""";

LegacySearchQuery legacySearchQuery = legacySearchQueryMapper.mapFromJson(jsonString);
Filter filter = legacySearchQuery.filter();
Pageable pageable = legacySearchQuery.pageable();

Assertions.assertEquals("sex:* | gender:*", filter.search());
Assertions.assertEquals(100, pageable.getPageSize());
}

@Test
void shouldHandleORAndPunct() throws IOException {
String jsonString = """
{"query":{"searchTerm":"sex|gender age","includedTags":[],"excludedTags":[],"returnTags":"true","offset":0,"limit":100}}
""";

LegacySearchQuery legacySearchQuery = legacySearchQueryMapper.mapFromJson(jsonString);
Filter filter = legacySearchQuery.filter();
Pageable pageable = legacySearchQuery.pageable();

Assertions.assertEquals("sex:* | gender:* & age:*", filter.search());
Assertions.assertEquals(100, pageable.getPageSize());
}

Expand Down