From c0abd7db7bd27aa55cc178ea6c188c1338cd9682 Mon Sep 17 00:00:00 2001 From: Luke Sikina Date: Tue, 8 Oct 2024 12:39:39 -0400 Subject: [PATCH] [ALS-7452] Sort unfilterable vars to EOL --- .../concept/ConceptFilterQueryGenerator.java | 42 ++++++++++++++++-- .../concept/ConceptRepositoryTest.java | 16 +++---- .../ConceptFilterQueryGeneratorTest.java | 11 +++++ src/test/resources/seed.sql | 44 +++++++++---------- 4 files changed, 79 insertions(+), 34 deletions(-) diff --git a/src/main/java/edu/harvard/dbmi/avillach/dictionary/concept/ConceptFilterQueryGenerator.java b/src/main/java/edu/harvard/dbmi/avillach/dictionary/concept/ConceptFilterQueryGenerator.java index c86c48c..e492736 100644 --- a/src/main/java/edu/harvard/dbmi/avillach/dictionary/concept/ConceptFilterQueryGenerator.java +++ b/src/main/java/edu/harvard/dbmi/avillach/dictionary/concept/ConceptFilterQueryGenerator.java @@ -3,6 +3,8 @@ import edu.harvard.dbmi.avillach.dictionary.facet.Facet; import edu.harvard.dbmi.avillach.dictionary.filter.Filter; import edu.harvard.dbmi.avillach.dictionary.filter.QueryParamPair; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; import org.springframework.data.domain.Pageable; import org.springframework.jdbc.core.namedparam.MapSqlParameterSource; import org.springframework.stereotype.Component; @@ -14,6 +16,22 @@ @Component public class ConceptFilterQueryGenerator { + private final List disallowedMetaFields; + + private static final String RANK_ADJUSTMENTS = """ + , allow_filtering AS ( + SELECT + concept_node.concept_node_id AS concept_node_id, + (string_agg(concept_node_meta.value, ' ') NOT LIKE '%' || 'true' || '%')::int AS rank_adjustment + FROM + concept_node + JOIN concept_node_meta ON + concept_node.concept_node_id = concept_node_meta.concept_node_id + AND concept_node_meta.KEY IN (:disallowed_meta_keys) + GROUP BY + concept_node.concept_node_id + ) + """; private static final String CONSENT_QUERY = """ dataset.dataset_id IN ( @@ -36,6 +54,13 @@ dataset.dataset_id IN ( ) AND """; + @Autowired + public ConceptFilterQueryGenerator( + @Value("${filtering.unfilterable_concepts}") List disallowedMetaFields + ) { + this.disallowedMetaFields = disallowedMetaFields; + } + /** * This generates a query that will return a list of concept_node IDs for the given filter. *

@@ -50,6 +75,7 @@ dataset.dataset_id IN ( */ public QueryParamPair generateFilterQuery(Filter filter, Pageable pageable) { MapSqlParameterSource params = new MapSqlParameterSource(); + params.addValue("disallowed_meta_keys", disallowedMetaFields); List clauses = new java.util.ArrayList<>(List.of()); if (!CollectionUtils.isEmpty(filter.facets())) { clauses.addAll(createFacetFilter(filter, params)); @@ -68,11 +94,19 @@ public QueryParamPair generateFilterQuery(Filter filter, Pageable pageable) { WITH q AS ( %s ) - SELECT concept_node_id + %s + SELECT q.concept_node_id AS concept_node_id FROM q - GROUP BY concept_node_id - ORDER BY max(rank) DESC - """.formatted(query); + LEFT JOIN allow_filtering ON allow_filtering.concept_node_id = q.concept_node_id + GROUP BY q.concept_node_id + ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC, q.concept_node_id ASC + """.formatted(query, RANK_ADJUSTMENTS); + // explanation of ORDER BY max((1 + rank) * coalesce(rank_adjustment, 1)) DESC + // you want to sort the best matches first, BUT anything that is marked as unfilterable should be put last + // coalesce will return the first non null value; this solves rows that aren't marked as filterable or not + // I then multiply that by 1 + rank instead of just rank so that a rank value of 0 for an unfilterable var + // is placed below a rank value of 0 for a filterable var + // Finally, I add the concept node id to the sort to keep it stable for ties, otherwise pagination gets weird if (pageable.isPaged()) { superQuery = superQuery + """ diff --git a/src/test/java/edu/harvard/dbmi/avillach/dictionary/concept/ConceptRepositoryTest.java b/src/test/java/edu/harvard/dbmi/avillach/dictionary/concept/ConceptRepositoryTest.java index 9a51ac4..bca46b3 100644 --- a/src/test/java/edu/harvard/dbmi/avillach/dictionary/concept/ConceptRepositoryTest.java +++ b/src/test/java/edu/harvard/dbmi/avillach/dictionary/concept/ConceptRepositoryTest.java @@ -55,23 +55,23 @@ void shouldListAllConcepts() { @Test void shouldListFirstTwoConcepts() { List actual = subject.getConcepts(new Filter(List.of(), "", List.of()), Pageable.ofSize(2).first()); - List expected = List.of( - new ContinuousConcept("\\phs000007\\pht000021\\phv00003844\\FL200\\", "phv00003844", "FL200", "phs000007", "# 12 OZ CUPS OF CAFFEINATED COLA / DAY", true, 0F, 3F, "FHS", null), - new CategoricalConcept("\\Variant Data Type\\Low coverage WGS\\", "Low coverage WGS", "Low coverage WGS", "1", "Low coverage WGS", List.of("TRUE"), true, "GIC", null, null) + List expectedPaths = List.of( + "\\ACT Diagnosis ICD-10\\J00-J99 Diseases of the respiratory system (J00-J99)\\J40-J47 Chronic lower respiratory diseases (J40-J47)\\J45 Asthma\\J45.5 Severe persistent asthma\\J45.52 Severe persistent asthma with status asthmaticus\\", + "\\ACT Diagnosis ICD-10\\J00-J99 Diseases of the respiratory system (J00-J99)\\J40-J47 Chronic lower respiratory diseases (J40-J47)\\J45 Asthma\\J45.9 Other and unspecified asthma\\J45.90 Unspecified asthma\\J45.901 Unspecified asthma with (acute) exacerbation\\" ); - Assertions.assertEquals(expected, actual); + Assertions.assertEquals(expectedPaths, actual.stream().map(Concept::conceptPath).toList()); } @Test void shouldListNextTwoConcepts() { List actual = subject.getConcepts(new Filter(List.of(), "", List.of()), Pageable.ofSize(2).first().next()); - List expected = List.of( - new CategoricalConcept("\\phs002385\\RACEG\\", "RACEG", "RACEG", "phs002385", "Race (regrouped)", List.of("Not Reported"), true, "HCT_for_SCD", null, null), - new CategoricalConcept("\\Variant Data Type\\Low coverage WGS\\", "Low coverage WGS", "Low coverage WGS", "1", "Low coverage WGS", List.of("TRUE"), true, "GIC", null, null) + List expectedPaths = List.of( + "\\ACT Diagnosis ICD-10\\J00-J99 Diseases of the respiratory system (J00-J99)\\J40-J47 Chronic lower respiratory diseases (J40-J47)\\J45 Asthma\\J45.9 Other and unspecified asthma\\J45.90 Unspecified asthma\\J45.902 Unspecified asthma with status asthmaticus\\", + "\\Bio Specimens\\HumanFluid\\Blood (Whole)\\SPECIMENS:HF.BLD.000 Quantity\\" ); - Assertions.assertEquals(expected, actual); + Assertions.assertEquals(expectedPaths, actual.stream().map(Concept::conceptPath).toList()); } @Test diff --git a/src/test/java/edu/harvard/dbmi/avillach/dictionary/filter/ConceptFilterQueryGeneratorTest.java b/src/test/java/edu/harvard/dbmi/avillach/dictionary/filter/ConceptFilterQueryGeneratorTest.java index e0d73fc..2a66600 100644 --- a/src/test/java/edu/harvard/dbmi/avillach/dictionary/filter/ConceptFilterQueryGeneratorTest.java +++ b/src/test/java/edu/harvard/dbmi/avillach/dictionary/filter/ConceptFilterQueryGeneratorTest.java @@ -44,6 +44,17 @@ static void mySQLProperties(DynamicPropertyRegistry registry) { @Autowired NamedParameterJdbcTemplate template; + @Test + void shouldPutStigvarsLastForEmptySearch() { + Filter filter = new Filter(List.of(), "", List.of()); + QueryParamPair pair = subject.generateFilterQuery(filter, Pageable.unpaged()); + String query = "WITH " + pair.query() + "\n SELECT * FROM concepts_filtered_sorted;"; + + List actual = template.queryForList(query, pair.params(), Integer.class); + + Assertions.assertEquals(246, actual.getLast()); + } + @Test void shouldGenerateForHarmonizedConsents() { Filter filter = new Filter(List.of(), "", List.of("phs001963.c1")); diff --git a/src/test/resources/seed.sql b/src/test/resources/seed.sql index 37fa98f..9acff32 100644 --- a/src/test/resources/seed.sql +++ b/src/test/resources/seed.sql @@ -405,24 +405,24 @@ COPY public.concept_node (concept_node_id, dataset_id, name, display, concept_ty COPY public.concept_node_meta (concept_node_meta_id, concept_node_id, key, value) FROM stdin; 19 186 description Approximate Synonyms:\nSevere persistent allergic asthma in status asthmaticus\nSevere persistent allergic asthma with status asthmaticus\nSevere persistent asthma in status asthmaticus\nSevere persistent asthma with allergic rhinitis in status asthmaticus\nSevere persistent asthma with allergic rhinitis with status asthmaticus -20 186 values J45.52 Severe persistent asthma with status asthmaticus +20 186 values ["J45.52 Severe persistent asthma with status asthmaticus"] 21 189 description Approximate Synonyms:\nAcute exacerbation of asthma with allergic rhinitis\nAllergic asthma with acute exacerbation\nAsthma, with acute exacerbation (flare-up)\nAsthma, with allergic rhinitis with acute exacerbation\nExacerbation of asthma -22 189 values J45.901 Unspecified asthma with (acute) exacerbation +22 189 values ["J45.901 Unspecified asthma with (acute) exacerbation"] 23 190 description Approximate Synonyms:\nAsthma with allergic rhinitis in status asthmaticus\nAsthma with allergic rhinitis with status asthmaticus\nAsthma with status\nAsthma with status asthmaticus\nAsthma, allergic with status asthmaticus\nExtrinsic asthma with status asthmaticus -24 190 values J45.902 Unspecified asthma with status asthmaticus +24 190 values ["J45.902 Unspecified asthma with status asthmaticus"] 25 211 description GIC biosample: wholeblood 26 211 data_source Biosample -27 212 values TRUE -28 214 values TRUE +27 212 values ["TRUE"] +28 214 values ["TRUE"] 29 216 description Those patients who align with the IRB Phase 2 protocols -30 216 values GIC Consent +30 216 values ["GIC Consent"] 31 217 description Those patients who DO NOT align with the IRB Phase 2 protocols -32 217 values GIC Legacy Consent +32 217 values ["GIC Legacy Consent"] 33 218 description Patients who have waived consent -34 218 values Waiver of Consent +34 218 values ["Waiver of Consent"] 35 222 description Heart rate is taken by the automated blood pressure/heart rate monitor and captured directly into the computer system. In the event the heart rate is not captured automatically at the end of stage 1, the technician would manually enter the readings from the heart rate monitor. 37 225 description Including living and deceased, were any of {SP's/your} close biological that is, blood relatives including father, mother, sisters or brothers, ever told by a health professional that they had a heart attack or angina (an-gi-na) before the age of 50? -38 225 values Yes +38 225 values ["Yes"] 39 229 description # 12 OZ CUPS OF CAFFEINATED COLA / DAY 41 229 stigmatized false 42 229 unique_identifier false @@ -439,7 +439,7 @@ COPY public.concept_node_meta (concept_node_meta_id, concept_node_id, key, value 55 235 free_text false 56 235 bdc_open_access true 57 239 description Most recent occupation (A) -58 239 values ACCOUNTANT +58 239 values ["ACCOUNTANT"] 59 241 description Age 61 241 stigmatized false 62 241 unique_identifier false @@ -451,7 +451,7 @@ COPY public.concept_node_meta (concept_node_meta_id, concept_node_id, key, value 69 242 free_text false 70 242 bdc_open_access true 71 244 description Patient age at transplant, years -72 244 values 42 +72 244 values [42] 73 244 stigmatized false 74 244 unique_identifier false 75 244 free_text false @@ -459,13 +459,13 @@ COPY public.concept_node_meta (concept_node_meta_id, concept_node_id, key, value 77 244 hct status pre-hct 78 244 computed variable yes 79 245 description Race (regrouped) -80 245 values Not Reported +80 245 values ["Not Reported"] 81 245 stigmatized false 82 245 unique_identifier false 83 245 free_text false 84 245 bdc_open_access true 85 246 description Transplant Number -86 246 values 1 +86 246 values [1] 87 246 stigmatized true 88 246 unique_identifier false 89 246 free_text false @@ -473,44 +473,44 @@ COPY public.concept_node_meta (concept_node_meta_id, concept_node_id, key, value 91 246 hct status pre-hct 92 246 computed variable yes 93 248 description Participant's age (category) -94 248 values 21 +94 248 values [21] 95 248 stigmatized false 96 248 unique_identifier false 97 248 free_text false 98 248 bdc_open_access true 99 249 description Smoker status -100 249 values true +100 249 values ["true"] 101 249 stigmatized false 102 249 unique_identifier false 103 249 free_text false 104 249 bdc_open_access true 105 253 description (AFC) Reason for ending future contact: Withdrew consent for future nuMoM2b contact -106 253 values No +106 253 values ["No"] 107 253 unique_identifier false 108 253 free_text false 109 253 bdc_open_access true 110 255 description (V5A) Which of the following problems have a doctor or health care professional told you that you have with your kidney?: Other - Specify Field -111 255 values infection +111 255 values ["infection"] 112 255 unique_identifier false 113 255 free_text true 114 255 bdc_open_access false 115 258 description (T01) Are you currently prescribed medication for your high blood pressure? -116 258 values Yes +116 258 values ["Yes"] 117 258 unique_identifier false 118 258 free_text false 119 258 bdc_open_access true 120 266 description Genotype array -121 266 values TRUE +121 266 values ["TRUE"] 122 267 description Low coverage WGS -123 267 values TRUE +123 267 values ["TRUE"] 124 268 description Whole exome sequencing 40 229 values [0, 3] 46 232 values [0, 1] 52 235 values [0.57,6.77] 60 241 values ["5E-21", "7E+33"] -125 268 values TRUE +125 268 values ["TRUE"] 126 269 description Whole genome sequencing -127 269 values TRUE +127 269 values ["TRUE"] 128 227 description Clinic Exam, Original Cohort Exam 19 129 230 description Clinic Exam, Original Cohort Exam 20 130 233 description Clinic Exam, Offspring Cohort Exam 4