From eeff1d73e403c2a9ec627d383760832e3a53a57b Mon Sep 17 00:00:00 2001 From: Roman Chyla Date: Mon, 8 Mar 2021 18:36:52 -0500 Subject: [PATCH] Fix for affiliations and synonym searches --- .../builders/AqpAdsabsSubQueryProvider.java | 32 ++++++- .../AqpAdsabsAnalyzerProcessor.java | 13 +-- .../AqpAdsabsRegexNodeProcessor.java | 20 ++++- .../processors/AqpFuzzyModifierProcessor.java | 0 .../TestAdsabsTypeAffiliationTokens.java | 83 ++++++++++++++++++- .../server/solr/collection1/conf/schema.xml | 5 +- .../solr/collection1/conf/solrconfig.xml | 5 +- 7 files changed, 141 insertions(+), 17 deletions(-) rename contrib/{antlrqueryparser => adsabs}/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpFuzzyModifierProcessor.java (100%) diff --git a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpAdsabsSubQueryProvider.java b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpAdsabsSubQueryProvider.java index f7a782e11..de2546473 100644 --- a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpAdsabsSubQueryProvider.java +++ b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpAdsabsSubQueryProvider.java @@ -46,6 +46,7 @@ import org.apache.lucene.search.SecondOrderQuery; import org.apache.lucene.search.SimpleCollector; import org.apache.lucene.search.SolrCacheWrapper; +import org.apache.lucene.search.SynonymQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopFieldCollector; @@ -383,7 +384,8 @@ public Query parse(FunctionQParser fp) throws SyntaxError { int positionIncrementGap = 1; if (fp.getReq() != null) { IndexSchema schema = fp.getReq().getSchema(); - SchemaField field = schema.getFieldOrNull(query.toString().split(":")[0]); + String f = getField(query); + SchemaField field = schema.getFieldOrNull(f); if (field != null) { FieldType fType = field.getType(); //if (!fType.isMultiValued()) { @@ -423,6 +425,34 @@ public Query parse(FunctionQParser fp) throws SyntaxError { query = new BoostQuery(query, boostFactor); return query; } + + private String getField(Query query) throws SyntaxError { + + if (query instanceof TermQuery) { + return ((TermQuery) query).getTerm().field(); + } + else if (query instanceof SynonymQuery) { + for (Term t: ((SynonymQuery) query).getTerms()) { + return t.field(); + } + } + else if (query instanceof BooleanQuery) { + HashSet s = new HashSet(); + for (BooleanClause c: ((BooleanQuery) query).clauses()) { + s.add(getField(c.getQuery())); + } + if (s.size() > 1) { + throw new SyntaxError("Illegal state: combining multiple fields is not allowed"); + } + return (String) s.toArray()[0]; + } + else { + // last resort + return query.toString().split(":")[0]; + } + + return null; + } }); /* @api.doc diff --git a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpAdsabsAnalyzerProcessor.java b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpAdsabsAnalyzerProcessor.java index ab38dc1b7..bc3d9e899 100644 --- a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpAdsabsAnalyzerProcessor.java +++ b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpAdsabsAnalyzerProcessor.java @@ -8,7 +8,6 @@ import org.apache.lucene.queryparser.flexible.aqp.builders.AqpFunctionQueryBuilder; import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler; -import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpANTLRNode; import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpFunctionQueryNode; import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpNonAnalyzedQueryNode; import org.apache.lucene.queryparser.flexible.aqp.parser.AqpStandardQueryConfigHandler; @@ -19,6 +18,7 @@ import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode; import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode; import org.apache.lucene.queryparser.flexible.messages.MessageImpl; +import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAnalyzerQueryNodeProcessor; /** * This processor prevents analysis to happen for nodes that are @@ -118,16 +118,16 @@ protected QueryNode postProcessNode(QueryNode node) return rn; } - - private Map getConfigMap() { + protected static Map empty = new HashMap(); + public Map getConfigMap() { Map args = getQueryConfigHandler().get( AqpStandardQueryConfigHandler.ConfigurationKeys.NAMED_PARAMETER); if (args == null) - return new HashMap(); + return empty; return args; } - private String getConfigVal(String key) { + public String getConfigVal(String key) { Map args = getConfigMap(); if (args.containsKey(key)) { return args.get(key); @@ -135,7 +135,7 @@ private String getConfigVal(String key) { return null; } - private String getConfigVal(String key, String defaultVal) { + public String getConfigVal(String key, String defaultVal) { Map args = getConfigMap(); if (args.containsKey(key)) { return args.get(key); @@ -143,4 +143,5 @@ private String getConfigVal(String key, String defaultVal) { return defaultVal; } + } diff --git a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpAdsabsRegexNodeProcessor.java b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpAdsabsRegexNodeProcessor.java index 646545fd1..6fd8c24e1 100644 --- a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpAdsabsRegexNodeProcessor.java +++ b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpAdsabsRegexNodeProcessor.java @@ -16,7 +16,7 @@ import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpAdsabsRegexQueryNode; import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpNonAnalyzedQueryNode; -public class AqpAdsabsRegexNodeProcessor extends QueryNodeProcessorImpl implements +public class AqpAdsabsRegexNodeProcessor extends AqpQueryNodeProcessorImpl implements QueryNodeProcessor { @Override @@ -33,6 +33,10 @@ protected QueryNode preProcessNode(QueryNode node) FieldQueryNode n = (FieldQueryNode) node; String input = n.getTextAsString(); + String field = n.getFieldAsString(); + if (!isRegexField(field)) + return node; + if (input == null) { return node; } @@ -62,7 +66,19 @@ protected QueryNode preProcessNode(QueryNode node) return node; } - @Override + // can be overriden to plug in some configuration + protected boolean isRegexField(String field) { + String val = getConfigVal("aqp.regex.disallowed.fields", null); + if (val == null) + return true; + for (String f: val.split(";")) { + if (f.equals(field)) + return false; + } + return true; + } + + @Override protected List setChildrenOrder(List children) throws QueryNodeException { return children; diff --git a/contrib/antlrqueryparser/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpFuzzyModifierProcessor.java b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpFuzzyModifierProcessor.java similarity index 100% rename from contrib/antlrqueryparser/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpFuzzyModifierProcessor.java rename to contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpFuzzyModifierProcessor.java diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java index c202b254d..cbdd4a3c9 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java @@ -78,6 +78,9 @@ public static String getSchemaFile() { + "ror.1;foo;bar\n" + "A00001;Aalborg U;Aalborg University;RID1004;04m5j1k67;000000010742471X;Q601956;grid.5117.2;\n\n" + "A00002;Aarhus U;Aarhus University;RID1006;01aj84f44;0000000119562722;Q924265;grid.7048.b;\n" + + "A01400;SI/CfA;Center for Astrophysics | Harvard and Smithsonian;Harvard Smithsonian Center for Astrophysics;RID61814;03c3r2d17;Q1133697;grid.455754.2\n" + + "AX;SI\n" + + "AB=>CfA\n" }); replaceInFile(newConfig, "synonyms=\"aff_id.synonyms\"", @@ -99,6 +102,16 @@ public void test() throws Exception { "institution", "U Catania/Dep Phy Ast; -; -; INFN/Catania", "institution", "U Catania/Dep Phy Ast; -" )); + assertU(addDocs( + "institution", "SI/CfA; Harvard U/CfA", + "institution", "Harvard U/Phys; Brown U/Ast", + "aff", "SI/CfA") + ); + assertU(addDocs( + "institution", "Harvard U/Law; -", + "institution", "SI/CfA; Harvard U/CfA", + "aff", "SI/CfA") + ); assertU(commit()); assertQueryEquals(req("q", "aff_id:https\\://ror.org/ror.1"), @@ -193,7 +206,7 @@ public void test() throws Exception { assertQ(req("q", "pos(institution:\"U Catania/Dep Phy Ast\", 1)"), "//*[@numFound='1']"); assertQ(req("q", "pos(institution:\"U Catania\", 1)"), "//*[@numFound='1']"); assertQ(req("q", "pos(institution:\"Dep Phy Ast\", 1)"), "//*[@numFound='1']"); - assertQ(req("q", "pos(institution:\"-\", 1)"), "//*[@numFound='1']"); + assertQ(req("q", "pos(institution:\"-\", 1)"), "//*[@numFound='2']"); assertQ(req("q", "pos(institution:\"INFN/Catania\", 1)"), "//*[@numFound='0']"); assertQ(req("q", "pos(institution:\"INFN\", 1)"), "//*[@numFound='0']"); assertQ(req("q", "pos(institution:\"Catania\", 1)"), "//*[@numFound='0']"); @@ -201,6 +214,74 @@ public void test() throws Exception { assertQ(req("q", "pos(institution:\"INFN/Catania\", 2)"), "//*[@numFound='1']"); assertQ(req("q", "pos(institution:\"INFN\", 2)"), "//*[@numFound='1']"); assertQ(req("q", "pos(institution:\"Catania\", 2)"), "//*[@numFound='1']"); + + // search parts of the affiliation + assertQ(req("q", "institution:\"SI\""), "//*[@numFound='2']"); + assertQ(req("q", "institution:\"CfA\""), "//*[@numFound='2']"); + assertQ(req("q", "institution:\"Harvard U\""), "//*[@numFound='2']"); + assertQ(req("q", "institution:\"Law\""), "//*[@numFound='1']"); + assertQ(req("q", "institution:\"Phys\""), "//*[@numFound='1']"); + + // search parts (but honour position) + assertQ(req("q", "pos(institution:\"SI\", 1)"), "//*[@numFound='1']"); + assertQ(req("q", "pos(institution:\"CfA\", 1)"), "//*[@numFound='1']"); + assertQ(req("q", "pos(institution:\"Harvard U\", 2)"), "//*[@numFound='2']"); + assertQ(req("q", "pos(institution:\"CfA\", 2)"), "//*[@numFound='1']"); + + // search parent/child + assertQ(req("q", "institution:\"SI/CfA\""), "//*[@numFound='2']"); + + // do the same but as phrase; it should fail because the parser WILL NOT + // treat empty space as a delimiter; it considers it part of the token + // like 'Harvard U' + assertQ(req("q", "institution:\"SI CfA\""), "//*[@numFound='0']"); + + // proximity operator however should yield the record + assertQ(req("q", "institution:\"SI\" NEAR1 institution:\"CfA\""), "//*[@numFound='2']"); + + // but not mix up insitutions that were separated by ';' (those affiliations + // belong to different authors/persons) + assertQ(req("q", "institution:\"Law\" NEAR5 institution:\"CfA\""), "//*[@numFound='0']"); + + // one person however can have multiple affiliations; and they can be searched via proximity + assertQ(req("q", "institution:\"Phys\" NEAR5 institution:\"Ast\""), "//*[@numFound='1']"); + + // SI/CfA is also known through identifiers/canonical names - NOTE: the input synonyms + // MUST CONTAIN correct entry, i.e. "SI/CfA" - for some reason we used to have "SI CfA" + assertQ(req("q", "institution:\"A01400\""), "//*[@numFound='2']"); + assertQ(req("q", "institution:\"RID61814\""), "//*[@numFound='2']"); + assertQ(req("q", "institution:\"Harvard Smithsonian Center for Astrophysics\""), "//*[@numFound='2']"); + assertQ(req("q", "institution:\"03c3r2d17\""), "//*[@numFound='2']"); + assertQ(req("q", "institution:\"Q1133697\""), "//*[@numFound='2']"); + assertQ(req("q", "institution:\"grid.455754.2\""), "//*[@numFound='2']"); + + + // what is the meaning of the pipe? (|) -- it forces our parser to treat the query + // as a regex; to not do that we have to set aqp.regex.disallowed.fields + assertQ(req("q", "institution:\"Center for Astrophysics | Harvard and Smithsonian\"", + "aqp.regex.disallowed.fields", "institution"), "//*[@numFound='2']"); + + // and we also want to find the records via parent/child relationship BUT using + // synonyms; so assume that parent (SI) is also known under synonym 'AX' and + // CfA is known under synonym 'AB'; the search "AX/AB" should then find the same + // thing as "SI/CfA" -- HOWEVER, note, this feature requires synonym mapping + // either of the explicit form: + // AX => SI + // or more forgiving (and more wasteful): + // AX;SI + // IMHO this feature is confusing; will bloat the synonym file and users + // are going to be confused by it. I'd just say: use canonical forms of + // the synonym. And, and... be aware that if the synonym file contains a cycle + // i.e. 'parent/child' entry sharing synonyms with 'child' entry; then the + // two will be merged and considered as one: + // + // SI/CfA;A01400 + // CfA;A014000 + // + // becomes: + // SI/CfA;A01400;CfA + assertQ(req("q", "institution:\"AX/AB\""), "//*[@numFound='2']"); + } diff --git a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml index 483d7821d..22bb10b0e 100644 --- a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml +++ b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml @@ -892,6 +892,7 @@ types="wdafftypes.txt" /> + + @@ -1292,9 +1294,6 @@ docValues="true"/> - - ${solr.ramBufferSize:1000} ${solr.maxBufferedDocs:50000} - - ${solr.mergePolicy.maxMergeAtOnce:10} - ${solr.mergePolicy.segmentsPerTier:10} - +