diff --git a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpSlopQueryNodeBuilder.java b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpSlopQueryNodeBuilder.java index 0158c0904..3ee5a1c2b 100644 --- a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpSlopQueryNodeBuilder.java +++ b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpSlopQueryNodeBuilder.java @@ -1,6 +1,7 @@ package org.apache.lucene.queryparser.flexible.aqp.builders; import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAnalyzerQueryNodeProcessor; import org.apache.lucene.queryparser.flexible.aqp.processors.AqpPostAnalysisProcessor; @@ -23,13 +24,13 @@ import org.apache.lucene.queryparser.flexible.core.QueryNodeException; import org.apache.lucene.queryparser.flexible.core.builders.QueryTreeBuilder; -import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode; import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode; import org.apache.lucene.queryparser.flexible.core.nodes.SlopQueryNode; import org.apache.lucene.queryparser.flexible.standard.builders.StandardQueryBuilder; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PhraseQuery.Builder; +import org.apache.solr.request.SolrQueryRequest; import org.apache.lucene.search.Query; /** diff --git a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpChangeRewriteMethodProcessor.java b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpChangeRewriteMethodProcessor.java index 04f27a6f8..7ec63957c 100644 --- a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpChangeRewriteMethodProcessor.java +++ b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpChangeRewriteMethodProcessor.java @@ -20,7 +20,6 @@ import java.util.HashSet; import java.util.LinkedList; import java.util.List; -import java.util.Map; import java.util.Set; import java.util.TreeMap; @@ -29,11 +28,9 @@ import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler; import org.apache.lucene.queryparser.flexible.aqp.config.AqpRequestParams; import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpAdsabsScoringQueryNode; -import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpFunctionQueryNode; import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpOrQueryNode; import org.apache.lucene.queryparser.flexible.core.QueryNodeException; -import org.apache.lucene.queryparser.flexible.core.builders.QueryBuilder; -import org.apache.lucene.queryparser.flexible.core.builders.QueryTreeBuilder; +import org.apache.lucene.queryparser.flexible.core.config.FieldConfig; import org.apache.lucene.queryparser.flexible.core.messages.QueryParserMessages; import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode; import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode; @@ -44,10 +41,11 @@ import org.apache.lucene.queryparser.flexible.standard.nodes.WildcardQueryNode; import org.apache.lucene.queryparser.flexible.standard.processors.MultiTermRewriteMethodProcessor; import org.apache.lucene.search.MultiTermQuery; -import org.apache.lucene.search.TermQuery; import org.apache.solr.common.params.ModifiableSolrParams; -import org.apache.solr.common.params.SolrParams; +import org.apache.solr.handler.AdsConfigHandler; import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.schema.FieldType; +import org.apache.solr.schema.IndexSchema; import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.util.RefCounted; @@ -56,6 +54,7 @@ public class AqpChangeRewriteMethodProcessor extends boolean first = true; private Set types = null; private Set fields = null; + private Set ignoredFields = null; protected QueryNode preProcessNode(QueryNode node) throws QueryNodeException { @@ -111,6 +110,37 @@ else if (node instanceof MultiPhraseQueryNode) { } } + if (getConfigVal("aqp.multiphrase.keep_one.ignore.fields", null) != null) { + + if (ignoredFields == null) { + ignoredFields = new HashSet(); + for (String s: getConfigVal("aqp.multiphrase.keep_one.ignore.fields").split(",")) { + ignoredFields.add(s); + } + } + + if (ignoredFields.contains((String)((MultiPhraseQueryNode) node).getField())) { + + // for ignored fields, we don't want to do proximity search + for (QueryNode child: node.getChildren()) { + child.setTag(AqpAnalyzerQueryNodeProcessor.MAX_MULTI_TOKEN_SIZE, 0); + } + + return node; + } + } + + + AqpRequestParams reqAttr = this.getQueryConfigHandler().get(AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_REQUEST); + if (reqAttr != null) { + IndexSchema schema = reqAttr.getRequest().getSchema(); + FieldType fType = schema.getFieldType((String)((MultiPhraseQueryNode) node).getField()); + if (fType != null) { + node.setTag("field.is.tokenized", fType.isTokenized()); + } + } + + try { node = simplifyMultiphrase(node, types); } catch (IOException e) { diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java index cbdd4a3c9..4ab5d6123 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java @@ -25,6 +25,7 @@ import java.io.IOException; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.SynonymQuery; import org.apache.lucene.search.TermQuery; @@ -78,9 +79,11 @@ public static String getSchemaFile() { + "ror.1;foo;bar\n" + "A00001;Aalborg U;Aalborg University;RID1004;04m5j1k67;000000010742471X;Q601956;grid.5117.2;\n\n" + "A00002;Aarhus U;Aarhus University;RID1006;01aj84f44;0000000119562722;Q924265;grid.7048.b;\n" - + "A01400;SI/CfA;Center for Astrophysics | Harvard and Smithsonian;Harvard Smithsonian Center for Astrophysics;RID61814;03c3r2d17;Q1133697;grid.455754.2\n" + //+ "A01400;SI/CfA;Center for Astrophysics | Harvard and Smithsonian;Harvard Smithsonian Center for Astrophysics;RID61814;03c3r2d17;Q1133697;grid.455754.2\n" + "AX;SI\n" + "AB=>CfA\n" + + "A01400;CfA;SI/CfA;Harvard U/CfA;Center for Astrophysics Harvard and Smithsonian;Harvard Smithsonian Center for Astrophysics;RID61814;03c3r2d17;Q1133697;grid.455754.2\n" + + "A01397;SI;Smithsonian Institution;RID8264;01pp8nd67;0000000087163312;Q131626;grid.1214.6" }); replaceInFile(newConfig, "synonyms=\"aff_id.synonyms\"", @@ -258,9 +261,12 @@ public void test() throws Exception { // what is the meaning of the pipe? (|) -- it forces our parser to treat the query // as a regex; to not do that we have to set aqp.regex.disallowed.fields - assertQ(req("q", "institution:\"Center for Astrophysics | Harvard and Smithsonian\"", - "aqp.regex.disallowed.fields", "institution"), "//*[@numFound='2']"); + //assertQ(req("q", "institution:\"Center for Astrophysics | Harvard and Smithsonian\"", + // "aqp.regex.disallowed.fields", "institution"), "//*[@numFound='2']"); + assertQ(req("q", "institution:\"Center for Astrophysics Harvard and Smithsonian\"", + "aqp.regex.disallowed.fields", "institution"), "//*[@numFound='2']"); + // and we also want to find the records via parent/child relationship BUT using // synonyms; so assume that parent (SI) is also known under synonym 'AX' and // CfA is known under synonym 'AB'; the search "AX/AB" should then find the same @@ -282,6 +288,28 @@ public void test() throws Exception { // SI/CfA;A01400;CfA assertQ(req("q", "institution:\"AX/AB\""), "//*[@numFound='2']"); + //this tests behaviour with ADS's extended configuration for multi-token synonym handling + //first what happens what we are doing by default; then with the configuration to disable + //such treatment for specific fields + assertQueryEquals(req("q", "institution:\"SI/CfA\"", + "aqp.multiphrase.keep_one", "SYNONYM" + ), + "institution:\"si cfa\"~6", + MultiPhraseQuery.class + ); + + assertQueryEquals(req("q", "institution:\"SI/CfA\"", + "aqp.multiphrase.keep_one", "SYNONYM", + "aqp.multiphrase.keep_one.ignore.fields", "aff_id,aff_raw,institution"), + "institution:\"(ax si a01397 smithsonian institution rid8264 01pp8nd67 0000000087163312 q131626 grid.1214.6) (a01400 cfa si/cfa harvard u/cfa center for astrophysics harvard and smithsonian harvard smithsonian center for astrophysics rid61814 03c3r2d17 q1133697 grid.455754.2)\"", + MultiPhraseQuery.class + ); + // and check we still retrieve the same docs + assertQ(req("q", "institution:\"SI/CfA\"", + "aqp.multiphrase.keep_one", "SYNONYM", + "aqp.multiphrase.keep_one.ignore.fields", "aff_id,aff_raw,institution"), + "//*[@numFound='2']"); + } diff --git a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml index 24ffe8fb7..2cd0140a6 100644 --- a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml +++ b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml @@ -892,6 +892,7 @@ types="wdafftypes.txt" /> + 0.5 first_author^14 author^13 year^10 bibstem^10 SYNONYM + aff_raw,aff_id,institution AND @@ -370,6 +371,7 @@ 0.5 first_author^14 author^13 year^10 bibstem^10 SYNONYM + aff_raw,aff_id,institution AND unfielded_search