From 78efe39913c97152fe3caf8ab9920b5c2f54433d Mon Sep 17 00:00:00 2001 From: Roman Chyla Date: Tue, 30 Nov 2021 19:28:09 -0500 Subject: [PATCH] Fixed #issues/173 - failing proximity search due to a constant search; Made author facet searches case insensitive --- .../builders/AqpAdsabsSubQueryProvider.java | 16 ++-- .../PythonicAuthorNormalizerFilter.java | 2 - .../src/test/org/adsabs/TestAdsAllFields.java | 86 ++++++++++--------- .../flexible/aqp/TestAqpAdsabs.java | 6 +- .../lucene/search/TestCitationsSearch.java | 2 +- .../analysis/TestAdsabsTypeDateString.java | 3 +- .../solr/search/TestAqpAdsabsSolrSearch.java | 21 ++++- .../flexible/aqp/builders/SpanConverter.java | 48 +++++++---- .../server/solr/collection1/conf/schema.xml | 6 +- .../solr/collection1/conf/solrconfig.xml | 5 +- ...BlackBoxAdslabsDeploymentVerification.java | 8 +- 11 files changed, 117 insertions(+), 86 deletions(-) diff --git a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpAdsabsSubQueryProvider.java b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpAdsabsSubQueryProvider.java index de2546473..82ac97004 100644 --- a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpAdsabsSubQueryProvider.java +++ b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpAdsabsSubQueryProvider.java @@ -975,11 +975,11 @@ public Query parse() throws SyntaxError { parsers.put("edismax_combined_aqp", new AqpSubqueryParserFull() { // will decide whether new aqp() parse is needed public Query parse(FunctionQParser fp) throws SyntaxError { final String original = fp.getString(); - //System.out.println("edismax fed: " + original); - QParser eqp = fp.subQuery(original, "adismax"); - Query q = eqp.getQuery(); - //System.out.println("edismax produced: " + q); - return simplify(q); + //System.out.println("edismax fed: " + original); + QParser eqp = fp.subQuery(original, "adismax"); + Query q = eqp.getQuery(); 
+ //System.out.println("edismax produced: " + q); + return simplify(q); } protected Query swimDeep(DisjunctionMaxQuery query) throws SyntaxError { List parts = query.getDisjuncts(); @@ -1019,7 +1019,7 @@ private String toBeAnalyzedAgain(TermQuery q) { } private Query reAnalyze(String field, String value, Float boost) throws SyntaxError { QParser fParser = getParser(); - System.out.println(field+ ":"+fParser.getString() + "|value=" + value); + //System.out.println(field+ ":"+fParser.getString() + "|value=" + value); QParser aqp = fParser.subQuery(field+ ":"+fParser.getString(), "aqp"); Query q = aqp.getQuery(); if (boost != null && boost != 1.0f) { @@ -1069,8 +1069,8 @@ private Query reAnalyze(String field, String value, Float boost) throws SyntaxEr QParser aqp = fParser.subQuery(field+ ":"+fParser.getString(), "aqp"); Query q = aqp.getQuery(); if (boost != null && boost != 1.0f) { - q = new BoostQuery(q, boost); - } + q = new BoostQuery(q, boost); + } return q; } }); diff --git a/contrib/adsabs/src/java/org/apache/solr/analysis/author/PythonicAuthorNormalizerFilter.java b/contrib/adsabs/src/java/org/apache/solr/analysis/author/PythonicAuthorNormalizerFilter.java index 25812e10a..24c66efb8 100644 --- a/contrib/adsabs/src/java/org/apache/solr/analysis/author/PythonicAuthorNormalizerFilter.java +++ b/contrib/adsabs/src/java/org/apache/solr/analysis/author/PythonicAuthorNormalizerFilter.java @@ -10,8 +10,6 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.solr.common.SolrException; -import org.apache.solr.common.SolrException.ErrorCode; import org.jython.JythonObjectFactory; import org.jython.monty.interfaces.JythonNameParser; diff --git a/contrib/adsabs/src/test/org/adsabs/TestAdsAllFields.java b/contrib/adsabs/src/test/org/adsabs/TestAdsAllFields.java index c71f384e6..4e54af52e 100644 --- 
a/contrib/adsabs/src/test/org/adsabs/TestAdsAllFields.java +++ b/contrib/adsabs/src/test/org/adsabs/TestAdsAllFields.java @@ -431,24 +431,24 @@ public void test() throws Exception { "aqp.constant_scoring", "author^13 title^12", "aqp.classic_scoring.modifier", "0.48", "fl", "recid,score"), - "//*[@numFound='1']", - "//doc/int[@name='recid'][.='100']", - "//doc/float[@name='score'][.='13.0']" // 13.00 * (cite_read_boost + aqp.classic_scoring.modifier) - ); + "//*[@numFound='1']", + "//doc/int[@name='recid'][.='100']", + "//doc/float[@name='score'][.='13.0']" // 13.00 * (cite_read_boost + aqp.classic_scoring.modifier) + ); assertQ(req("q", "author:\"Einstein, A\" AND author:\"Anders\"", "aqp.constant_scoring", "author^13", "aqp.classic_scoring.modifier", "0.48", - "fl", "recid,score"), - "//*[@numFound='1']", - "//doc/int[@name='recid'][.='100']", - "//doc/float[@name='score'][.='26.0']"); + "fl", "recid,score"), + "//*[@numFound='1']", + "//doc/int[@name='recid'][.='100']", + "//doc/float[@name='score'][.='26.0']"); assertQ(req("q", "author:\"Einstein, A\" OR author:\"Anders\"", - "aqp.constant_scoring", "author^13", - "aqp.classic_scoring.modifier", "0.48", - "fl", "recid,score"), - "//*[@numFound='1']", - "//doc/int[@name='recid'][.='100']", - "//doc/float[@name='score'][.='26.0']"); + "aqp.constant_scoring", "author^13", + "aqp.classic_scoring.modifier", "0.48", + "fl", "recid,score"), + "//*[@numFound='1']", + "//doc/int[@name='recid'][.='100']", + "//doc/float[@name='score'][.='26.0']"); assert h.query(req("q", "author:\"Einstein, A\"", "fl", "author_norm", "indent", "false")) .contains("" + @@ -494,11 +494,13 @@ public void test() throws Exception { /* - * author facets + * author facets - should be case insensitive */ assertQ(req("q", "author_facet_hier:\"0/Anders, J M\""), "//*[@numFound='1']"); + assertQ(req("q", "author_facet_hier:\"0/anders, j m\""), "//*[@numFound='1']"); assertQ(req("q", "author_facet_hier:\"1/Anders, J M/Anders, John Michael\""), 
"//*[@numFound='1']"); + assertQ(req("q", "author_facet_hier:\"1/Anders, J M/ANDERS, john michael\""), "//*[@numFound='1']"); assertQ(req("q", "author_facet_hier:\"1/Einstein, A\""), "//*[@numFound='0']"); @@ -656,33 +658,33 @@ public void test() throws Exception { /* - * orcid, added 30/12/14; they must correspond to the author array - * - updated 13/11/15 - orcid field is now a virtual one; and we have - * orcid_pub,_user,_other - */ - assertQ(req("q", "orcid_pub:1111-2222-3333-4444"), - "//doc/int[@name='recid'][.='100']", - "//*[@numFound='1']" - ); - assertQ(req("q", "orcid_pub:1111*"), - "//doc/int[@name='recid'][.='100']", - "//*[@numFound='1']" - ); - assert h.query(req("q", "recid:100", "indent", "false", "fl", "orcid_pub")) - .contains("" + - "1111-2222-3333-4444" + - "-" + - "0000-0002-4110-3511" - ); - // this is only present in orcid_other - assertQ(req("q", "orcid:1111-2222-3333-5555"), - "//doc/int[@name='recid'][.='100']", - "//*[@numFound='1']" - ); - assertQ(req("q", "orcid_other:1111-2222-3333-5555"), - "//doc/int[@name='recid'][.='100']", - "//*[@numFound='1']" - ); + * orcid, added 30/12/14; they must correspond to the author array + * - updated 13/11/15 - orcid field is now a virtual one; and we have + * orcid_pub,_user,_other + */ + assertQ(req("q", "orcid_pub:1111-2222-3333-4444"), + "//doc/int[@name='recid'][.='100']", + "//*[@numFound='1']" + ); + assertQ(req("q", "orcid_pub:1111*"), + "//doc/int[@name='recid'][.='100']", + "//*[@numFound='1']" + ); + assert h.query(req("q", "recid:100", "indent", "false", "fl", "orcid_pub")) + .contains("" + + "1111-2222-3333-4444" + + "-" + + "0000-0002-4110-3511" + ); + // this is only present in orcid_other + assertQ(req("q", "orcid:1111-2222-3333-5555"), + "//doc/int[@name='recid'][.='100']", + "//*[@numFound='1']" + ); + assertQ(req("q", "orcid_other:1111-2222-3333-5555"), + "//doc/int[@name='recid'][.='100']", + "//*[@numFound='1']" + ); diff --git 
a/contrib/adsabs/src/test/org/apache/lucene/queryparser/flexible/aqp/TestAqpAdsabs.java b/contrib/adsabs/src/test/org/apache/lucene/queryparser/flexible/aqp/TestAqpAdsabs.java index 3be8ca74e..b7b165753 100644 --- a/contrib/adsabs/src/test/org/apache/lucene/queryparser/flexible/aqp/TestAqpAdsabs.java +++ b/contrib/adsabs/src/test/org/apache/lucene/queryparser/flexible/aqp/TestAqpAdsabs.java @@ -490,9 +490,9 @@ public void testBasics() throws Exception{ assertQueryEquals("-m:(a b NEAR c d AND e)", null, "+m:a +spanNear([m:b, m:c], 5, true) +(+m:d +m:e)"); //? should we allow - at the beginning? assertQueryEquals("m:(a b NEAR2 c)", null, "+m:a +spanNear([m:b, m:c], 2, true)"); - assertQueryEquals("m:(a b NEAR3 c d AND e)", null, "+m:a +spanNear([m:b, m:c], 3, true) +(+m:d +m:e)"); - assertQueryEquals("-m:(a b NEAR4 c d AND e)", null, "+m:a +spanNear([m:b, m:c], 4, true) +(+m:d +m:e)"); - assertQueryNodeException("m:(a b NEAR7 c)"); // by default, only range 1-5 is allowed (in configuration) + assertQueryEquals("m:(a b NEAR3 c d AND e)", null, "+m:a +spanNear([m:b, m:c], 3, true) +(+m:d +m:e)"); + assertQueryEquals("-m:(a b NEAR4 c d AND e)", null, "+m:a +spanNear([m:b, m:c], 4, true) +(+m:d +m:e)"); + assertQueryNodeException("m:(a b NEAR7 c)"); // by default, only range 1-5 is allowed (in configuration) diff --git a/contrib/adsabs/src/test/org/apache/lucene/search/TestCitationsSearch.java b/contrib/adsabs/src/test/org/apache/lucene/search/TestCitationsSearch.java index 3b018aaec..2ceabfa9d 100644 --- a/contrib/adsabs/src/test/org/apache/lucene/search/TestCitationsSearch.java +++ b/contrib/adsabs/src/test/org/apache/lucene/search/TestCitationsSearch.java @@ -32,7 +32,7 @@ @SuppressWarnings({"rawtypes", "unchecked"}) public class TestCitationsSearch extends MontySolrAbstractTestCase { - private boolean debug = true; + private boolean debug = false; private SolrQueryRequest tempReq; @BeforeClass diff --git 
a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeDateString.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeDateString.java index 25d4d2da5..9710e80f9 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeDateString.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeDateString.java @@ -267,7 +267,8 @@ public void test() throws Exception { "defType", "aqp"), "indexstamp:[1349049600000 TO 1638316800000]", null); - assertQ(req("q", "indexstamp:[\"2012-10-01T00:00:00.000\" TO \"2021-12-01T00:00:00.000Z\"]", "indent", "true"), + + assertQ(req("q", "indexstamp:[\"2012-10-01T00:00:00.000\" TO \"2121-12-01T00:00:00.000Z\"]", "indent", "true"), "//*[@numFound='141']" ); diff --git a/contrib/adsabs/src/test/org/apache/solr/search/TestAqpAdsabsSolrSearch.java b/contrib/adsabs/src/test/org/apache/solr/search/TestAqpAdsabsSolrSearch.java index 3b5193883..103873b81 100644 --- a/contrib/adsabs/src/test/org/apache/solr/search/TestAqpAdsabsSolrSearch.java +++ b/contrib/adsabs/src/test/org/apache/solr/search/TestAqpAdsabsSolrSearch.java @@ -138,7 +138,22 @@ public void tearDown() throws Exception { public void testUnfieldedSearch() throws Exception { - + + // NEAR on unfielded search -- will generate error when results have mixed fields + assertQueryParseException(req("defType", "aqp", + "q", "foo NEAR2 bar", + "qf", "bibcode^5 title^10", + "aqp.unfielded.tokens.strategy", "disjuncts", + "aqp.unfielded.tokens.new.type", "simple", + "aqp.constant_scoring", "bibcode^6")); + + assertQueryParseException(req("defType", "aqp", + "q", "foo NEAR2 bar NEAR2 title:baz", + "qf", "bibcode^5 title^10", + "aqp.unfielded.tokens.strategy", "disjuncts", + "aqp.unfielded.tokens.new.type", "simple", + "aqp.constant_scoring", "bibcode^6")); + // when we generate the phrase search, ignore acronyms assertQueryEquals(req("defType", "aqp", "q", "FOO BAR BAZ", @@ -1063,10 +1078,10 @@ public void testSearch() throws Exception 
{ // #375 assertQueryEquals(req("defType", "aqp", "q", "author:\"Civano, F\" -author_facet_hier:(\"Civano, Fa\" OR \"Civano, Da\")"), - "+(author:civano, f | author:civano, f* | author:civano,) -(author_facet_hier:Civano, Fa author_facet_hier:Civano, Da)", + "+(author:civano, f | author:civano, f* | author:civano,) -(author_facet_hier:civano, fa author_facet_hier:civano, da)", BooleanQuery.class); assertQueryEquals(req("defType", "aqp", "q", "author:\"Civano, F\" +author_facet_hier:(\"Civano, Fa\" OR \"Civano, Da\")"), - "+(author:civano, f | author:civano, f* | author:civano,) +(author_facet_hier:Civano, Fa author_facet_hier:Civano, Da)", + "+(author:civano, f | author:civano, f* | author:civano,) +(author_facet_hier:civano, fa author_facet_hier:civano, da)", BooleanQuery.class); assertQueryEquals(req("defType", "aqp", "q", "title:xxx -title:(foo OR bar)"), "+title:xxx -(title:foo title:bar)", diff --git a/contrib/antlrqueryparser/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/SpanConverter.java b/contrib/antlrqueryparser/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/SpanConverter.java index f6b3d08e2..cbfb4be00 100644 --- a/contrib/antlrqueryparser/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/SpanConverter.java +++ b/contrib/antlrqueryparser/src/java/org/apache/lucene/queryparser/flexible/aqp/builders/SpanConverter.java @@ -16,6 +16,7 @@ import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BoostQuery; +import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.DisjunctionMaxQuery; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; @@ -53,6 +54,8 @@ public SpanQuery getSpanQuery(SpanConverterContainer container) return wrapBoost((SpanQuery) q, boost); } else if (q instanceof TermQuery) { return wrapBoost(new SpanTermQuery(((TermQuery) q).getTerm()), boost); + } else if (q instanceof 
ConstantScoreQuery) { + return getSpanQuery(new SpanConverterContainer(((ConstantScoreQuery) q).getQuery(), 1, true, 0.0f)); } else if (q instanceof WildcardQuery) { return wrapBoost(new SpanMultiTermQueryWrapper((WildcardQuery) q), boost); } else if (q instanceof PrefixQuery) { @@ -62,16 +65,16 @@ public SpanQuery getSpanQuery(SpanConverterContainer container) } else if (q instanceof BooleanQuery) { return wrapBoost(convertBooleanToSpan(container), boost); } else if (q instanceof RegexpQuery) { - return wrapBoost(new SpanMultiTermQueryWrapper((RegexpQuery) q), boost); + return wrapBoost(new SpanMultiTermQueryWrapper((RegexpQuery) q), boost); } else if (q instanceof DisjunctionMaxQuery) { - return wrapBoost(convertDisjunctionQuery(container), boost); + return wrapBoost(convertDisjunctionQuery(container), boost); } else if (q instanceof BoostQuery) { - return wrapBoost(getSpanQuery(new SpanConverterContainer(((BoostQuery) q).getQuery(), 1, true)), + return wrapBoost(getSpanQuery(new SpanConverterContainer(((BoostQuery) q).getQuery(), 1, true)), ((BoostQuery) q).getBoost()); } else if (q instanceof MatchNoDocsQuery) { - return new EmptySpanQuery(container.query); + return new EmptySpanQuery(container.query); } else if (q instanceof SynonymQuery) { - return wrapBoost(convertSynonymToSpan(container), boost); + return wrapBoost(convertSynonymToSpan(container), boost); } else { SpanQuery wrapped = wrapNonConvertible(container); @@ -113,6 +116,7 @@ private SpanQuery wrapBoost(SpanQuery q, float boost) { return new SpanBoostQuery(q, boost); return q; } + public SpanQuery wrapNonConvertible(SpanConverterContainer container) { if (wrapNonConvertible) { return doWrapping(container); @@ -150,6 +154,7 @@ protected SpanQuery convertBooleanToSpan(SpanConverterContainer container) List clauses = q.clauses(); SpanQuery[] spanClauses = new SpanQuery[clauses.size()]; + String field = null; Occur o = null; int i = 0; for (BooleanClause c : clauses) { @@ -164,21 +169,30 @@ protected 
SpanQuery convertBooleanToSpan(SpanConverterContainer container) Query sq = c.getQuery(); SpanQuery result = getSpanQuery(new SpanConverterContainer(sq, 1, true)); spanClauses[i] = result; + i++; } - - if (o.equals(Occur.MUST)) { - return new SpanNearQuery(spanClauses, container.slop, - container.inOrder); - } else if (o.equals(Occur.SHOULD)) { - return new SpanOrQuery(spanClauses); - } else if (o.equals(Occur.MUST_NOT)) { - SpanQuery[] exclude = new SpanQuery[spanClauses.length - 1]; - for (int j = 1; j < spanClauses.length; j++) { - exclude[j - 1] = spanClauses[j]; - } - return new SpanNotQuery(spanClauses[0], new SpanOrQuery(exclude)); + + try { + if (o.equals(Occur.MUST)) { + return new SpanNearQuery(spanClauses, container.slop, + container.inOrder); + } else if (o.equals(Occur.SHOULD)) { + return new SpanOrQuery(spanClauses); + } else if (o.equals(Occur.MUST_NOT)) { + SpanQuery[] exclude = new SpanQuery[spanClauses.length - 1]; + for (int j = 1; j < spanClauses.length; j++) { + exclude[j - 1] = spanClauses[j]; + } + return new SpanNotQuery(spanClauses[0], new SpanOrQuery(exclude)); + } } + catch (IllegalArgumentException exc) { + throw new QueryNodeException(new MessageImpl( + QueryParserMessages.LUCENE_QUERY_CONVERSION_ERROR, q.toString(), + "Proximity searches must be executed against the same field; please specify the field explicitly")); + } + throw new QueryNodeException(new MessageImpl( QueryParserMessages.LUCENE_QUERY_CONVERSION_ERROR, q.toString(), diff --git a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml index 135e1e1e6..3ec97fd0f 100644 --- a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml +++ b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml @@ -1246,18 +1246,18 @@ type="normalized_text_ascii_notokenization" indexed="true" stored="true" multiValued="true" omitNorms="true" /> - - - diff --git 
a/contrib/examples/adsabs/server/solr/collection1/conf/solrconfig.xml b/contrib/examples/adsabs/server/solr/collection1/conf/solrconfig.xml index 89cd26c19..f6973a967 100644 --- a/contrib/examples/adsabs/server/solr/collection1/conf/solrconfig.xml +++ b/contrib/examples/adsabs/server/solr/collection1/conf/solrconfig.xml @@ -329,6 +329,7 @@ Modified qf: old: first_author^5 author^2 title^1.5 abstract^1.3 identifier^1 bibstem^1 year^2 new: first_author^0.9 author^0.85 year^0.8 title^0.8 abstract^0.7 identifier^0.8 bibstem^0.8 keyword^0.8 + --> first_author^0.9 author^0.85 year^0.8 title^0.8 abstract^0.7 identifier^0.8 bibstem^0.8 keyword^0.8 @@ -361,7 +362,7 @@ explicit 10 - first_author^5 author^2 title^1.5 abstract^1.3 identifier^1 bibstem^1 year^2 + first_author^0.9 author^0.85 year^0.8 title^0.8 abstract^0.7 identifier^0.8 bibstem^0.8 keyword^0.8 aqp disjuncts simple @@ -418,7 +419,7 @@ Make sure these defaults are set also in other public query handlers (e.g. tvrh - used by the word cloud) --> - first_author^5 author^2 title^1.5 abstract^1.3 identifier^1 bibstem^1 year^2 + first_author^0.9 author^0.85 year^0.8 title^0.8 abstract^0.7 identifier^0.8 bibstem^0.8 keyword^0.8 aqp disjuncts simple diff --git a/contrib/examples/src/test/examples/adsabs/BlackBoxAdslabsDeploymentVerification.java b/contrib/examples/src/test/examples/adsabs/BlackBoxAdslabsDeploymentVerification.java index 815d10fd0..006759886 100644 --- a/contrib/examples/src/test/examples/adsabs/BlackBoxAdslabsDeploymentVerification.java +++ b/contrib/examples/src/test/examples/adsabs/BlackBoxAdslabsDeploymentVerification.java @@ -57,7 +57,7 @@ public void testUpdates() throws Exception { assertU(commit("waitSearcher", "true")); - // the first search is not auto-warmed (the code seems + // the first search is not auto-warmed (the code seems // that seems like a SOLR bug (I checked the SolrIndexSearcher // code and it is right; so i created own function for // warming warm_cache() @@ -71,9 +71,9 @@ public 
void testUpdates() throws Exception { "q","bibcode:b*", "fq","{!bitset compression=none}"); List streams = new ArrayList(1); - ContentStreamBase cs = new ContentStreamBase.StringStream("bibcode\nb2\nx5"); - cs.setContentType("big-query/csv"); - streams.add(cs); + ContentStreamBase cs = new ContentStreamBase.StringStream("bibcode\nb2\nx5"); + cs.setContentType("big-query/csv"); + streams.add(cs); req.setContentStreams(streams); assertQ(req