Skip to content

Commit

Permalink
Closes adsabs#119
Browse files Browse the repository at this point in the history
The author tokenizer chain was interfering with wildcards (it was appending a comma).
While fixing that, I found and fixed another problem: slow fuzzy
queries (foo~0.8) were previously not allowed.
  • Loading branch information
romanchyla committed Dec 31, 2019
1 parent 2304371 commit 02a6c7b
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpAdsabsRegexQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.nodes.SlowFuzzyQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.parser.AqpStandardQueryConfigHandler;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.core.config.QueryConfigHandler;
import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
Expand Down Expand Up @@ -75,7 +78,7 @@ protected QueryNode postProcessNode(QueryNode node)
if (value.indexOf('?') > -1) {
qmarkPosition = value.indexOf('?');
}

// if wildcard in the middle, we can't deal with it. return
if (asteriskPosition > 0 && asteriskPosition+1 < value.length()
|| qmarkPosition > 0 && qmarkPosition+1 < value.length()
|| asteriskPosition > -1 && qmarkPosition > -1)
Expand Down Expand Up @@ -126,12 +129,28 @@ else if(node instanceof FuzzyQueryNode) {
if (tokens.length > 1)
return node; // break, let the analyzer decide the fate


if (!tokens[0].equals(value)) {
return new FuzzyQueryNode(field,
tokens[0],
((FuzzyQueryNode)node).getSimilarity(),
((FuzzyQueryNode)node).getBegin(),
((FuzzyQueryNode)node).getEnd());

QueryConfigHandler config = getQueryConfigHandler();
Map<String, String> args = config.get(AqpStandardQueryConfigHandler.ConfigurationKeys.NAMED_PARAMETER);


if (node.getClass().equals(SlowFuzzyQueryNode.class)
&& args != null && args.getOrDefault("aqp.allow.slow.fuzzy", "true") == "true") {
return new SlowFuzzyQueryNode(field,
tokens[0],
((FuzzyQueryNode)node).getSimilarity(),
((FuzzyQueryNode)node).getBegin(),
((FuzzyQueryNode)node).getEnd());
}
else {
return new FuzzyQueryNode(field,
tokens[0],
((FuzzyQueryNode)node).getSimilarity(),
((FuzzyQueryNode)node).getBegin(),
((FuzzyQueryNode)node).getEnd());
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ public static String normalizeAuthor(String a) {
}

public static String normalizeAuthor(String a, boolean keepApostrophe) {
boolean hasWildcards = a.indexOf('*') > -1 || a.indexOf('?') > -1; // \*\? should never be encountered here
if (!keepApostrophe)
a = n4.matcher(a).replaceAll("-");
a = n0.matcher(a).replaceAll(" ");
Expand All @@ -53,7 +54,7 @@ public static String normalizeAuthor(String a, boolean keepApostrophe) {
a = n2.matcher(a.trim()).replaceAll(" ");


if (!(a.contains(","))) // || a.contains(" ")
if (!hasWildcards && !(a.contains(","))) // || a.contains(" ")
a = a + ",";
// do this at the end, we want to see the space instead of '-'
a = a.replace('-', ' ');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@ public boolean incrementToken() throws IOException {

for (String individual: original.split(";")) {

// skip processing wildcards
if (individual.indexOf('*') > -1 || individual.indexOf('?') > -1) {
buffer.add(individual);
continue;
}

Map<String,String> parsedName = jythonParser.parse_human_name(individual);

if (parsedName != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,7 @@ public void set(int docbase, int docid, Object value) {
* Given the set of fields, we'll look inside them and retrieve (into memory)
* all values
*/
private void unInvertedTheDamnThing(SolrIndexSearcher searcher, List<String> fields, KVSetter setter)
private void unInvertedTheDamnThing(SolrIndexSearcher searcher, List<String> fields, final KVSetter setter)
throws IOException {

IndexSchema schema = searcher.getCore().getLatestSchema();
Expand All @@ -712,7 +712,7 @@ private void unInvertedTheDamnThing(SolrIndexSearcher searcher, List<String> fie
liveDocs = leave.reader().getLiveDocs();
lr = leave.reader();
FieldInfos fInfo = lr.getFieldInfos();
for (String field : fields) {
for (final String field : fields) {

FieldInfo fi = fInfo.fieldInfo(field);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.WildcardQuery;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.QParser;
Expand Down Expand Up @@ -374,6 +376,13 @@ public void xtestX() throws Exception {
}

public void testAuthorParsingUseCases() throws Exception {

assertQueryEquals(req("q", "author:acco*"), "author:acco*", WildcardQuery.class);
assertQueryEquals(req("q", "author:Adamč*"), "author:adamč*", WildcardQuery.class);

testAuthorQuery("Adamč*",
"adamč*",
"//*[@numFound='11']");

// multiple synonyms in the file are separated with semicolon
testAuthorQuery("\"wyrzykowsky, l\"",
Expand All @@ -392,14 +401,14 @@ public void testAuthorParsingUseCases() throws Exception {
"//*[@numFound='1']");

// should not find anything, even though the names are there indexed next to each other
assertQ(req("q", "author:\"foo, * other, *\"", "debugQuery", "true"),
assertQ(req("q", "author:\"foo, * other, *\""),
"//*[@numFound='0']"
);
assertQ(req("q", "author:\"foo, *\"", "debugQuery", "true"),
assertQ(req("q", "author:\"foo, *\""),
"//*[@numFound='1']",
"//doc/int[@name='recid'][.='600']"
);
assertQ(req("q", "author:\"other, *\"", "debugQuery", "true"),
assertQ(req("q", "author:\"other, *\""),
"//*[@numFound='1']",
"//doc/int[@name='recid'][.='600']"
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.apache.lucene.queries.CustomScoreQuery;
import org.apache.lucene.queries.mlt.MoreLikeThisQuery;
import org.apache.lucene.queryparser.flexible.aqp.TestAqpAdsabs;
import org.apache.lucene.sandbox.queries.SlowFuzzyQuery;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
Expand Down Expand Up @@ -586,14 +587,14 @@ public void testSpecialCases() throws Exception {

// levenshtein automata only considers distances (and max is 2)
assertQueryEquals(req("defType", "aqp", "q", "=author:\"Hoffmann, W.\"~0.8"),
"author:hoffmann, w~2",
FuzzyQuery.class);
"author:hoffmann, w~0.8",
SlowFuzzyQuery.class);
assertQueryEquals(req("defType", "aqp", "q", "=author:\"Hoffmann, W.\"~3"),
"author:hoffmann, w~2",
FuzzyQuery.class);
assertQueryEquals(req("defType", "aqp", "q", "=author:\"Hoffmann, W.\"~1"),
"author:hoffmann, w~1",
FuzzyQuery.class);
"author:hoffmann, w~1.0",
SlowFuzzyQuery.class);


assertQueryEquals(req("defType", "aqp", "q", "author:\"Hoffmann, W.\"~2"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@
<filter class="org.apache.lucene.analysis.synonym.NewSynonymFilterFactory"
synonyms="author_curated.synonyms" format="semicolon"
ignoreCase="true" expand="true" tokenizerFactory="solr.KeywordTokenizerFactory" />

<!-- generate combinations to find their upgraded form -->
<filter
class="solr.analysis.author.AuthorCreateQueryVariationsFilterFactory"
Expand Down

0 comments on commit 02a6c7b

Please sign in to comment.