Skip to content

Commit

Permalink
Closes adsabs#119
Browse files Browse the repository at this point in the history
The author tokenizer chain was interfering with wildcards (it was appending a comma).
While fixing that, I found and fixed another problem: slow fuzzy
queries (foo~0.8) were previously not allowed.
  • Loading branch information
romanchyla committed Dec 31, 2019
1 parent 2304371 commit 02a6c7b
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpAdsabsRegexQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.nodes.SlowFuzzyQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.parser.AqpStandardQueryConfigHandler;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.core.config.QueryConfigHandler;
import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
Expand Down Expand Up @@ -75,7 +78,7 @@ protected QueryNode postProcessNode(QueryNode node)
if (value.indexOf('?') > -1) {
qmarkPosition = value.indexOf('?');
}

// if wildcard in the middle, we can't deal with it. return
if (asteriskPosition > 0 && asteriskPosition+1 < value.length()
|| qmarkPosition > 0 && qmarkPosition+1 < value.length()
|| asteriskPosition > -1 && qmarkPosition > -1)
Expand Down Expand Up @@ -126,12 +129,28 @@ else if(node instanceof FuzzyQueryNode) {
if (tokens.length > 1)
return node; // break, let the analyzer decide the fate


if (!tokens[0].equals(value)) {
return new FuzzyQueryNode(field,
tokens[0],
((FuzzyQueryNode)node).getSimilarity(),
((FuzzyQueryNode)node).getBegin(),
((FuzzyQueryNode)node).getEnd());

QueryConfigHandler config = getQueryConfigHandler();
Map<String, String> args = config.get(AqpStandardQueryConfigHandler.ConfigurationKeys.NAMED_PARAMETER);


if (node.getClass().equals(SlowFuzzyQueryNode.class)
&& args != null && args.getOrDefault("aqp.allow.slow.fuzzy", "true") == "true") {
return new SlowFuzzyQueryNode(field,
tokens[0],
((FuzzyQueryNode)node).getSimilarity(),
((FuzzyQueryNode)node).getBegin(),
((FuzzyQueryNode)node).getEnd());
}
else {
return new FuzzyQueryNode(field,
tokens[0],
((FuzzyQueryNode)node).getSimilarity(),
((FuzzyQueryNode)node).getBegin(),
((FuzzyQueryNode)node).getEnd());
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ public static String normalizeAuthor(String a) {
}

public static String normalizeAuthor(String a, boolean keepApostrophe) {
boolean hasWildcards = a.indexOf('*') > -1 || a.indexOf('?') > -1; // \*\? should never be encountered here
if (!keepApostrophe)
a = n4.matcher(a).replaceAll("-");
a = n0.matcher(a).replaceAll(" ");
Expand All @@ -53,7 +54,7 @@ public static String normalizeAuthor(String a, boolean keepApostrophe) {
a = n2.matcher(a.trim()).replaceAll(" ");


if (!(a.contains(","))) // || a.contains(" ")
if (!hasWildcards && !(a.contains(","))) // || a.contains(" ")
a = a + ",";
// do this at the end, we want to see the space instead of '-'
a = a.replace('-', ' ');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@ public boolean incrementToken() throws IOException {

for (String individual: original.split(";")) {

// skip processing wildcards
if (individual.indexOf('*') > -1 || individual.indexOf('?') > -1) {
buffer.add(individual);
continue;
}

Map<String,String> parsedName = jythonParser.parse_human_name(individual);

if (parsedName != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,7 @@ public void set(int docbase, int docid, Object value) {
* Given the set of fields, we'll look inside them and retrieve (into memory)
* all values
*/
private void unInvertedTheDamnThing(SolrIndexSearcher searcher, List<String> fields, KVSetter setter)
private void unInvertedTheDamnThing(SolrIndexSearcher searcher, List<String> fields, final KVSetter setter)
throws IOException {

IndexSchema schema = searcher.getCore().getLatestSchema();
Expand All @@ -712,7 +712,7 @@ private void unInvertedTheDamnThing(SolrIndexSearcher searcher, List<String> fie
liveDocs = leave.reader().getLiveDocs();
lr = leave.reader();
FieldInfos fInfo = lr.getFieldInfos();
for (String field : fields) {
for (final String field : fields) {

FieldInfo fi = fInfo.fieldInfo(field);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.WildcardQuery;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.QParser;
Expand Down Expand Up @@ -374,6 +376,13 @@ public void xtestX() throws Exception {
}

public void testAuthorParsingUseCases() throws Exception {

assertQueryEquals(req("q", "author:acco*"), "author:acco*", WildcardQuery.class);
assertQueryEquals(req("q", "author:Adamč*"), "author:adamč*", WildcardQuery.class);

testAuthorQuery("Adamč*",
"adamč*",
"//*[@numFound='11']");

// multiple synonyms in the file are separated with semicolon
testAuthorQuery("\"wyrzykowsky, l\"",
Expand All @@ -392,14 +401,14 @@ public void testAuthorParsingUseCases() throws Exception {
"//*[@numFound='1']");

// should not find anything, even though the names are there indexed next to each other
assertQ(req("q", "author:\"foo, * other, *\"", "debugQuery", "true"),
assertQ(req("q", "author:\"foo, * other, *\""),
"//*[@numFound='0']"
);
assertQ(req("q", "author:\"foo, *\"", "debugQuery", "true"),
assertQ(req("q", "author:\"foo, *\""),
"//*[@numFound='1']",
"//doc/int[@name='recid'][.='600']"
);
assertQ(req("q", "author:\"other, *\"", "debugQuery", "true"),
assertQ(req("q", "author:\"other, *\""),
"//*[@numFound='1']",
"//doc/int[@name='recid'][.='600']"
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.apache.lucene.queries.CustomScoreQuery;
import org.apache.lucene.queries.mlt.MoreLikeThisQuery;
import org.apache.lucene.queryparser.flexible.aqp.TestAqpAdsabs;
import org.apache.lucene.sandbox.queries.SlowFuzzyQuery;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
Expand Down Expand Up @@ -586,14 +587,14 @@ public void testSpecialCases() throws Exception {

// levenshtein automata only considers distances (and max is 2)
assertQueryEquals(req("defType", "aqp", "q", "=author:\"Hoffmann, W.\"~0.8"),
"author:hoffmann, w~2",
FuzzyQuery.class);
"author:hoffmann, w~0.8",
SlowFuzzyQuery.class);
assertQueryEquals(req("defType", "aqp", "q", "=author:\"Hoffmann, W.\"~3"),
"author:hoffmann, w~2",
FuzzyQuery.class);
assertQueryEquals(req("defType", "aqp", "q", "=author:\"Hoffmann, W.\"~1"),
"author:hoffmann, w~1",
FuzzyQuery.class);
"author:hoffmann, w~1.0",
SlowFuzzyQuery.class);


assertQueryEquals(req("defType", "aqp", "q", "author:\"Hoffmann, W.\"~2"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@
<filter class="org.apache.lucene.analysis.synonym.NewSynonymFilterFactory"
synonyms="author_curated.synonyms" format="semicolon"
ignoreCase="true" expand="true" tokenizerFactory="solr.KeywordTokenizerFactory" />

<!-- generate combinations to find their upgraded form -->
<filter
class="solr.analysis.author.AuthorCreateQueryVariationsFilterFactory"
Expand Down

0 comments on commit 02a6c7b

Please sign in to comment.