Majority of the test cases pass; but query variations must be applied conditionally
romanchyla · Jan 26, 2021 · ada6e2e · ada6e2e
1 parent f509f83
commit ada6e2e
Show file tree

Hide file tree

Showing 9 changed files with 392 additions and 112 deletions.
diff --git a/...ache/lucene/queryparser/flexible/aqp/processors/AqpAdsabsExpandAuthorSearchProcessor.java b/...ache/lucene/queryparser/flexible/aqp/processors/AqpAdsabsExpandAuthorSearchProcessor.java
@@ -97,6 +97,10 @@ private QueryNode expandNodes(QueryNode node, NameInfo origNameInfo, int[] level
       List<QueryNode> children = node.getChildren();
       boolean changed = false;
       for (int i=0;i<children.size();i++) {
+
+        QueryNode n = children.get(i);
+        if (n.getTag(AqpAnalyzerQueryNodeProcessor.TYPE_ATTRIBUTE) == "AUTHOR_QUERY_VARIANT")
+          continue;
         doExpansion(origNameInfo, children.get(i), collector, level);
 
         // interlacing new values right behind the old values
@@ -175,6 +179,8 @@ private void doExpansion(NameInfo origNameInfo, QueryNode node, List<QueryNode>
           if (nameParts.length < origNameInfo.noOfParts ) return; // do nothing
 
           if (origNameInfo.containsOnlySurname) { // orig was lone surname
+            // do nothing if the input has been generated by variations
+            // do something if it was user input or synonym
             parentChildren.add(new PrefixWildcardQueryNode(fqn.getField(), v + "*", fqn.getBegin(), fqn.getEnd()));
           }
           else {

diff --git a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorQueryVariationsFilter.java b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorQueryVariationsFilter.java
@@ -61,6 +61,7 @@ public boolean incrementToken() throws IOException {
 
 	private boolean genVariations() {
 	    String authorName = termAtt.toString();
+
 	    //log.debug("generating variations for " + authorName);
 	    HashSet<String> variations = AuthorQueryVariations.getQueryVariationsInclRegex(authorName);
 	    if (variations.size() > 0) {

diff --git a/...ib/adsabs/src/java/org/apache/solr/analysis/author/AuthorRemoveDuplicatesTokenFilter.java b/...ib/adsabs/src/java/org/apache/solr/analysis/author/AuthorRemoveDuplicatesTokenFilter.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis.author;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+
+/**
+ * A TokenFilter which collapses tokens sharing the same term text, so that every distinct term is
+ * emitted only once.
+ * <p>
+ * On the first call to {@link #incrementToken()} the whole input is consumed and the captured state
+ * of every token is cached under its term text and its token type. When the same term was seen with
+ * several types (for instance once as a synonym or transliteration and once as an author query
+ * variation), only one of the captured states is replayed: the one whose type has the highest
+ * priority in the <code>inputTypes</code> map. Types missing from that map get priority -1.
+ * <p>
+ * Terms are replayed in the iteration order of the backing {@link HashMap}, which is not
+ * necessarily the order in which they appeared in the input.
+ */
+public final class AuthorRemoveDuplicatesTokenFilter extends TokenFilter {
+  // term text -> (token type -> state of the last token seen with that term and type)
+  private final Map<String, Map<String, AttributeSource.State>> cache = new HashMap<>();
+  // walks over the cached terms; null until the input has been consumed
+  private Iterator<Entry<String, Map<String, State>>> iterator = null;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  // state captured right after input.end(), restored by end()
+  private AttributeSource.State finalState;
+  // token type -> priority; the state with the highest priority wins for a given term
+  private final Map<String, Integer> inputTypes;
+
+  /**
+   * Create a new de-duplicating filter around <code>input</code>. As with
+   * any normal TokenFilter, do <em>not</em> call reset on the input; this filter
+   * will do it normally.
+   *
+   * @param input the token stream to de-duplicate
+   * @param inputTypes maps a token type to its priority (higher wins); <code>null</code> is
+   *        treated as an empty map, i.e. all types have the same priority
+   */
+  public AuthorRemoveDuplicatesTokenFilter(TokenStream input, Map<String, Integer> inputTypes) {
+    super(input);
+    // guard against null so that restoreTypedState() cannot fail with a NullPointerException
+    this.inputTypes = inputTypes != null ? inputTypes : new HashMap<String, Integer>();
+  }
+
+  /** Resets the input and drops everything cached from the previous stream. */
+  @Override
+  public void reset() throws IOException {
+    input.reset();
+    cache.clear();
+    iterator = null;
+  }
+
+  /** The first time called, it'll read and cache all tokens from the input. */
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (iterator == null) {
+      fillCache();
+      iterator = cache.entrySet().iterator();
+    }
+
+    if (!iterator.hasNext()) {
+      // the cache is exhausted, return false
+      return false;
+    }
+    restoreTypedState(iterator.next());
+    return true;
+  }
+
+  /**
+   * Restores the state of one cached term, choosing among the states captured for its
+   * different token types.
+   *
+   * @param state cache entry of the term: token type mapped to the captured state
+   */
+  private void restoreTypedState(Entry<String, Map<String, State>> state) {
+    Map<String, State> types = state.getValue();
+    State saved = null;
+    int currState = -1;
+
+    // only one state will be resurrected (the one with highest priority);
+    // on equal priority the first one encountered is kept
+    for (Entry<String, State> s: types.entrySet()) {
+      int p = inputTypes.getOrDefault(s.getKey(), -1);
+      if (p > currState || saved == null) {
+        saved = s.getValue();
+        currState = p;
+      }
+    }
+    restoreState(saved);
+  }
+
+  /** Restores the end-of-stream state captured from the input, if the input was consumed. */
+  @Override
+  public final void end() {
+    if (finalState != null) {
+      restoreState(finalState);
+    }
+  }
+
+  /**
+   * Consumes the whole input, caching the state of every token by term text and token type
+   * (a later token with the same term and type overwrites the earlier one), then captures
+   * the final state of the input.
+   */
+  private void fillCache() throws IOException {
+    while (input.incrementToken()) {
+      cache.computeIfAbsent(termAtt.toString(), k -> new HashMap<>())
+          .put(typeAtt.type(), captureState());
+    }
+    // capture final state
+    input.end();
+    finalState = captureState();
+  }
+
+}
diff --git a/...bs/src/java/org/apache/solr/analysis/author/AuthorRemoveDuplicatesTokenFilterFactory.java b/...bs/src/java/org/apache/solr/analysis/author/AuthorRemoveDuplicatesTokenFilterFactory.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis.author;
+
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.solr.common.util.StrUtils;
+
+/**
+ * Factory for {@link AuthorRemoveDuplicatesTokenFilter}.
+ * <p>
+ * The optional <code>tokenTypes</code> argument is a comma-separated list of token types. The
+ * first type in the list receives the highest priority number, the last one the lowest (1); the
+ * resulting type-to-priority map is handed over to every filter created by this factory.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="author_rmdup" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="org.apache.solr.analysis.author.AuthorRemoveDuplicatesTokenFilterFactory"
+ *             tokenTypes="TYPE_A,TYPE_B"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class AuthorRemoveDuplicatesTokenFilterFactory extends TokenFilterFactory {
+
+  // token types in the order given in the configuration; null when "tokenTypes" was not set
+  private List<String> orderedTypes = null;
+
+  /**
+   * Creates a new AuthorRemoveDuplicatesTokenFilterFactory.
+   *
+   * @param args factory arguments; only <code>tokenTypes</code> is consumed here
+   * @throws IllegalArgumentException if any argument is left over after <code>tokenTypes</code>
+   *         has been removed
+   */
+  public AuthorRemoveDuplicatesTokenFilterFactory(Map<String,String> args) {
+    super(args);
+    if (args.containsKey("tokenTypes")) {
+      String rawTypes = args.remove("tokenTypes");
+      orderedTypes = StrUtils.splitSmart(rawTypes, ",", false);
+    }
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /**
+   * Wraps <code>input</code> into a new filter, passing it a freshly built map of
+   * token type to priority (empty when no token types were configured).
+   */
+  @Override
+  public AuthorRemoveDuplicatesTokenFilter create(TokenStream input) {
+    final Map<String, Integer> priorities = new HashMap<String, Integer>();
+    if (orderedTypes == null) {
+      return new AuthorRemoveDuplicatesTokenFilter(input, priorities);
+    }
+    // position 0 gets the largest number, the last position gets 1
+    final int total = orderedTypes.size();
+    for (int idx = 0; idx < total; idx++) {
+      priorities.put(orderedTypes.get(idx), total - idx);
+    }
+    return new AuthorRemoveDuplicatesTokenFilter(input, priorities);
+  }
+}
diff --git a/.../adsabs/src/java/org/apache/solr/analysis/author/AuthorShortNameUpgradeFilterFactory.java b/.../adsabs/src/java/org/apache/solr/analysis/author/AuthorShortNameUpgradeFilterFactory.java
@@ -7,6 +7,7 @@
 import java.io.Reader;
 import java.nio.charset.Charset;
 import java.text.ParseException;
+import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
@@ -137,7 +138,7 @@ public void add(Reader in) throws IOException, ParseException {
                     if (seen.contains(shortForm)) continue;
                     seen.add(shortForm);
                     newBr.append(escape(shortForm) + "=>" +
-                        sides[0] + "," +
+                        escape(sides[0]) + "," +
                         buildLine(names));
                     newBr.append("\n");
                   }
@@ -482,7 +483,8 @@ public void add(Reader in) throws IOException, ParseException {
               String[] sides = line.split("=>");
               if (sides.length > 1) { // explicit mapping
                 String[] names = getNames(sides[1]);
-                newBr.append(escape(names[0]));
+                Arrays.sort(names); // from shortest to longest
+                newBr.append(escape(sides[0]));
                 newBr.append("=>");
                 boolean first = false;
                 for (String n: names) {
@@ -494,6 +496,7 @@ public void add(Reader in) throws IOException, ParseException {
               }
               else {
                 String[] names = getNames(sides[0]);
+                Arrays.sort(names); // from shortest to longest (important if we want to see synonyms before automatically generated query variants)
                 newBr.append(buildLine(names));
               }
               newBr.append("\n");

diff --git a/contrib/adsabs/src/test/monty/solr/util/MontySolrQueryTestCase.java b/contrib/adsabs/src/test/monty/solr/util/MontySolrQueryTestCase.java
@@ -136,6 +136,10 @@ public void setDebug(boolean v) {
 		tp.setDebug(v);
 	}
 
+	public boolean getDebug() { // counterpart of setDebug(boolean): reports the flag held by tp
+	  return tp.getDebug();
+	}
+
 	/*
 	 * This is only for printing/debugging, DO NOT use this for testing!!!
 	 *