Skip to content

Commit

Permalink
Majority of the test cases pass, but query variations must be applied…
Browse files Browse the repository at this point in the history
… conditionally
  • Loading branch information
romanchyla committed Jan 26, 2021
1 parent f509f83 commit ada6e2e
Show file tree
Hide file tree
Showing 9 changed files with 392 additions and 112 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ private QueryNode expandNodes(QueryNode node, NameInfo origNameInfo, int[] level
List<QueryNode> children = node.getChildren();
boolean changed = false;
for (int i=0;i<children.size();i++) {

QueryNode n = children.get(i);
// Skip children that were produced by automatic query-variation expansion;
// only user input and synonyms should be expanded further.
// NOTE: compare with equals() — `==` on Strings tests object identity, not
// content, and getTag() returns an Object that need not be interned.
if ("AUTHOR_QUERY_VARIANT".equals(n.getTag(AqpAnalyzerQueryNodeProcessor.TYPE_ATTRIBUTE)))
continue;
doExpansion(origNameInfo, children.get(i), collector, level);

// interlacing new values right behind the old values
Expand Down Expand Up @@ -175,6 +179,8 @@ private void doExpansion(NameInfo origNameInfo, QueryNode node, List<QueryNode>
if (nameParts.length < origNameInfo.noOfParts ) return; // do nothing

if (origNameInfo.containsOnlySurname) { // orig was lone surname
// do nothing if the input has been generated by variations
// do something if it was user input or synonym
parentChildren.add(new PrefixWildcardQueryNode(fqn.getField(), v + "*", fqn.getBegin(), fqn.getEnd()));
}
else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ public boolean incrementToken() throws IOException {

private boolean genVariations() {
String authorName = termAtt.toString();

//log.debug("generating variations for " + authorName);
HashSet<String> variations = AuthorQueryVariations.getQueryVariationsInclRegex(authorName);
if (variations.size() > 0) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.analysis.author;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;

import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;

/**
 * A TokenFilter which filters out Tokens with the same term text as a previous
 * token in the stream. It recognizes author query variations and replaces them
 * with the same terms (that were generated as synonyms, or transliterations).
 *
 * <p>The filter consumes the whole input stream on the first call to
 * {@link #incrementToken()}, groups the captured token states by term text,
 * and then emits exactly one token per distinct term. When the same term was
 * seen with several token types, the state whose type has the highest priority
 * in the {@code inputTypes} map wins.
 *
 * <p>Not thread-safe; one instance per TokenStream, per the usual Lucene
 * analysis contract.
 */
public final class AuthorRemoveDuplicatesTokenFilter extends TokenFilter {

  // term text -> (token type -> captured token state).
  // LinkedHashMap so that distinct terms are emitted in their original stream
  // order — a plain HashMap would emit them in hash order, making the output
  // token order non-deterministic across JVM runs.
  private Map<String, Map<String, AttributeSource.State>> cache = null;
  private Iterator<Entry<String, Map<String, State>>> iterator = null;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  // State captured after input.end(), replayed by our own end().
  private AttributeSource.State finalState;
  // token type -> priority; higher value wins when one term has several types.
  private final Map<String, Integer> inputTypes;

  /**
   * Create a new deduplicating filter around <code>input</code>. As with
   * any normal TokenFilter, do <em>not</em> call reset on the input; this
   * filter will do it normally.
   *
   * @param inputTypes maps a token type name to its priority; types absent
   *        from the map get priority -1 (lowest)
   */
  public AuthorRemoveDuplicatesTokenFilter(TokenStream input, Map<String, Integer> inputTypes) {
    super(input);
    this.inputTypes = inputTypes;
  }

  @Override
  public void reset() throws IOException {
    input.reset();
    if (cache != null) {
      cache.clear();
    }
    else {
      cache = new LinkedHashMap<String, Map<String, State>>();
    }
    // force the cache to be refilled on the next incrementToken()
    iterator = null;
  }

  /** The first time called, it'll read and cache all tokens from the input. */
  @Override
  public final boolean incrementToken() throws IOException {
    if (iterator == null) {
      fillCache();
      iterator = cache.entrySet().iterator();
    }

    if (!iterator.hasNext()) {
      // the cache is exhausted, return false
      return false;
    }
    restoreTypedState(iterator.next());
    return true;
  }

  /**
   * Restores the single captured state with the highest-priority token type
   * for this term; types unknown to {@code inputTypes} rank lowest (-1).
   */
  private void restoreTypedState(Entry<String, Map<String, State>> state) {
    Map<String, State> types = state.getValue();
    State saved = null;
    int currState = -1;

    // only one state will be resurrected (the one with highest priority);
    // `saved == null` guarantees we keep at least one state even when every
    // type ties at the default priority of -1
    for (Entry<String, State> s: types.entrySet()) {
      Integer p = inputTypes.getOrDefault(s.getKey(), -1);
      if (p > currState || saved == null) {
        saved = s.getValue();
        currState = p;
      }
    }
    restoreState(saved);
  }

  @Override
  public final void end() {
    // replay the end-state captured right after input.end() in fillCache()
    if (finalState != null) {
      restoreState(finalState);
    }
  }

  /** Drains the input stream, capturing one state per (term, type) pair. */
  private void fillCache() throws IOException {
    while (input.incrementToken()) {
      String term = termAtt.toString();
      // LinkedHashMap also for the per-term type map, for deterministic
      // iteration when several types tie on priority
      cache.computeIfAbsent(term, t -> new LinkedHashMap<String, AttributeSource.State>())
           .put(typeAtt.type(), captureState());
    }
    // capture final state (offsets etc. set by end())
    input.end();
    finalState = captureState();
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.analysis.author;


import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.solr.common.util.StrUtils;

/**
 * Factory for {@link AuthorRemoveDuplicatesTokenFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_rmdup" class="solr.TextField" positionIncrementGap="100"&gt;
 * &lt;analyzer&gt;
 * &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
 * &lt;filter class="solr.AuthorRemoveDuplicatesTokenFilterFactory" tokenTypes="AUTHOR_INPUT,AUTHOR_QUERY_VARIANT"/&gt;
 * &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 *
 * The optional <code>tokenTypes</code> argument is a comma-separated list of
 * token type names, ordered from highest to lowest priority; the filter keeps
 * the highest-priority token when several share the same term text.
 */
public class AuthorRemoveDuplicatesTokenFilterFactory extends TokenFilterFactory {

  // token type names in priority order (first = highest); null when the
  // tokenTypes argument was not supplied
  private List<String> tokenTypes = null;

  /** Creates a new AuthorRemoveDuplicatesTokenFilterFactory */
  public AuthorRemoveDuplicatesTokenFilterFactory(Map<String,String> args) {
    super(args);
    if (args.containsKey("tokenTypes")) {
      // remove() so the leftover-argument check below doesn't reject it
      tokenTypes = StrUtils.splitSmart(args.remove("tokenTypes"), ",", false);
    }
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public AuthorRemoveDuplicatesTokenFilter create(TokenStream input) {
    Map<String, Integer> inputTypes = new HashMap<String, Integer>();
    if (tokenTypes != null) {
      // assign descending priorities: the first listed type gets the highest
      // value (size), the last gets 1 — matching the "first wins" contract
      int i = tokenTypes.size();
      for (String s: tokenTypes) {
        inputTypes.put(s, i--);
      }
    }
    return new AuthorRemoveDuplicatesTokenFilter(input, inputTypes);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.io.Reader;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -137,7 +138,7 @@ public void add(Reader in) throws IOException, ParseException {
if (seen.contains(shortForm)) continue;
seen.add(shortForm);
newBr.append(escape(shortForm) + "=>" +
sides[0] + "," +
escape(sides[0]) + "," +
buildLine(names));
newBr.append("\n");
}
Expand Down Expand Up @@ -482,7 +483,8 @@ public void add(Reader in) throws IOException, ParseException {
String[] sides = line.split("=>");
if (sides.length > 1) { // explicit mapping
String[] names = getNames(sides[1]);
newBr.append(escape(names[0]));
Arrays.sort(names); // from shortest to longest
newBr.append(escape(sides[0]));
newBr.append("=>");
boolean first = false;
for (String n: names) {
Expand All @@ -494,6 +496,7 @@ public void add(Reader in) throws IOException, ParseException {
}
else {
String[] names = getNames(sides[0]);
Arrays.sort(names); // from shortest to longest (important if we want to see synonyms before automatically generated query variants)
newBr.append(buildLine(names));
}
newBr.append("\n");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ public void setDebug(boolean v) {
tp.setDebug(v);
}

/** Reports whether debug mode is enabled on the underlying tree processor. */
public boolean getDebug() {
  final boolean debugEnabled = tp.getDebug();
  return debugEnabled;
}

/*
* This is only for printing/debugging, DO NOT use this for testing!!!
*
Expand Down
Loading

0 comments on commit ada6e2e

Please sign in to comment.