Skip to content

Commit

Permalink
Almost converted the multi-synonyms class
Browse files Browse the repository at this point in the history
  • Loading branch information
romanchyla committed May 14, 2014
1 parent a8758ed commit c06fbad
Show file tree
Hide file tree
Showing 23 changed files with 257 additions and 244 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,10 @@

public class NewSemicolonSynonymParser extends NewSynonymFilterFactory.SynonymParser {
private final boolean expand;
private final Analyzer analyzer;

public NewSemicolonSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
super(dedup);
super(dedup, analyzer);
this.expand = expand;
this.analyzer = analyzer;
}

public void add(Reader in) throws IOException, ParseException {
Expand Down Expand Up @@ -52,19 +50,19 @@ private void addInternal(BufferedReader in) throws IOException {
String inputStrings[] = split(sides[0], ";");
inputs = new CharsRef[inputStrings.length];
for (int i = 0; i < inputs.length; i++) {
inputs[i] = analyze(analyzer, inputStrings[i].trim(), new CharsRef());
inputs[i] = analyze(inputStrings[i].trim(), new CharsRef());
}

String outputStrings[] = split(sides[1], ";");
outputs = new CharsRef[outputStrings.length];
for (int i = 0; i < outputs.length; i++) {
outputs[i] = analyze(analyzer, outputStrings[i].trim(), new CharsRef());
outputs[i] = analyze(outputStrings[i].trim(), new CharsRef());
}
} else {
String inputStrings[] = split(line, ";");
inputs = new CharsRef[inputStrings.length];
for (int i = 0; i < inputs.length; i++) {
inputs[i] = analyze(analyzer, inputStrings[i].trim(), new CharsRef());
inputs[i] = analyze(inputStrings[i].trim(), new CharsRef());
}
if (expand) {
outputs = inputs;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,10 @@
*/
public class NewSolrSynonymParser extends NewSynonymFilterFactory.SynonymParser {
private final boolean expand;
private final Analyzer analyzer;

public NewSolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
super(dedup);
super(dedup, analyzer);
this.expand = expand;
this.analyzer = analyzer;
}

public void add(Reader in) throws IOException, ParseException {
Expand Down Expand Up @@ -96,19 +94,19 @@ private void addInternal(BufferedReader in) throws IOException {
String inputStrings[] = split(sides[0], ",");
inputs = new CharsRef[inputStrings.length];
for (int i = 0; i < inputs.length; i++) {
inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRef());
}

String outputStrings[] = split(sides[1], ",");
outputs = new CharsRef[outputStrings.length];
for (int i = 0; i < outputs.length; i++) {
outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
outputs[i] = analyze(unescape(outputStrings[i]).trim(), new CharsRef());
}
} else {
String inputStrings[] = split(line, ",");
inputs = new CharsRef[inputStrings.length];
for (int i = 0; i < inputs.length; i++) {
inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRef());
}
if (expand) {
outputs = inputs;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,13 @@
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -95,23 +97,26 @@ public void inform(ResourceLoader loader) throws IOException {
}


public static class SynonymParser extends SynonymMap.Builder {
public static class SynonymParser extends SynonymMap.Parser {

public SynonymParser(boolean dedup) {
super(dedup);
public SynonymParser(boolean dedup, Analyzer analyzer) {
super(dedup, analyzer);
}

public void add(Reader in) throws IOException, ParseException {
throw new IllegalAccessError("You must override this method");
}

@Override
public void parse(Reader in) throws IOException, ParseException {}
}


public static class SynonymBuilderFactory extends TokenizerFactory implements ResourceLoaderAware {

protected Map<String,String> args;

protected SynonymBuilderFactory(Map<String,String> args) {
public SynonymBuilderFactory(Map<String,String> args) {
super(args);
this.args = args;
}
Expand Down Expand Up @@ -188,10 +193,15 @@ protected SynonymParser getParser(Analyzer analyzer) {

// (there are no tests for this functionality)
private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname) throws IOException {
TokenizerFactory tokFactory = TokenizerFactory.forName(cname, this.args);
tokFactory.setExplicitLuceneMatchVersion(true);
if (tokFactory instanceof ResourceLoaderAware) {
((ResourceLoaderAware) tokFactory).inform(loader);
Class<? extends TokenizerFactory> clazz = loader.findClass(cname, TokenizerFactory.class);
TokenizerFactory tokFactory;
try {
tokFactory = clazz.getConstructor(Map.class).newInstance(new HashMap<String, String>());
if (tokFactory instanceof ResourceLoaderAware) {
((ResourceLoaderAware) tokFactory).inform(loader);
}
} catch (Exception e) {
throw new RuntimeException(e);
}
return tokFactory;
}
Expand All @@ -203,14 +213,20 @@ public void inform(ResourceLoader loader) throws IOException {

}

private SynonymBuilderFactory loadBuilderFactory(ResourceLoader loader, String cname) throws IOException {
TokenizerFactory builderFactory = TokenizerFactory.forName(cname, args);
builderFactory.setExplicitLuceneMatchVersion(true);
if (builderFactory instanceof ResourceLoaderAware) {
((ResourceLoaderAware) builderFactory).inform(loader);
}
return (SynonymBuilderFactory) builderFactory;
}

//(there are no tests for this functionality)
/**
 * Instantiates the synonym builder factory named by {@code cname}.
 *
 * <p>The class is resolved through {@code loader}, created via its public
 * {@code (Map)} constructor with this factory's {@code args}, and then handed
 * the loader through {@code inform} when it is {@link ResourceLoaderAware}.
 *
 * @param loader resource loader used both to resolve the class and to inform the new instance
 * @param cname  name of the {@link SynonymBuilderFactory} implementation to load
 * @return the constructed (and, where applicable, informed) builder factory
 * @throws IOException      if {@code inform(loader)} fails
 * @throws RuntimeException if the class cannot be instantiated reflectively
 *                          (missing or inaccessible constructor, or the constructor itself threw)
 */
private SynonymBuilderFactory loadBuilderFactory(ResourceLoader loader, String cname) throws IOException {
    Class<? extends SynonymBuilderFactory> clazz = loader.findClass(cname, SynonymBuilderFactory.class);
    SynonymBuilderFactory builderFactory;
    try {
        // getConstructor only sees public constructors, hence the (Map) constructors must be public
        builderFactory = clazz.getConstructor(Map.class).newInstance(args);
    } catch (ReflectiveOperationException e) {
        throw new RuntimeException("Cannot instantiate synonym builder factory: " + cname, e);
    }
    // Deliberately outside the try block: an IOException raised by inform() must reach the
    // caller as declared instead of being wrapped into a RuntimeException.
    if (builderFactory instanceof ResourceLoaderAware) {
        ((ResourceLoaderAware) builderFactory).inform(loader);
    }
    return builderFactory;
}


/*
Expand All @@ -231,7 +247,7 @@ private SynonymBuilderFactory loadBuilderFactory(ResourceLoader loader, String c
* 2: telescope
*/
public static class AlwaysIncludeOriginal extends SynonymBuilderFactory {
protected AlwaysIncludeOriginal(Map<String,String> args) {
public AlwaysIncludeOriginal(Map<String,String> args) {
super(args);
}

Expand Down Expand Up @@ -286,7 +302,7 @@ public void add(CharsRef input, CharsRef output, boolean includeOrig) {
* 4: was
*/
public static class MultiTokenReplaceNulls extends SynonymBuilderFactory {
protected MultiTokenReplaceNulls(Map<String,String> args) {
public MultiTokenReplaceNulls(Map<String,String> args) {
super(args);
}

Expand All @@ -313,7 +329,7 @@ public void add(CharsRef input, CharsRef output, boolean includeOrig) {
*/
public static class BestEffortSearchLowercase extends SynonymBuilderFactory {
private Map<String,String> args;
protected BestEffortSearchLowercase(Map<String,String> args) {
public BestEffortSearchLowercase(Map<String,String> args) {
super(args);
this.args = args;
}
Expand Down Expand Up @@ -355,7 +371,7 @@ private CharsRef lowercase(CharsRef chars) {
*/
public static class BestEffortIgnoreCaseSelectively extends SynonymBuilderFactory {
private Map<String,String> args;
protected BestEffortIgnoreCaseSelectively(Map<String,String> args) {
public BestEffortIgnoreCaseSelectively(Map<String,String> args) {
super(args);
this.args = args;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public class NewWordnetSynonymParser extends NewSynonymFilterFactory.SynonymPars
private final Analyzer analyzer;

public NewWordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
super(dedup);
super(dedup, analyzer);
this.expand = expand;
this.analyzer = analyzer;
}
Expand Down Expand Up @@ -89,7 +89,7 @@ private CharsRef parseSynonym(String line, CharsRef reuse) throws IOException {
int end = line.lastIndexOf('\'');

String text = line.substring(start, end).replace("''", "'");
return analyze(analyzer, text, reuse);
return analyze(text, reuse);
}

private void addInternal(CharsRef synset[], int size) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.TokenFilterFactory;
Expand All @@ -32,9 +33,11 @@
import org.apache.lucene.queryparser.flexible.standard.nodes.PrefixWildcardQueryNode;
import org.apache.lucene.queryparser.flexible.standard.nodes.RegexpQueryNode;
import org.apache.lucene.queryparser.flexible.standard.nodes.WildcardQueryNode;
import org.apache.solr.analysis.author.AuthorNormalizeFilter;
import org.apache.solr.analysis.author.AuthorNormalizeFilterFactory;
import org.apache.solr.analysis.author.AuthorUtils;
import org.apache.solr.analysis.author.PythonicAuthorNormalizeFilterFactory;
import org.apache.solr.analysis.author.PythonicAuthorNormalizerFilter;

/**
* Looks at the QueryNode(s) and if they are author searches,
Expand Down Expand Up @@ -270,33 +273,28 @@ private boolean isLongForm(String name) {
* chain, you should always review also this method
*/

private TokenStreamComponents tsc = null;
private ReusableStringReader reader = null;
/**
 * Analyzer used to normalize an author name: the whole input is kept as a single
 * keyword token, which is passed through {@link PythonicAuthorNormalizerFilter}
 * and then through {@link AuthorNormalizeFilter}.
 */
Analyzer authorNameAnalyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer keywordSource = new KeywordTokenizer(reader);
        // Filter order matters: pythonic normalization runs first, author normalization second.
        TokenStream normalized =
            new AuthorNormalizeFilter(new PythonicAuthorNormalizerFilter(keywordSource));
        return new TokenStreamComponents(keywordSource, normalized);
    }
};

private List<String> normalizeAuthorName(String input) throws QueryNodeException {
if (reader == null) { // well, nice try, but it will be always created new...
TokenFilterFactory[] filters = new TokenFilterFactory[2];
TokenizerFactory tokenizer = new KeywordTokenizerFactory(new HashMap<String,String>());
filters[1] = new AuthorNormalizeFilterFactory(new HashMap<String, String>());
filters[0] = new PythonicAuthorNormalizeFilterFactory(new HashMap<String, String>());
reader = new ReusableStringReader();
Tokenizer tk = tokenizer.create( reader );
TokenStream ts = tk;
for (TokenFilterFactory filter : filters) {
ts = filter.create(ts);
}
tsc = new TokenStreamComponents(tk, ts);
}

TokenStream ts = tsc.getTokenStream();
reader.setValue(input);

try {
TokenStream ts = authorNameAnalyzer.tokenStream("foo", input);
ts.reset();
List<String> out = new ArrayList<String>();
CharTermAttribute termAtt;
while (ts.incrementToken()) {
termAtt = ts.getAttribute(CharTermAttribute.class);
out.add(termAtt.toString());
}
ts.close();
return out;
} catch (IOException e) {
throw new QueryNodeException(new MessageImpl("Error parsing: " + input, e));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,6 @@ public PersistingMapTokenFilterFactory(Map<String, String> args) {
if (args.containsKey("syntax")) {
this.syntax = args.remove("syntax");
}

if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameter(s): " + args);
}
}

public void inform(ResourceLoader loader) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,20 @@ public class AuthorCollectorFactory extends PersistingMapTokenFilterFactory {
public AuthorCollectorFactory(Map<String, String> args) {
super(args);
if (args.containsKey("tokenTypes")) {
tokenTypes = StrUtils.splitSmart(args.get("tokenTypes"), ",", false);
tokenTypes = StrUtils.splitSmart(args.remove("tokenTypes"), ",", false);
}
else {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The tokenType parameter missing");
}
emitTokens = false;
if (args.containsKey("emitTokens")) {
if (((String) args.get("emitTokens")).equals("true")) {
if (((String) args.remove("emitTokens")).equals("true")) {
emitTokens = true;
}
}

if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameter(s): " + args);
}
}

/* (non-Javadoc)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ public final class AuthorCollectorFilter extends TokenFilter {
private Set<String> tokenBuffer;
private Set<String> tokenTypes;
private String authorInput;
private int resetCounter;

public AuthorCollectorFilter(TokenStream input, WriteableSynonymMap synMap) {
super(input);
Expand All @@ -47,7 +46,6 @@ public AuthorCollectorFilter(TokenStream input, WriteableSynonymMap synMap) {
tokenBuffer = new LinkedHashSet<String>();
tokenTypes = new HashSet<String>();
this.synMap = synMap;
resetCounter = 0;
}


Expand All @@ -56,7 +54,6 @@ public AuthorCollectorFilter(TokenStream input, WriteableSynonymMap synMap) {
*/
@Override
public boolean incrementToken() throws IOException {
resetCounter = 0;

if (!input.incrementToken()) {
return false;
Expand Down Expand Up @@ -103,11 +100,18 @@ private void addTokensToSynMap() {
@Override
public void reset() throws IOException {
super.reset();
}

@Override
public void end() throws IOException {
super.end();
addTokensToSynMap();
resetCounter++;
if (resetCounter > 2) {
synMap.persist();
}
}

/**
 * Persists {@code synMap} and then closes this filter.
 *
 * <p>{@code super.close()} is invoked from a {@code finally} block so that the
 * wrapped input stream is still released when persisting fails.
 *
 * @throws IOException if closing the underlying stream fails
 */
@Override
public void close() throws IOException {
    try {
        synMap.persist();
    } finally {
        // Always close the upstream, even if persist() threw.
        super.close();
    }
}

public void setEmitTokens(boolean b) {
Expand Down
Loading

0 comments on commit c06fbad

Please sign in to comment.