From 10f562925a56ad6483624c502e4089c60d2f1b7e Mon Sep 17 00:00:00 2001 From: Roman Chyla Date: Tue, 27 Sep 2016 17:22:11 -0400 Subject: [PATCH] Solved the WDFF problem --- .../miscellaneous/AqpWordDelimiterFilter.java | 682 ++++++++++++++++++ .../AqpWordDelimiterFilterFactory.java | 204 ++++++ .../analysis/DateNormalizerTokenFilter.java | 5 +- .../TestAdsabsTypeFulltextParsing.java | 5 +- .../analysis/TestDateNormalizerFilter.java | 26 +- .../flexible/aqp/TestAqpSLGSimple.java | 24 +- .../adsabs/solr/collection1/conf/schema.xml | 8 +- 7 files changed, 918 insertions(+), 36 deletions(-) create mode 100644 contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilter.java create mode 100644 contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilterFactory.java diff --git a/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilter.java b/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilter.java new file mode 100644 index 000000000..a98abdcd1 --- /dev/null +++ b/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilter.java @@ -0,0 +1,682 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.miscellaneous; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.InPlaceMergeSorter; + +import java.io.IOException; +import java.util.Arrays; + +/** + * + * LAMENT + * + * Exact copy of a WDDFilter: OK, I'll say it; that filter was written by someone + * with terrible sleep deprivation. It is sorting buffered tokens based on start + * offsets (instead of end offsets; the previous version of the WDDF had that + * behaviour). But what is absolutely outrageous is that the author made the + * 'sorter' protected, so I could have changed it without making this stupid clone. + * Had he declared it to be of a generic type. Of course he didn't! + * + * The *only* change I needed to make is marked by string: SILLYPANTIE + * + * Splits words into subwords and performs optional transformations on subword + * groups. Words are split into subwords with the following rules: + * + * + * The combinations parameter affects how subwords are combined: + * + * One use for {@link WordDelimiterFilter} is to help match words with different + * subword delimiters. For example, if the source text contained "wi-fi" one may + * want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match. 
One way of doing so + * is to specify combinations="1" in the analyzer used for indexing, and + * combinations="0" (the default) in the analyzer used for querying. Given that + * the current {@link StandardTokenizer} immediately removes many intra-word + * delimiters, it is recommended that this filter be used after a tokenizer that + * does not do this (such as {@link WhitespaceTokenizer}). + */ +public final class AqpWordDelimiterFilter extends TokenFilter { + + public static final int LOWER = 0x01; + public static final int UPPER = 0x02; + public static final int DIGIT = 0x04; + public static final int SUBWORD_DELIM = 0x08; + + // combinations: for testing, not for setting bits + public static final int ALPHA = 0x03; + public static final int ALPHANUM = 0x07; + + /** + * Causes parts of words to be generated: + *

+ * "PowerShot" => "Power" "Shot" + */ + public static final int GENERATE_WORD_PARTS = 1; + + /** + * Causes number subwords to be generated: + *

+ * "500-42" => "500" "42" + */ + public static final int GENERATE_NUMBER_PARTS = 2; + + /** + * Causes maximum runs of word parts to be catenated: + *

+ * "wi-fi" => "wifi" + */ + public static final int CATENATE_WORDS = 4; + + /** + * Causes maximum runs of number parts to be catenated: + *

+ * "500-42" => "50042" + */ + public static final int CATENATE_NUMBERS = 8; + + /** + * Causes all subword parts to be catenated: + *

+ * "wi-fi-4000" => "wifi4000" + */ + public static final int CATENATE_ALL = 16; + + /** + * Causes original words to be preserved and added to the subword list (defaults to false) + *

+ * "500-42" => "500" "42" "500-42" + */ + public static final int PRESERVE_ORIGINAL = 32; + + /** + * If not set, causes case changes to be ignored (subwords will only be generated + * given SUBWORD_DELIM tokens) + */ + public static final int SPLIT_ON_CASE_CHANGE = 64; + + /** + * If not set, causes numeric changes to be ignored (subwords will only be generated + * given SUBWORD_DELIM tokens). + */ + public static final int SPLIT_ON_NUMERICS = 128; + + /** + * Causes trailing "'s" to be removed for each subword + *

+ * "O'Neil's" => "O", "Neil" + */ + public static final int STEM_ENGLISH_POSSESSIVE = 256; + + /** + * If not null is the set of tokens to protect from being delimited + * + */ + final CharArraySet protWords; + + private final int flags; + + private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); + private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); + + // used for iterating word delimiter breaks + private final WordDelimiterIterator iterator; + + // used for concatenating runs of similar typed subwords (word,number) + private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation(); + // number of subwords last output by concat. + private int lastConcatCount = 0; + + // used for catenate all + private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation(); + + // used for accumulating position increment gaps + private int accumPosInc = 0; + + private char savedBuffer[] = new char[1024]; + private int savedStartOffset; + private int savedEndOffset; + private String savedType; + private boolean hasSavedState = false; + // if length by start + end offsets doesn't match the term text then assume + // this is a synonym and don't adjust the offsets. + private boolean hasIllegalOffsets = false; + + // for a run of the same subword type within a word, have we output anything? + private boolean hasOutputToken = false; + // when preserve original is on, have we output any token following it? + // this token must have posInc=0! 
+ private boolean hasOutputFollowingOriginal = false; + + /** + * Creates a new WordDelimiterFilter + * + * @param in TokenStream to be filtered + * @param charTypeTable table containing character types + * @param configurationFlags Flags configuring the filter + * @param protWords If not null is the set of tokens to protect from being delimited + */ + public AqpWordDelimiterFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) { + super(in); + this.flags = configurationFlags; + this.protWords = protWords; + this.iterator = new WordDelimiterIterator( + charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE)); + } + + /** + * Creates a new WordDelimiterFilter using {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE} + * as its charTypeTable + * + * @param in TokenStream to be filtered + * @param configurationFlags Flags configuring the filter + * @param protWords If not null is the set of tokens to protect from being delimited + */ + public AqpWordDelimiterFilter(TokenStream in, int configurationFlags, CharArraySet protWords) { + this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords); + } + + @Override + public boolean incrementToken() throws IOException { + while (true) { + if (!hasSavedState) { + // process a new input word + if (!input.incrementToken()) { + return false; + } + + int termLength = termAttribute.length(); + char[] termBuffer = termAttribute.buffer(); + + accumPosInc += posIncAttribute.getPositionIncrement(); + + iterator.setText(termBuffer, termLength); + iterator.next(); + + // word of no delimiters, or protected word: just return it + if ((iterator.current == 0 && iterator.end == termLength) || + (protWords != null && protWords.contains(termBuffer, 0, termLength))) { + posIncAttribute.setPositionIncrement(accumPosInc); + accumPosInc = 0; + first = false; + return true; + } + + // word of simply delimiters + if (iterator.end == 
WordDelimiterIterator.DONE && !has(PRESERVE_ORIGINAL)) { + // if the posInc is 1, simply ignore it in the accumulation + // TODO: proper hole adjustment (FilteringTokenFilter-like) instead of this previous logic! + if (posIncAttribute.getPositionIncrement() == 1 && !first) { + accumPosInc--; + } + continue; + } + + saveState(); + + hasOutputToken = false; + hasOutputFollowingOriginal = !has(PRESERVE_ORIGINAL); + lastConcatCount = 0; + + if (has(PRESERVE_ORIGINAL)) { + posIncAttribute.setPositionIncrement(accumPosInc); + accumPosInc = 0; + first = false; + return true; + } + } + + // at the end of the string, output any concatenations + if (iterator.end == WordDelimiterIterator.DONE) { + if (!concat.isEmpty()) { + if (flushConcatenation(concat)) { + buffer(); + continue; + } + } + + if (!concatAll.isEmpty()) { + // only if we haven't output this same combo above! + if (concatAll.subwordCount > lastConcatCount) { + concatAll.writeAndClear(); + buffer(); + continue; + } + concatAll.clear(); + } + + if (bufferedPos < bufferedLen) { + if (bufferedPos == 0) { + // sorter.sort(0, bufferedLen); SILLYPANTIE + } + clearAttributes(); + restoreState(buffered[bufferedPos++]); + if (first && posIncAttribute.getPositionIncrement() == 0) { + // can easily happen with strange combinations (e.g. not outputting numbers, but concat-all) + posIncAttribute.setPositionIncrement(1); + } + first = false; + return true; + } + + // no saved concatenations, on to the next input word + bufferedPos = bufferedLen = 0; + hasSavedState = false; + continue; + } + + // word surrounded by delimiters: always output + if (iterator.isSingleWord()) { + generatePart(true); + iterator.next(); + first = false; + return true; + } + + int wordType = iterator.type(); + + // do we already have queued up incompatible concatenations? 
+ if (!concat.isEmpty() && (concat.type & wordType) == 0) { + if (flushConcatenation(concat)) { + hasOutputToken = false; + buffer(); + continue; + } + hasOutputToken = false; + } + + // add subwords depending upon options + if (shouldConcatenate(wordType)) { + if (concat.isEmpty()) { + concat.type = wordType; + } + concatenate(concat); + } + + // add all subwords (catenateAll) + if (has(CATENATE_ALL)) { + concatenate(concatAll); + } + + // if we should output the word or number part + if (shouldGenerateParts(wordType)) { + generatePart(false); + buffer(); + } + + iterator.next(); + } + } + + @Override + public void reset() throws IOException { + super.reset(); + hasSavedState = false; + concat.clear(); + concatAll.clear(); + accumPosInc = bufferedPos = bufferedLen = 0; + first = true; + } + + // ================================================= Helper Methods ================================================ + + + private AttributeSource.State buffered[] = new AttributeSource.State[8]; + private int startOff[] = new int[8]; + private int posInc[] = new int[8]; + private int bufferedLen = 0; + private int bufferedPos = 0; + private boolean first; + + private class OffsetSorter extends InPlaceMergeSorter { + @Override + protected int compare(int i, int j) { + int cmp = Integer.compare(startOff[i], startOff[j]); + if (cmp == 0) { + cmp = Integer.compare(posInc[j], posInc[i]); + } + return cmp; + } + + @Override + protected void swap(int i, int j) { + AttributeSource.State tmp = buffered[i]; + buffered[i] = buffered[j]; + buffered[j] = tmp; + + int tmp2 = startOff[i]; + startOff[i] = startOff[j]; + startOff[j] = tmp2; + + tmp2 = posInc[i]; + posInc[i] = posInc[j]; + posInc[j] = tmp2; + } + } + + final OffsetSorter sorter = new OffsetSorter(); + + private void buffer() { + if (bufferedLen == buffered.length) { + int newSize = ArrayUtil.oversize(bufferedLen+1, 8); + buffered = Arrays.copyOf(buffered, newSize); + startOff = Arrays.copyOf(startOff, newSize); + posInc = 
Arrays.copyOf(posInc, newSize); + } + startOff[bufferedLen] = offsetAttribute.startOffset(); + posInc[bufferedLen] = posIncAttribute.getPositionIncrement(); + buffered[bufferedLen] = captureState(); + bufferedLen++; + } + + /** + * Saves the existing attribute states + */ + private void saveState() { + // otherwise, we have delimiters, save state + savedStartOffset = offsetAttribute.startOffset(); + savedEndOffset = offsetAttribute.endOffset(); + // if length by start + end offsets doesn't match the term text then assume this is a synonym and don't adjust the offsets. + hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.length()); + savedType = typeAttribute.type(); + + if (savedBuffer.length < termAttribute.length()) { + savedBuffer = new char[ArrayUtil.oversize(termAttribute.length(), Character.BYTES)]; + } + + System.arraycopy(termAttribute.buffer(), 0, savedBuffer, 0, termAttribute.length()); + iterator.text = savedBuffer; + + hasSavedState = true; + } + + /** + * Flushes the given WordDelimiterConcatenation by either writing its concat and then clearing, or just clearing. 
+ * + * @param concatenation WordDelimiterConcatenation that will be flushed + * @return {@code true} if the concatenation was written before it was cleared, {@code false} otherwise + */ + private boolean flushConcatenation(WordDelimiterConcatenation concatenation) { + lastConcatCount = concatenation.subwordCount; + if (concatenation.subwordCount != 1 || !shouldGenerateParts(concatenation.type)) { + concatenation.writeAndClear(); + return true; + } + concatenation.clear(); + return false; + } + + /** + * Determines whether to concatenate a word or number if the current word is the given type + * + * @param wordType Type of the current word used to determine if it should be concatenated + * @return {@code true} if concatenation should occur, {@code false} otherwise + */ + private boolean shouldConcatenate(int wordType) { + return (has(CATENATE_WORDS) && isAlpha(wordType)) || (has(CATENATE_NUMBERS) && isDigit(wordType)); + } + + /** + * Determines whether a word/number part should be generated for a word of the given type + * + * @param wordType Type of the word used to determine if a word/number part should be generated + * @return {@code true} if a word/number part should be generated, {@code false} otherwise + */ + private boolean shouldGenerateParts(int wordType) { + return (has(GENERATE_WORD_PARTS) && isAlpha(wordType)) || (has(GENERATE_NUMBER_PARTS) && isDigit(wordType)); + } + + /** + * Concatenates the saved buffer to the given WordDelimiterConcatenation + * + * @param concatenation WordDelimiterConcatenation to concatenate the buffer to + */ + private void concatenate(WordDelimiterConcatenation concatenation) { + if (concatenation.isEmpty()) { + concatenation.startOffset = savedStartOffset + iterator.current; + } + concatenation.append(savedBuffer, iterator.current, iterator.end - iterator.current); + concatenation.endOffset = savedStartOffset + iterator.end; + } + + /** + * Generates a word/number part, updating the appropriate attributes + * + * @param 
isSingleWord {@code true} if the generation is occurring from a single word, {@code false} otherwise + */ + private void generatePart(boolean isSingleWord) { + clearAttributes(); + termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current); + + int startOffset = savedStartOffset + iterator.current; + int endOffset = savedStartOffset + iterator.end; + + if (hasIllegalOffsets) { + // historically this filter did this regardless for 'isSingleWord', + // but we must do a sanity check: + if (isSingleWord && startOffset <= savedEndOffset) { + offsetAttribute.setOffset(startOffset, savedEndOffset); + } else { + offsetAttribute.setOffset(savedStartOffset, savedEndOffset); + } + } else { + offsetAttribute.setOffset(startOffset, endOffset); + } + posIncAttribute.setPositionIncrement(position(false)); + typeAttribute.setType(savedType); + } + + /** + * Get the position increment gap for a subword or concatenation + * + * @param inject true if this token wants to be injected + * @return position increment gap + */ + private int position(boolean inject) { + int posInc = accumPosInc; + + if (hasOutputToken) { + accumPosInc = 0; + return inject ? 
0 : Math.max(1, posInc); + } + + hasOutputToken = true; + + if (!hasOutputFollowingOriginal) { + // the first token following the original is 0 regardless + hasOutputFollowingOriginal = true; + return 0; + } + // clear the accumulated position increment + accumPosInc = 0; + return Math.max(1, posInc); + } + + /** + * Checks if the given word type includes {@link #ALPHA} + * + * @param type Word type to check + * @return {@code true} if the type contains ALPHA, {@code false} otherwise + */ + static boolean isAlpha(int type) { + return (type & ALPHA) != 0; + } + + /** + * Checks if the given word type includes {@link #DIGIT} + * + * @param type Word type to check + * @return {@code true} if the type contains DIGIT, {@code false} otherwise + */ + static boolean isDigit(int type) { + return (type & DIGIT) != 0; + } + + /** + * Checks if the given word type includes {@link #SUBWORD_DELIM} + * + * @param type Word type to check + * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise + */ + static boolean isSubwordDelim(int type) { + return (type & SUBWORD_DELIM) != 0; + } + + /** + * Checks if the given word type includes {@link #UPPER} + * + * @param type Word type to check + * @return {@code true} if the type contains UPPER, {@code false} otherwise + */ + static boolean isUpper(int type) { + return (type & UPPER) != 0; + } + + /** + * Determines whether the given flag is set + * + * @param flag Flag to see if set + * @return {@code true} if flag is set + */ + private boolean has(int flag) { + return (flags & flag) != 0; + } + + // ================================================= Inner Classes ================================================= + + /** + * A WDF concatenated 'run' + */ + final class WordDelimiterConcatenation { + final StringBuilder buffer = new StringBuilder(); + int startOffset; + int endOffset; + int type; + int subwordCount; + + /** + * Appends the given text of the given length, to the concetenation at the given offset + 
* + * @param text Text to append + * @param offset Offset in the concetenation to add the text + * @param length Length of the text to append + */ + void append(char text[], int offset, int length) { + buffer.append(text, offset, length); + subwordCount++; + } + + /** + * Writes the concatenation to the attributes + */ + void write() { + clearAttributes(); + if (termAttribute.length() < buffer.length()) { + termAttribute.resizeBuffer(buffer.length()); + } + char termbuffer[] = termAttribute.buffer(); + + buffer.getChars(0, buffer.length(), termbuffer, 0); + termAttribute.setLength(buffer.length()); + + if (hasIllegalOffsets) { + offsetAttribute.setOffset(savedStartOffset, savedEndOffset); + } + else { + offsetAttribute.setOffset(startOffset, endOffset); + } + posIncAttribute.setPositionIncrement(position(true)); + typeAttribute.setType(savedType); + accumPosInc = 0; + } + + /** + * Determines if the concatenation is empty + * + * @return {@code true} if the concatenation is empty, {@code false} otherwise + */ + boolean isEmpty() { + return buffer.length() == 0; + } + + /** + * Clears the concatenation and resets its state + */ + void clear() { + buffer.setLength(0); + startOffset = endOffset = type = subwordCount = 0; + } + + /** + * Convenience method for the common scenario of having to write the concetenation and then clearing its state + */ + void writeAndClear() { + write(); + clear(); + } + } + // questions: + // negative numbers? -42 indexed as just 42? + // dollar sign? $42 + // percent sign? 33% + // downsides: if source text is "powershot" then a query of "PowerShot" won't match! 
+} diff --git a/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilterFactory.java b/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilterFactory.java new file mode 100644 index 000000000..4d3abc043 --- /dev/null +++ b/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilterFactory.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.miscellaneous; + + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.util.Version; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.io.IOException; + +import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*; + +/** + * + * NOTE: I hate duplicating code. 
But the 'smart' Lucene devs sometimes leave me no + * other option. Please look at the comment in the AqpWordDelimiterFilter for explanation. + * + * Factory for {@link AqpWordDelimiterFilter}. + *

+ * <fieldType name="text_wd" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.WordDelimiterFilterFactory" protected="protectedword.txt"
+ *             preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1"
+ *             catenateWords="0" catenateNumbers="0" catenateAll="0"
+ *             generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"
+ *             types="wdfftypes.txt" />
+ *   </analyzer>
+ * </fieldType>
+ */ +public class AqpWordDelimiterFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + public static final String PROTECTED_TOKENS = "protected"; + public static final String TYPES = "types"; + + private final String wordFiles; + private final String types; + private final int flags; + byte[] typeTable = null; + private CharArraySet protectedWords = null; + + /** Creates a new WordDelimiterFilterFactory */ + public AqpWordDelimiterFilterFactory(Map args) { + super(args); + int flags = 0; + if (getInt(args, "generateWordParts", 1) != 0) { + flags |= GENERATE_WORD_PARTS; + } + if (getInt(args, "generateNumberParts", 1) != 0) { + flags |= GENERATE_NUMBER_PARTS; + } + if (getInt(args, "catenateWords", 0) != 0) { + flags |= CATENATE_WORDS; + } + if (getInt(args, "catenateNumbers", 0) != 0) { + flags |= CATENATE_NUMBERS; + } + if (getInt(args, "catenateAll", 0) != 0) { + flags |= CATENATE_ALL; + } + if (getInt(args, "splitOnCaseChange", 1) != 0) { + flags |= SPLIT_ON_CASE_CHANGE; + } + if (getInt(args, "splitOnNumerics", 1) != 0) { + flags |= SPLIT_ON_NUMERICS; + } + if (getInt(args, "preserveOriginal", 0) != 0) { + flags |= PRESERVE_ORIGINAL; + } + if (getInt(args, "stemEnglishPossessive", 1) != 0) { + flags |= STEM_ENGLISH_POSSESSIVE; + } + wordFiles = get(args, PROTECTED_TOKENS); + types = get(args, TYPES); + this.flags = flags; + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public void inform(ResourceLoader loader) throws IOException { + if (wordFiles != null) { + protectedWords = getWordSet(loader, wordFiles, false); + } + if (types != null) { + List files = splitFileNames( types ); + List wlist = new ArrayList<>(); + for( String file : files ){ + List lines = getLines(loader, file.trim()); + wlist.addAll( lines ); + } + typeTable = parseTypes(wlist); + } + } + + @Override + public TokenFilter create(TokenStream input) { + return new AqpWordDelimiterFilter(input, typeTable 
== null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable, + flags, protectedWords); + } + + // source => type + private static Pattern typePattern = Pattern.compile( "(.*)\\s*=>\\s*(.*)\\s*$" ); + + // parses a list of MappingCharFilter style rules into a custom byte[] type table + private byte[] parseTypes(List rules) { + SortedMap typeMap = new TreeMap<>(); + for( String rule : rules ){ + Matcher m = typePattern.matcher(rule); + if( !m.find() ) + throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]"); + String lhs = parseString(m.group(1).trim()); + Byte rhs = parseType(m.group(2).trim()); + if (lhs.length() != 1) + throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed."); + if (rhs == null) + throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Illegal type."); + typeMap.put(lhs.charAt(0), rhs); + } + + // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance + byte types[] = new byte[Math.max(typeMap.lastKey()+1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; + for (int i = 0; i < types.length; i++) + types[i] = WordDelimiterIterator.getType(i); + for (Map.Entry mapping : typeMap.entrySet()) + types[mapping.getKey()] = mapping.getValue(); + return types; + } + + private Byte parseType(String s) { + if (s.equals("LOWER")) + return LOWER; + else if (s.equals("UPPER")) + return UPPER; + else if (s.equals("ALPHA")) + return ALPHA; + else if (s.equals("DIGIT")) + return DIGIT; + else if (s.equals("ALPHANUM")) + return ALPHANUM; + else if (s.equals("SUBWORD_DELIM")) + return SUBWORD_DELIM; + else + return null; + } + + char[] out = new char[256]; + + private String parseString(String s){ + int readPos = 0; + int len = s.length(); + int writePos = 0; + while( readPos < len ){ + char c = s.charAt( readPos++ ); + if( c == '\\' ){ + if( readPos >= len ) + throw new IllegalArgumentException("Invalid escaped char in [" + 
s + "]"); + c = s.charAt( readPos++ ); + switch( c ) { + case '\\' : c = '\\'; break; + case 'n' : c = '\n'; break; + case 't' : c = '\t'; break; + case 'r' : c = '\r'; break; + case 'b' : c = '\b'; break; + case 'f' : c = '\f'; break; + case 'u' : + if( readPos + 3 >= len ) + throw new IllegalArgumentException("Invalid escaped char in [" + s + "]"); + c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 ); + readPos += 4; + break; + } + } + out[writePos++] = c; + } + return new String( out, 0, writePos ); + } +} diff --git a/contrib/adsabs/src/java/org/apache/solr/analysis/DateNormalizerTokenFilter.java b/contrib/adsabs/src/java/org/apache/solr/analysis/DateNormalizerTokenFilter.java index caeb8f831..5f23a279b 100644 --- a/contrib/adsabs/src/java/org/apache/solr/analysis/DateNormalizerTokenFilter.java +++ b/contrib/adsabs/src/java/org/apache/solr/analysis/DateNormalizerTokenFilter.java @@ -9,6 +9,8 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.document.DateTools; +import org.apache.lucene.document.DateTools.Resolution; import org.apache.solr.util.DateMathParser; public final class DateNormalizerTokenFilter extends TokenFilter { @@ -17,6 +19,7 @@ public final class DateNormalizerTokenFilter extends TokenFilter { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private DateMathParser dmp; private String offset; + private SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss:SSS", Locale.ROOT); public DateNormalizerTokenFilter(TokenStream input, String incomingFormat, String offset) { super(input); @@ -61,7 +64,7 @@ private CharSequence normalize(String string) { //else { //date = dmp.parseMath("+5MINUTES"); // 00-00 dates are 1 minute after midnight //} - return f.format(date); + return sdf.format(date); } catch (ParseException e) { //pass } diff --git 
a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java index fb4fd3e9b..d33c1aa30 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java @@ -237,9 +237,9 @@ public void testMultiTokens() throws Exception { //dumpDoc(null, "id", "title"); assertQueryEquals(req("q", "title:\"bubble pace telescope multi-pace foobar\"", "defType", "aqp"), "title:\"bubble (pace syn::lunar) telescope multi (pace syn::lunar) foobar\" " - + "title:\"bubble (pace syn::lunar) telescope multipace ? foobar\" " + + "title:\"bubble (pace syn::lunar) telescope ? multipace foobar\" " + "title:\"(syn::bubble pace telescope syn::acr::bpt) ? ? multi (pace syn::lunar) foobar\"~2 " - + "title:\"(syn::bubble pace telescope syn::acr::bpt) ? ? multipace ? foobar\"~3", + + "title:\"(syn::bubble pace telescope syn::acr::bpt) ? ? ? multipace foobar\"~3", BooleanQuery.class); assertQ(req("q", "title" + ":\"bubble pace telescope multi-pace foobar\""), "//*[@numFound='1']", "//doc/str[@name='id'][.='17']"); @@ -544,6 +544,7 @@ public void testNoSynChain() throws Exception { public void testSynonyms() throws Exception { + /* * Test multi-token translation, the chain is set to recognize * synonyms. 
So even if the query string is split into 3 tokens, diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/TestDateNormalizerFilter.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestDateNormalizerFilter.java index 55c601f46..718f439fd 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/TestDateNormalizerFilter.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/TestDateNormalizerFilter.java @@ -11,25 +11,15 @@ public class TestDateNormalizerFilter extends BaseTokenStreamTestCase { public void test() throws Exception { - ASCIIDuplicatingFilterFactory factory = new ASCIIDuplicatingFilterFactory(new HashMap()); + HashMap config = new HashMap(); + config.put("format", "yyyy-MM-dd|yy-MM-dd|yy-MM"); + DateNormalizerTokenFilterFactory factory = new DateNormalizerTokenFilterFactory(config); - TokenStream stream = factory.create(whitespaceMockTokenizer(new StringReader("čtyřista čtyřicet čtyři"))); - String[] expected = new String[] { "čtyřista", "ctyrista", "čtyřicet", "ctyricet", "čtyři", "ctyri" }; - int[] increments = new int[] {1, 0, 1, 0, 1, 0}; - String W = TypeAttribute.DEFAULT_TYPE; - String D = OnChangeDuplicatingFilter.DUPLICATE; - String[] types = new String[] { W, D, W, D, W, D}; - assertTokenStreamContents(stream, expected, increments); - - stream = factory.create(whitespaceMockTokenizer(new StringReader("čtyřista čtyřicet čtyři"))); - assertTokenStreamContents(stream, expected, types); - - - - // test it doesnt interfere - stream = factory.create(whitespaceMockTokenizer(new StringReader("Cyril Methood"))); + TokenStream stream; + stream = factory.create(whitespaceMockTokenizer(new StringReader("2014-12-00"))); assertTokenStreamContents(stream, - new String[] {"cyril", "methood"}, - new int[] {1, 1}); + new String[] {"2014-12-01"} + + ); } } diff --git a/contrib/antlrqueryparser/src/test/org/apache/lucene/queryparser/flexible/aqp/TestAqpSLGSimple.java 
b/contrib/antlrqueryparser/src/test/org/apache/lucene/queryparser/flexible/aqp/TestAqpSLGSimple.java index 8879a5f03..d77647ae2 100644 --- a/contrib/antlrqueryparser/src/test/org/apache/lucene/queryparser/flexible/aqp/TestAqpSLGSimple.java +++ b/contrib/antlrqueryparser/src/test/org/apache/lucene/queryparser/flexible/aqp/TestAqpSLGSimple.java @@ -85,22 +85,22 @@ public void testBooleanQuery() throws Exception { "+(+field:a +field:b)^0.8 -(+field:x +field:y)^0.2"); assertQueryMatch(qp, "(+(-(a b)))^0.8 AND -(x y)^0.2", "field", - "+((+field:a +field:b)^0.8) -((+field:x +field:y)^0.2)"); + "+(+field:a +field:b)^0.8 -(+field:x +field:y)^0.2"); assertQueryMatch(qp, "(+(-(a b)))^0.8 -(x y)", "field", - "+((+field:a +field:b)^0.8) -(+field:x +field:y)"); + "+(+field:a +field:b)^0.8 -(+field:x +field:y)"); // or does -(x y) have different semantics? ... -field:x -field:y // +((-(+field:a +field:b))^0.8) -field:x -field:y assertQueryMatch(qp, "+((+(-(a b)))^0.8)^0.7 OR -(x y)^0.2", "field", - "+((+field:a +field:b)^0.7) -((+field:x +field:y)^0.2)"); + "+(+field:a +field:b)^0.7 -(+field:x +field:y)^0.2"); assertQueryMatch(qp, "+title:(dog cat)", "field", "+title:dog +title:cat"); assertQueryMatch(qp, "title:(+dog -cat)", "field", "+title:dog -title:cat"); - + qp.setAllowLeadingWildcard(true); assertQueryMatch(qp, "\\*", "field", "field:*"); - + qp.setAllowLeadingWildcard(false); assertQueryMatch(qp, "term~", "field", "field:term~2"); assertQueryMatch(qp, "term~1", "field", "field:term~1"); assertQueryMatch(qp, "term~2", "field", "field:term~2"); @@ -139,7 +139,7 @@ public void testBooleanQuery() throws Exception { assertQueryMatch(qp, "-one -two", "field", "-field:one -field:two"); assertQueryMatch(qp, "x:one NOT y:two -three^0.5", "field", - "+(+x:one -y:two) -field:three^0.5"); + "+(+x:one -y:two) -(field:three)^0.5"); qp.setAllowSlowFuzzy(true); assertQueryMatch(qp, "one NOT two -three~0.2", "field", @@ -149,7 +149,7 @@ public void testBooleanQuery() throws Exception { 
"+field:one -field:two -field:three~0.2"); assertQueryMatch(qp, "one two^0.5 three~0.2", "field", - "+field:one +field:two^0.5 +field:three~0.2"); + "+field:one +(field:two)^0.5 +field:three~0.2"); qp.setAllowSlowFuzzy(false); assertQueryMatch(qp, "one NOT two -three~0.2", "field", @@ -159,7 +159,7 @@ public void testBooleanQuery() throws Exception { "+field:one -field:two -field:three~2"); assertQueryMatch(qp, "one two^0.5 three~0.2", "field", - "+field:one +field:two^0.5 +field:three~2"); + "+field:one +(field:two)^0.5 +field:three~2"); q = qp.parse("one (two three)^0.8", "field"); @@ -213,16 +213,16 @@ public void testBooleanQuery() throws Exception { "+field:this +field:that"); assertQueryMatch(qp, "this (+(that)^0.7)", "field", - "+field:this +field:that^0.7"); + "+field:this +(field:that)^0.7"); assertQueryMatch(qp, "this (+(that thus)^0.7)", "field", - "+field:this +((+field:that +field:thus)^0.7)"); + "+field:this +(+field:that +field:thus)^0.7"); assertQueryMatch(qp, "this (-(+(that thus))^0.7)", "field", - "+field:this -((+field:that +field:thus)^0.7)"); + "+field:this -(+field:that +field:thus)^0.7"); assertQueryMatch(qp, "this (+(-(+(-(that thus))^0.1))^0.3)", "field", - "+field:this +((+field:that +field:thus)^0.3)"); + "+field:this +(+field:that +field:thus)^0.3"); BooleanQuery.setMaxClauseCount(2); try { diff --git a/contrib/examples/adsabs/solr/collection1/conf/schema.xml b/contrib/examples/adsabs/solr/collection1/conf/schema.xml index 59d57fe4b..0785f3eae 100644 --- a/contrib/examples/adsabs/solr/collection1/conf/schema.xml +++ b/contrib/examples/adsabs/solr/collection1/conf/schema.xml @@ -414,7 +414,7 @@ - @@ -423,7 +423,7 @@ - + - + +