From 10f562925a56ad6483624c502e4089c60d2f1b7e Mon Sep 17 00:00:00 2001
From: Roman Chyla <roman.chyla@gmail.com>
Date: Tue, 27 Sep 2016 17:22:11 -0400
Subject: [PATCH] Solved the WDFF problem

---
 .../miscellaneous/AqpWordDelimiterFilter.java | 682 ++++++++++++++++++
 .../AqpWordDelimiterFilterFactory.java        | 204 ++++++
 .../analysis/DateNormalizerTokenFilter.java   |   5 +-
 .../TestAdsabsTypeFulltextParsing.java        |   5 +-
 .../analysis/TestDateNormalizerFilter.java    |  26 +-
 .../flexible/aqp/TestAqpSLGSimple.java        |  24 +-
 .../adsabs/solr/collection1/conf/schema.xml   |   8 +-
 7 files changed, 918 insertions(+), 36 deletions(-)
 create mode 100644 contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilter.java
 create mode 100644 contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilterFactory.java
diff --git a/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilter.java b/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilter.java
new file mode 100644
index 000000000..a98abdcd1
--- /dev/null
+++ b/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilter.java
@@ -0,0 +1,682 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ 
+package org.apache.lucene.analysis.miscellaneous;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.InPlaceMergeSorter;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * 
+ * LAMENT
+ * 
+ * Exact copy of a WDDFilter: OK, I'll say it; that filter was written by someone
+ *  with terrible sleep deprivation. It is sorting buffered tokens based on start
+ *  offsets (instead of end offsets; the previous version of the WDDF had that 
+ *  behaviour). But what is absolutely outrageous is that the author made the 
+ *  'sorter' protected, so I could have changed it without making this stupid clone.
+ *  Had he declared it to be of a generic type. Of course he didn't!
+ *  
+ *  The *only* change I needed to make is marked by string: SILLYPANTIE
+ * 
+ * Splits words into subwords and performs optional transformations on subword
+ * groups. Words are split into subwords with the following rules:
+ * <ul>
+ * <li>split on intra-word delimiters (by default, all non alpha-numeric
+ * characters): <code>"Wi-Fi"</code> &#8594; <code>"Wi", "Fi"</code></li>
+ * <li>split on case transitions: <code>"PowerShot"</code> &#8594;
+ * <code>"Power", "Shot"</code></li>
+ * <li>split on letter-number transitions: <code>"SD500"</code> &#8594;
+ * <code>"SD", "500"</code></li>
+ * <li>leading and trailing intra-word delimiters on each subword are ignored:
+ * <code>"//hello---there, 'dude'"</code> &#8594;
+ * <code>"hello", "there", "dude"</code></li>
+ * <li>trailing "'s" are removed for each subword: <code>"O'Neil's"</code>
+ * &#8594; <code>"O", "Neil"</code>
+ * <ul>
+ * <li>Note: this step isn't performed in a separate filter because of possible
+ * subword combinations.</li>
+ * </ul>
+ * </li>
+ * </ul>
+ * 
+ * The <b>combinations</b> parameter affects how subwords are combined:
+ * <ul>
+ * <li>combinations="0" causes no subword combinations: <code>"PowerShot"</code>
+ * &#8594; <code>0:"Power", 1:"Shot"</code> (0 and 1 are the token positions)</li>
+ * <li>combinations="1" means that in addition to the subwords, maximum runs of
+ * non-numeric subwords are catenated and produced at the same position of the
+ * last subword in the run:
+ * <ul>
+ * <li><code>"PowerShot"</code> &#8594;
+ * <code>0:"Power", 1:"Shot" 1:"PowerShot"</code></li>
+ * <li><code>"A's+B's&amp;C's"</code> &#8594; <code>0:"A", 1:"B", 2:"C", 2:"ABC"</code>
+ * </li>
+ * <li><code>"Super-Duper-XL500-42-AutoCoder!"</code> &#8594;
+ * <code>0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"</code>
+ * </li>
+ * </ul>
+ * </li>
+ * </ul>
+ * One use for {@link WordDelimiterFilter} is to help match words with different
+ * subword delimiters. For example, if the source text contained "wi-fi" one may
+ * want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match. One way of doing so
+ * is to specify combinations="1" in the analyzer used for indexing, and
+ * combinations="0" (the default) in the analyzer used for querying. Given that
+ * the current {@link StandardTokenizer} immediately removes many intra-word
+ * delimiters, it is recommended that this filter be used after a tokenizer that
+ * does not do this (such as {@link WhitespaceTokenizer}).
+ */
+public final class AqpWordDelimiterFilter extends TokenFilter {
+  
+  public static final int LOWER = 0x01;
+  public static final int UPPER = 0x02;
+  public static final int DIGIT = 0x04;
+  public static final int SUBWORD_DELIM = 0x08;
+
+  // combinations: for testing, not for setting bits
+  public static final int ALPHA = 0x03;
+  public static final int ALPHANUM = 0x07;
+
+  /**
+   * Causes parts of words to be generated:
+   * <p>
+   * "PowerShot" =&gt; "Power" "Shot"
+   */
+  public static final int GENERATE_WORD_PARTS = 1;
+
+  /**
+   * Causes number subwords to be generated:
+   * <p>
+   * "500-42" =&gt; "500" "42"
+   */
+  public static final int GENERATE_NUMBER_PARTS = 2;
+
+  /**
+   * Causes maximum runs of word parts to be catenated:
+   * <p>
+   * "wi-fi" =&gt; "wifi"
+   */
+  public static final int CATENATE_WORDS = 4;
+
+  /**
+   * Causes maximum runs of number parts to be catenated:
+   * <p>
+   * "500-42" =&gt; "50042"
+   */
+  public static final int CATENATE_NUMBERS = 8;
+
+  /**
+   * Causes all subword parts to be catenated:
+   * <p>
+   * "wi-fi-4000" =&gt; "wifi4000"
+   */
+  public static final int CATENATE_ALL = 16;
+
+  /**
+   * Causes original words to be preserved and added to the subword list (defaults to false)
+   * <p>
+   * "500-42" =&gt; "500" "42" "500-42"
+   */
+  public static final int PRESERVE_ORIGINAL = 32;
+
+  /**
+   * If not set, causes case changes to be ignored (subwords will only be generated
+   * given SUBWORD_DELIM tokens)
+   */
+  public static final int SPLIT_ON_CASE_CHANGE = 64;
+
+  /**
+   * If not set, causes numeric changes to be ignored (subwords will only be generated
+   * given SUBWORD_DELIM tokens).
+   */
+  public static final int SPLIT_ON_NUMERICS = 128;
+
+  /**
+   * Causes trailing "'s" to be removed for each subword
+   * <p>
+   * "O'Neil's" =&gt; "O", "Neil"
+   */
+  public static final int STEM_ENGLISH_POSSESSIVE = 256;
+  
+  /**
+   * If not null is the set of tokens to protect from being delimited
+   *
+   */
+  final CharArraySet protWords;
+
+  private final int flags;
+    
+  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+
+  // used for iterating word delimiter breaks
+  private final WordDelimiterIterator iterator;
+
+  // used for concatenating runs of similar typed subwords (word,number)
+  private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
+  // number of subwords last output by concat.
+  private int lastConcatCount = 0;
+
+  // used for catenate all
+  private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();
+
+  // used for accumulating position increment gaps
+  private int accumPosInc = 0;
+
+  private char savedBuffer[] = new char[1024];
+  private int savedStartOffset;
+  private int savedEndOffset;
+  private String savedType;
+  private boolean hasSavedState = false;
+  // if length by start + end offsets doesn't match the term text then assume
+  // this is a synonym and don't adjust the offsets.
+  private boolean hasIllegalOffsets = false;
+
+  // for a run of the same subword type within a word, have we output anything?
+  private boolean hasOutputToken = false;
+  // when preserve original is on, have we output any token following it?
+  // this token must have posInc=0!
+  private boolean hasOutputFollowingOriginal = false;
+
+  /**
+   * Creates a new AqpWordDelimiterFilter
+   *
+   * @param in TokenStream to be filtered
+   * @param charTypeTable table containing character types
+   * @param configurationFlags Flags configuring the filter
+   * @param protWords If not null is the set of tokens to protect from being delimited
+   */
+  public AqpWordDelimiterFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
+    super(in);
+    this.flags = configurationFlags;
+    this.protWords = protWords;
+    this.iterator = new WordDelimiterIterator(
+        charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE));
+  }
+
+  /**
+   * Creates a new AqpWordDelimiterFilter using {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE}
+   * as its charTypeTable
+   *
+   * @param in TokenStream to be filtered
+   * @param configurationFlags Flags configuring the filter
+   * @param protWords If not null is the set of tokens to protect from being delimited
+   */
+  public AqpWordDelimiterFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
+    this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    while (true) {
+      if (!hasSavedState) {
+        // process a new input word
+        if (!input.incrementToken()) {
+          return false;
+        }
+
+        int termLength = termAttribute.length();
+        char[] termBuffer = termAttribute.buffer();
+        
+        accumPosInc += posIncAttribute.getPositionIncrement();
+
+        iterator.setText(termBuffer, termLength);
+        iterator.next();
+
+        // word of no delimiters, or protected word: just return it
+        if ((iterator.current == 0 && iterator.end == termLength) ||
+            (protWords != null && protWords.contains(termBuffer, 0, termLength))) {
+          posIncAttribute.setPositionIncrement(accumPosInc);
+          accumPosInc = 0;
+          first = false;
+          return true;
+        }
+        
+        // word of simply delimiters
+        if (iterator.end == WordDelimiterIterator.DONE && !has(PRESERVE_ORIGINAL)) {
+          // if the posInc is 1, simply ignore it in the accumulation
+          // TODO: proper hole adjustment (FilteringTokenFilter-like) instead of this previous logic!
+          if (posIncAttribute.getPositionIncrement() == 1 && !first) {
+            accumPosInc--;
+          }
+          continue;
+        }
+
+        saveState();
+
+        hasOutputToken = false;
+        hasOutputFollowingOriginal = !has(PRESERVE_ORIGINAL);
+        lastConcatCount = 0;
+        
+        if (has(PRESERVE_ORIGINAL)) {
+          posIncAttribute.setPositionIncrement(accumPosInc);
+          accumPosInc = 0;
+          first = false;
+          return true;
+        }
+      }
+      
+      // at the end of the string, output any concatenations
+      if (iterator.end == WordDelimiterIterator.DONE) {
+        if (!concat.isEmpty()) {
+          if (flushConcatenation(concat)) {
+            buffer();
+            continue;
+          }
+        }
+        
+        if (!concatAll.isEmpty()) {
+          // only if we haven't output this same combo above!
+          if (concatAll.subwordCount > lastConcatCount) {
+            concatAll.writeAndClear();
+            buffer();
+            continue;
+          }
+          concatAll.clear();
+        }
+        
+        if (bufferedPos < bufferedLen) {
+          if (bufferedPos == 0) {
+            // SILLYPANTIE: sorting by start offset is deliberately disabled -- was: sorter.sort(0, bufferedLen);
+          }
+          clearAttributes();
+          restoreState(buffered[bufferedPos++]);
+          if (first && posIncAttribute.getPositionIncrement() == 0) {
+            // can easily happen with strange combinations (e.g. not outputting numbers, but concat-all)
+            posIncAttribute.setPositionIncrement(1);
+          }
+          first = false;
+          return true;
+        }
+        
+        // no saved concatenations, on to the next input word
+        bufferedPos = bufferedLen = 0;
+        hasSavedState = false;
+        continue;
+      }
+      
+      // word surrounded by delimiters: always output
+      if (iterator.isSingleWord()) {
+        generatePart(true);
+        iterator.next();
+        first = false;
+        return true;
+      }
+      
+      int wordType = iterator.type();
+      
+      // do we already have queued up incompatible concatenations?
+      if (!concat.isEmpty() && (concat.type & wordType) == 0) {
+        if (flushConcatenation(concat)) {
+          hasOutputToken = false;
+          buffer();
+          continue;
+        }
+        hasOutputToken = false;
+      }
+      
+      // add subwords depending upon options
+      if (shouldConcatenate(wordType)) {
+        if (concat.isEmpty()) {
+          concat.type = wordType;
+        }
+        concatenate(concat);
+      }
+      
+      // add all subwords (catenateAll)
+      if (has(CATENATE_ALL)) {
+        concatenate(concatAll);
+      }
+      
+      // if we should output the word or number part
+      if (shouldGenerateParts(wordType)) {
+        generatePart(false);
+        buffer();
+      }
+        
+      iterator.next();
+    }
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    hasSavedState = false;
+    concat.clear();
+    concatAll.clear();
+    accumPosInc = bufferedPos = bufferedLen = 0;
+    first = true;
+  }
+
+  // ================================================= Helper Methods ================================================
+
+  
+  private AttributeSource.State buffered[] = new AttributeSource.State[8];
+  private int startOff[] = new int[8];
+  private int posInc[] = new int[8];
+  private int bufferedLen = 0;
+  private int bufferedPos = 0;
+  private boolean first;
+  
+  private class OffsetSorter extends InPlaceMergeSorter {
+    @Override
+    protected int compare(int i, int j) {
+      int cmp = Integer.compare(startOff[i], startOff[j]);
+      if (cmp == 0) {
+        cmp = Integer.compare(posInc[j], posInc[i]);
+      }
+      return cmp;
+    }
+
+    @Override
+    protected void swap(int i, int j) {
+      AttributeSource.State tmp = buffered[i];
+      buffered[i] = buffered[j];
+      buffered[j] = tmp;
+      
+      int tmp2 = startOff[i];
+      startOff[i] = startOff[j];
+      startOff[j] = tmp2;
+      
+      tmp2 = posInc[i];
+      posInc[i] = posInc[j];
+      posInc[j] = tmp2;
+    }
+  }
+  
+  final OffsetSorter sorter = new OffsetSorter();
+  
+  private void buffer() {
+    if (bufferedLen == buffered.length) {
+      int newSize = ArrayUtil.oversize(bufferedLen+1, 8);
+      buffered = Arrays.copyOf(buffered, newSize);
+      startOff = Arrays.copyOf(startOff, newSize);
+      posInc = Arrays.copyOf(posInc, newSize);
+    }
+    startOff[bufferedLen] = offsetAttribute.startOffset();
+    posInc[bufferedLen] = posIncAttribute.getPositionIncrement();
+    buffered[bufferedLen] = captureState();
+    bufferedLen++;
+  }
+  
+  /**
+   * Saves the existing attribute states
+   */
+  private void saveState() {
+    // otherwise, we have delimiters, save state
+    savedStartOffset = offsetAttribute.startOffset();
+    savedEndOffset = offsetAttribute.endOffset();
+    // if length by start + end offsets doesn't match the term text then assume this is a synonym and don't adjust the offsets.
+    hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.length());
+    savedType = typeAttribute.type();
+
+    if (savedBuffer.length < termAttribute.length()) {
+      savedBuffer = new char[ArrayUtil.oversize(termAttribute.length(), Character.BYTES)];
+    }
+
+    System.arraycopy(termAttribute.buffer(), 0, savedBuffer, 0, termAttribute.length());
+    iterator.text = savedBuffer;
+
+    hasSavedState = true;
+  }
+
+  /**
+   * Flushes the given WordDelimiterConcatenation by either writing its concat and then clearing, or just clearing.
+   *
+   * @param concatenation WordDelimiterConcatenation that will be flushed
+   * @return {@code true} if the concatenation was written before it was cleared, {@code false} otherwise
+   */
+  private boolean flushConcatenation(WordDelimiterConcatenation concatenation) {
+    lastConcatCount = concatenation.subwordCount;
+    if (concatenation.subwordCount != 1 || !shouldGenerateParts(concatenation.type)) {
+      concatenation.writeAndClear();
+      return true;
+    }
+    concatenation.clear();
+    return false;
+  }
+
+  /**
+   * Determines whether to concatenate a word or number if the current word is the given type
+   *
+   * @param wordType Type of the current word used to determine if it should be concatenated
+   * @return {@code true} if concatenation should occur, {@code false} otherwise
+   */
+  private boolean shouldConcatenate(int wordType) {
+    return (has(CATENATE_WORDS) && isAlpha(wordType)) || (has(CATENATE_NUMBERS) && isDigit(wordType));
+  }
+
+  /**
+   * Determines whether a word/number part should be generated for a word of the given type
+   *
+   * @param wordType Type of the word used to determine if a word/number part should be generated
+   * @return {@code true} if a word/number part should be generated, {@code false} otherwise
+   */
+  private boolean shouldGenerateParts(int wordType) {
+    return (has(GENERATE_WORD_PARTS) && isAlpha(wordType)) || (has(GENERATE_NUMBER_PARTS) && isDigit(wordType));
+  }
+
+  /**
+   * Concatenates the saved buffer to the given WordDelimiterConcatenation
+   *
+   * @param concatenation WordDelimiterConcatenation to concatenate the buffer to
+   */
+  private void concatenate(WordDelimiterConcatenation concatenation) {
+    if (concatenation.isEmpty()) {
+      concatenation.startOffset = savedStartOffset + iterator.current;
+    }
+    concatenation.append(savedBuffer, iterator.current, iterator.end - iterator.current);
+    concatenation.endOffset = savedStartOffset + iterator.end;
+  }
+
+  /**
+   * Generates a word/number part, updating the appropriate attributes
+   *
+   * @param isSingleWord {@code true} if the generation is occurring from a single word, {@code false} otherwise
+   */
+  private void generatePart(boolean isSingleWord) {
+    clearAttributes();
+    termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
+
+    int startOffset = savedStartOffset + iterator.current;
+    int endOffset = savedStartOffset + iterator.end;
+    
+    if (hasIllegalOffsets) {
+      // historically this filter did this regardless for 'isSingleWord', 
+      // but we must do a sanity check:
+      if (isSingleWord && startOffset <= savedEndOffset) {
+        offsetAttribute.setOffset(startOffset, savedEndOffset);
+      } else {
+        offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
+      }
+    } else {
+      offsetAttribute.setOffset(startOffset, endOffset);
+    }
+    posIncAttribute.setPositionIncrement(position(false));
+    typeAttribute.setType(savedType);
+  }
+
+  /**
+   * Get the position increment gap for a subword or concatenation
+   *
+   * @param inject true if this token wants to be injected
+   * @return position increment gap
+   */
+  private int position(boolean inject) {
+    int posInc = accumPosInc;
+
+    if (hasOutputToken) {
+      accumPosInc = 0;
+      return inject ? 0 : Math.max(1, posInc);
+    }
+
+    hasOutputToken = true;
+    
+    if (!hasOutputFollowingOriginal) {
+      // the first token following the original is 0 regardless
+      hasOutputFollowingOriginal = true;
+      return 0;
+    }
+    // clear the accumulated position increment
+    accumPosInc = 0;
+    return Math.max(1, posInc);
+  }
+
+  /**
+   * Checks if the given word type includes {@link #ALPHA}
+   *
+   * @param type Word type to check
+   * @return {@code true} if the type contains ALPHA, {@code false} otherwise
+   */
+  static boolean isAlpha(int type) {
+    return (type & ALPHA) != 0;
+  }
+
+  /**
+   * Checks if the given word type includes {@link #DIGIT}
+   *
+   * @param type Word type to check
+   * @return {@code true} if the type contains DIGIT, {@code false} otherwise
+   */
+  static boolean isDigit(int type) {
+    return (type & DIGIT) != 0;
+  }
+
+  /**
+   * Checks if the given word type includes {@link #SUBWORD_DELIM}
+   *
+   * @param type Word type to check
+   * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
+   */
+  static boolean isSubwordDelim(int type) {
+    return (type & SUBWORD_DELIM) != 0;
+  }
+
+  /**
+   * Checks if the given word type includes {@link #UPPER}
+   *
+   * @param type Word type to check
+   * @return {@code true} if the type contains UPPER, {@code false} otherwise
+   */
+  static boolean isUpper(int type) {
+    return (type & UPPER) != 0;
+  }
+
+  /**
+   * Determines whether the given flag is set
+   *
+   * @param flag Flag to see if set
+   * @return {@code true} if flag is set
+   */
+  private boolean has(int flag) {
+    return (flags & flag) != 0;
+  }
+
+  // ================================================= Inner Classes =================================================
+
+  /**
+   * A WDF concatenated 'run'
+   */
+  final class WordDelimiterConcatenation {
+    final StringBuilder buffer = new StringBuilder();
+    int startOffset;
+    int endOffset;
+    int type;
+    int subwordCount;
+
+    /**
+     * Appends {@code length} characters of the given text, starting at {@code offset}, to the concatenation
+     *
+     * @param text Text to append
+     * @param offset Index in {@code text} of the first character to append
+     * @param length Number of characters to append
+     */
+    void append(char text[], int offset, int length) {
+      buffer.append(text, offset, length);
+      subwordCount++;
+    }
+
+    /**
+     * Writes the concatenation to the attributes
+     */
+    void write() {
+      clearAttributes();
+      if (termAttribute.length() < buffer.length()) {
+        termAttribute.resizeBuffer(buffer.length());
+      }
+      char termbuffer[] = termAttribute.buffer();
+      
+      buffer.getChars(0, buffer.length(), termbuffer, 0);
+      termAttribute.setLength(buffer.length());
+        
+      if (hasIllegalOffsets) {
+        offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
+      }
+      else {
+        offsetAttribute.setOffset(startOffset, endOffset);
+      }
+      posIncAttribute.setPositionIncrement(position(true));
+      typeAttribute.setType(savedType);
+      accumPosInc = 0;
+    }
+
+    /**
+     * Determines if the concatenation is empty
+     *
+     * @return {@code true} if the concatenation is empty, {@code false} otherwise
+     */
+    boolean isEmpty() {
+      return buffer.length() == 0;
+    }
+
+    /**
+     * Clears the concatenation and resets its state
+     */
+    void clear() {
+      buffer.setLength(0);
+      startOffset = endOffset = type = subwordCount = 0;
+    }
+
+    /**
+     * Convenience method for the common scenario of having to write the concatenation and then clearing its state
+     */
+    void writeAndClear() {
+      write();
+      clear();
+    }
+  }
+  // questions:
+  // negative numbers?  -42 indexed as just 42?
+  // dollar sign?  $42
+  // percent sign?  33%
+  // downsides:  if source text is "powershot" then a query of "PowerShot" won't match!
+}
diff --git a/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilterFactory.java b/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilterFactory.java
new file mode 100644
index 000000000..4d3abc043
--- /dev/null
+++ b/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AqpWordDelimiterFilterFactory.java
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.util.Version;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.io.IOException;
+
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
+
+/**
+ * 
+ * NOTE: I hate duplicating code. But the 'smart' Lucene devs sometimes leave me no 
+ * other option. Please look at the comment in AqpWordDelimiterFilter for an explanation.
+ * 
+ * Factory for {@link AqpWordDelimiterFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_wd" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.WordDelimiterFilterFactory" protected="protectedword.txt"
+ *             preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1"
+ *             catenateWords="0" catenateNumbers="0" catenateAll="0"
+ *             generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"
+ *             types="wdfftypes.txt" /&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class AqpWordDelimiterFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+  public static final String PROTECTED_TOKENS = "protected";
+  public static final String TYPES = "types";
+
+  private final String wordFiles;
+  private final String types;
+  private final int flags;
+  byte[] typeTable = null;
+  private CharArraySet protectedWords = null;
+  
+  /** Creates a new AqpWordDelimiterFilterFactory */
+  public AqpWordDelimiterFilterFactory(Map<String, String> args) {
+    super(args);
+    int flags = 0;
+    if (getInt(args, "generateWordParts", 1) != 0) {
+      flags |= GENERATE_WORD_PARTS;
+    }
+    if (getInt(args, "generateNumberParts", 1) != 0) {
+      flags |= GENERATE_NUMBER_PARTS;
+    }
+    if (getInt(args, "catenateWords", 0) != 0) {
+      flags |= CATENATE_WORDS;
+    }
+    if (getInt(args, "catenateNumbers", 0) != 0) {
+      flags |= CATENATE_NUMBERS;
+    }
+    if (getInt(args, "catenateAll", 0) != 0) {
+      flags |= CATENATE_ALL;
+    }
+    if (getInt(args, "splitOnCaseChange", 1) != 0) {
+      flags |= SPLIT_ON_CASE_CHANGE;
+    }
+    if (getInt(args, "splitOnNumerics", 1) != 0) {
+      flags |= SPLIT_ON_NUMERICS;
+    }
+    if (getInt(args, "preserveOriginal", 0) != 0) {
+      flags |= PRESERVE_ORIGINAL;
+    }
+    if (getInt(args, "stemEnglishPossessive", 1) != 0) {
+      flags |= STEM_ENGLISH_POSSESSIVE;
+    }
+    wordFiles = get(args, PROTECTED_TOKENS);
+    types = get(args, TYPES);
+    this.flags = flags;
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+  
+  @Override
+  public void inform(ResourceLoader loader) throws IOException {
+    if (wordFiles != null) {  
+      protectedWords = getWordSet(loader, wordFiles, false);
+    }
+    if (types != null) {
+      List<String> files = splitFileNames( types );
+      List<String> wlist = new ArrayList<>();
+      for( String file : files ){
+        List<String> lines = getLines(loader, file.trim());
+        wlist.addAll( lines );
+      }
+      typeTable = parseTypes(wlist);
+    }
+  }
+
+  @Override
+  public TokenFilter create(TokenStream input) {
+    return new AqpWordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
+                                   flags, protectedWords);
+  }
+  
+  // a type-mapping rule has the form "source => type": group 1 is the source char, group 2 the type name
+  private static final Pattern typePattern = Pattern.compile( "(.*)\\s*=>\\s*(.*)\\s*$" );
+  
+  // parses a list of MappingCharFilter style rules into a custom byte[] type table; with no rules the default types are returned
+  private byte[] parseTypes(List<String> rules) {
+    SortedMap<Character,Byte> typeMap = new TreeMap<>();
+    for( String rule : rules ){
+      Matcher m = typePattern.matcher(rule);
+      if( !m.find() )
+        throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]");
+      String lhs = parseString(m.group(1).trim());
+      Byte rhs = parseType(m.group(2).trim());
+      if (lhs.length() != 1)
+        throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
+      if (rhs == null)
+        throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
+      typeMap.put(lhs.charAt(0), rhs);
+    }
+    
+    // at least as big as DEFAULT_WORD_DELIM_TABLE for performance; lastKey() throws on an empty map, hence the isEmpty() guard
+    byte types[] = new byte[Math.max(typeMap.isEmpty() ? 0 : typeMap.lastKey()+1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
+    for (int i = 0; i < types.length; i++)
+      types[i] = WordDelimiterIterator.getType(i); // start from the default type of every char
+    for (Map.Entry<Character,Byte> mapping : typeMap.entrySet())
+      types[mapping.getKey()] = mapping.getValue(); // then apply the custom overrides
+    return types;
+  }
+  
+  private Byte parseType(String s) {
+    if (s.equals("LOWER"))
+      return LOWER;
+    else if (s.equals("UPPER"))
+      return UPPER;
+    else if (s.equals("ALPHA"))
+      return ALPHA;
+    else if (s.equals("DIGIT"))
+      return DIGIT;
+    else if (s.equals("ALPHANUM"))
+      return ALPHANUM;
+    else if (s.equals("SUBWORD_DELIM"))
+      return SUBWORD_DELIM;
+    else
+      return null;
+  }
+  
+  char[] out = new char[256];
+  /** Unescapes a rule operand: backslash followed by n, t, r, b, f, backslash or u plus four hex digits; other escaped chars are kept as-is. */
+  private String parseString(String s){
+    int readPos = 0, writePos = 0;
+    int len = s.length();
+    char[] buf = new char[len]; // per-call buffer: every char read emits at most one char, so input length always suffices
+    while( readPos < len ){
+      char c = s.charAt( readPos++ );
+      if( c == '\\' ){
+        if( readPos >= len )
+          throw new IllegalArgumentException("Invalid escaped char in [" + s + "]");
+        c = s.charAt( readPos++ );
+        switch( c ) {
+          case '\\' : c = '\\'; break;
+          case 'n' : c = '\n'; break;
+          case 't' : c = '\t'; break;
+          case 'r' : c = '\r'; break;
+          case 'b' : c = '\b'; break;
+          case 'f' : c = '\f'; break;
+          case 'u' :
+            if( readPos + 3 >= len ) // fewer than four hex digits remain
+              throw new IllegalArgumentException("Invalid escaped char in [" + s + "]");
+            c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 );
+            readPos += 4;
+            break;
+        }
+      }
+      buf[writePos++] = c;
+    }
+    return new String( buf, 0, writePos );
+  }
+}
diff --git a/contrib/adsabs/src/java/org/apache/solr/analysis/DateNormalizerTokenFilter.java b/contrib/adsabs/src/java/org/apache/solr/analysis/DateNormalizerTokenFilter.java
index caeb8f831..5f23a279b 100644
--- a/contrib/adsabs/src/java/org/apache/solr/analysis/DateNormalizerTokenFilter.java
+++ b/contrib/adsabs/src/java/org/apache/solr/analysis/DateNormalizerTokenFilter.java
@@ -9,6 +9,8 @@
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.DateTools;
+import org.apache.lucene.document.DateTools.Resolution;
 import org.apache.solr.util.DateMathParser;
 
 public final class DateNormalizerTokenFilter extends TokenFilter {
@@ -17,6 +19,7 @@ public final class DateNormalizerTokenFilter extends TokenFilter {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private DateMathParser dmp;
   private String offset;
+  private SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss:SSS", Locale.ROOT);
   
   public DateNormalizerTokenFilter(TokenStream input, String incomingFormat, String offset) {
     super(input);
@@ -61,7 +64,7 @@ private CharSequence normalize(String string) {
           //else {
             //date = dmp.parseMath("+5MINUTES"); // 00-00 dates are 1 minute after midnight
           //}
-          return f.format(date);
+          return sdf.format(date);
       } catch (ParseException e) {
         //pass
       }
diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java
index fb4fd3e9b..d33c1aa30 100644
--- a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java
+++ b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java
@@ -237,9 +237,9 @@ public void testMultiTokens() throws Exception {
     //dumpDoc(null, "id", "title");
     assertQueryEquals(req("q", "title:\"bubble pace telescope multi-pace foobar\"", "defType", "aqp"), 
         "title:\"bubble (pace syn::lunar) telescope multi (pace syn::lunar) foobar\" "
-        + "title:\"bubble (pace syn::lunar) telescope multipace ? foobar\" "
+        + "title:\"bubble (pace syn::lunar) telescope ? multipace foobar\" "
         + "title:\"(syn::bubble pace telescope syn::acr::bpt) ? ? multi (pace syn::lunar) foobar\"~2 "
-        + "title:\"(syn::bubble pace telescope syn::acr::bpt) ? ? multipace ? foobar\"~3",
+        + "title:\"(syn::bubble pace telescope syn::acr::bpt) ? ? ? multipace foobar\"~3",
         BooleanQuery.class);
     assertQ(req("q", "title" + ":\"bubble pace telescope multi-pace foobar\""), "//*[@numFound='1']",
         "//doc/str[@name='id'][.='17']");
@@ -544,6 +544,7 @@ public void testNoSynChain() throws Exception {
   
   public void testSynonyms() throws Exception {
     
+    
     /*
      * Test multi-token translation, the chain is set to recognize
      * synonyms. So even if the query string is split into 3 tokens,
diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/TestDateNormalizerFilter.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestDateNormalizerFilter.java
index 55c601f46..718f439fd 100644
--- a/contrib/adsabs/src/test/org/apache/solr/analysis/TestDateNormalizerFilter.java
+++ b/contrib/adsabs/src/test/org/apache/solr/analysis/TestDateNormalizerFilter.java
@@ -11,25 +11,15 @@
 public class TestDateNormalizerFilter extends BaseTokenStreamTestCase {
 	
 	public void test() throws Exception {
-		ASCIIDuplicatingFilterFactory factory = new ASCIIDuplicatingFilterFactory(new HashMap<String,String>());
+	  HashMap<String, String> config = new HashMap<String, String>();
+	  config.put("format", "yyyy-MM-dd|yy-MM-dd|yy-MM");
+		DateNormalizerTokenFilterFactory factory = new DateNormalizerTokenFilterFactory(config);
 		
-		TokenStream stream = factory.create(whitespaceMockTokenizer(new StringReader("čtyřista čtyřicet čtyři")));
-		String[] expected = new String[] { "čtyřista", "ctyrista", "čtyřicet", "ctyricet", "čtyři", "ctyri" };
-		int[] increments = new int[] {1, 0, 1, 0, 1, 0};
-		String W = TypeAttribute.DEFAULT_TYPE;
-		String D = OnChangeDuplicatingFilter.DUPLICATE;
-		String[] types = new String[] { W, D, W, D, W, D};
-		assertTokenStreamContents(stream, expected, increments);
-		
-		stream = factory.create(whitespaceMockTokenizer(new StringReader("čtyřista čtyřicet čtyři")));
-		assertTokenStreamContents(stream, expected, types);
-		
-		
-		
-		// test it doesnt interfere
-		stream = factory.create(whitespaceMockTokenizer(new StringReader("Cyril Methood")));
+		TokenStream stream;
+		stream = factory.create(whitespaceMockTokenizer(new StringReader("2014-12-00")));
     assertTokenStreamContents(stream, 
-        new String[] {"cyril", "methood"}, 
-        new int[] {1, 1});
+        new String[] {"2014-12-01"} 
+        
+    );
 	}
 }
diff --git a/contrib/antlrqueryparser/src/test/org/apache/lucene/queryparser/flexible/aqp/TestAqpSLGSimple.java b/contrib/antlrqueryparser/src/test/org/apache/lucene/queryparser/flexible/aqp/TestAqpSLGSimple.java
index 8879a5f03..d77647ae2 100644
--- a/contrib/antlrqueryparser/src/test/org/apache/lucene/queryparser/flexible/aqp/TestAqpSLGSimple.java
+++ b/contrib/antlrqueryparser/src/test/org/apache/lucene/queryparser/flexible/aqp/TestAqpSLGSimple.java
@@ -85,22 +85,22 @@ public void testBooleanQuery() throws Exception {
         "+(+field:a +field:b)^0.8 -(+field:x +field:y)^0.2");
 
     assertQueryMatch(qp, "(+(-(a b)))^0.8 AND -(x y)^0.2", "field",
-        "+((+field:a +field:b)^0.8) -((+field:x +field:y)^0.2)");
+        "+(+field:a +field:b)^0.8 -(+field:x +field:y)^0.2");
 
     assertQueryMatch(qp, "(+(-(a b)))^0.8 -(x y)", "field",
-        "+((+field:a +field:b)^0.8) -(+field:x +field:y)");
+        "+(+field:a +field:b)^0.8 -(+field:x +field:y)");
     // or does -(x y) have different semantics? ... -field:x -field:y
     // +((-(+field:a +field:b))^0.8) -field:x -field:y
 
     assertQueryMatch(qp, "+((+(-(a b)))^0.8)^0.7 OR -(x y)^0.2", "field",
-        "+((+field:a +field:b)^0.7) -((+field:x +field:y)^0.2)");
+        "+(+field:a +field:b)^0.7 -(+field:x +field:y)^0.2");
 
     assertQueryMatch(qp, "+title:(dog cat)", "field", "+title:dog +title:cat");
 
     assertQueryMatch(qp, "title:(+dog -cat)", "field", "+title:dog -title:cat");
-
+    qp.setAllowLeadingWildcard(true);
     assertQueryMatch(qp, "\\*", "field", "field:*");
-
+    qp.setAllowLeadingWildcard(false);
     assertQueryMatch(qp, "term~", "field", "field:term~2");
     assertQueryMatch(qp, "term~1", "field", "field:term~1");
     assertQueryMatch(qp, "term~2", "field", "field:term~2");
@@ -139,7 +139,7 @@ public void testBooleanQuery() throws Exception {
     assertQueryMatch(qp, "-one -two", "field", "-field:one -field:two");
 
     assertQueryMatch(qp, "x:one NOT y:two -three^0.5", "field",
-        "+(+x:one -y:two) -field:three^0.5");
+        "+(+x:one -y:two) -(field:three)^0.5");
 
     qp.setAllowSlowFuzzy(true);
     assertQueryMatch(qp, "one NOT two -three~0.2", "field",
@@ -149,7 +149,7 @@ public void testBooleanQuery() throws Exception {
         "+field:one -field:two -field:three~0.2");
 
     assertQueryMatch(qp, "one two^0.5 three~0.2", "field",
-        "+field:one +field:two^0.5 +field:three~0.2");
+        "+field:one +(field:two)^0.5 +field:three~0.2");
     qp.setAllowSlowFuzzy(false);
 
     assertQueryMatch(qp, "one NOT two -three~0.2", "field",
@@ -159,7 +159,7 @@ public void testBooleanQuery() throws Exception {
         "+field:one -field:two -field:three~2");
 
     assertQueryMatch(qp, "one two^0.5 three~0.2", "field",
-        "+field:one +field:two^0.5 +field:three~2");
+        "+field:one +(field:two)^0.5 +field:three~2");
 
     q = qp.parse("one (two three)^0.8", "field");
     
@@ -213,16 +213,16 @@ public void testBooleanQuery() throws Exception {
         "+field:this +field:that");
 
     assertQueryMatch(qp, "this (+(that)^0.7)", "field",
-        "+field:this +field:that^0.7");
+        "+field:this +(field:that)^0.7");
 
     assertQueryMatch(qp, "this (+(that thus)^0.7)", "field",
-        "+field:this +((+field:that +field:thus)^0.7)");
+        "+field:this +(+field:that +field:thus)^0.7");
 
     assertQueryMatch(qp, "this (-(+(that thus))^0.7)", "field",
-        "+field:this -((+field:that +field:thus)^0.7)");
+        "+field:this -(+field:that +field:thus)^0.7");
 
     assertQueryMatch(qp, "this (+(-(+(-(that thus))^0.1))^0.3)", "field",
-        "+field:this +((+field:that +field:thus)^0.3)");
+        "+field:this +(+field:that +field:thus)^0.3");
 
     BooleanQuery.setMaxClauseCount(2);
     try {
diff --git a/contrib/examples/adsabs/solr/collection1/conf/schema.xml b/contrib/examples/adsabs/solr/collection1/conf/schema.xml
index 59d57fe4b..0785f3eae 100644
--- a/contrib/examples/adsabs/solr/collection1/conf/schema.xml
+++ b/contrib/examples/adsabs/solr/collection1/conf/schema.xml
@@ -414,7 +414,7 @@
 			    <filter class="org.apache.lucene.analysis.miscellaneous.AdsSpecialCharactersFilterFactory" />
 
 				<!-- split all-sky into [all, sky, allsky] -->
-				<filter class="solr.WordDelimiterFilterFactory"
+				<filter class="org.apache.lucene.analysis.miscellaneous.AqpWordDelimiterFilterFactory"
 					generateWordParts="1" generateNumberParts="1" catenateWords="0"
 					catenateNumbers="0" catenateAll="1" splitOnCaseChange="0"
 					splitOnNumerics="0" stemEnglishPossessive="1" preserveOriginal="0" />
@@ -423,7 +423,7 @@
 				<filter
 					class="org.apache.lucene.analysis.core.SelectiveLowerCaseFilterFactory" />
 
-
+                
 				<!-- find synonyms, first multi-tokens -->
 				<filter class="org.apache.lucene.analysis.synonym.NewSynonymFilterFactory"
 					synonyms="ads_text_multi.synonyms" ignoreCase="false" expand="true"
@@ -494,12 +494,14 @@
                 <filter class="org.apache.lucene.analysis.miscellaneous.AdsSpecialCharactersFilterFactory" />
 
 
-				<filter class="solr.WordDelimiterFilterFactory"
+				<filter class="org.apache.lucene.analysis.miscellaneous.AqpWordDelimiterFilterFactory"
 					generateWordParts="1" generateNumberParts="1" catenateWords="1"
 					catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"
 					splitOnNumerics="0" stemEnglishPossessive="1" preserveOriginal="0"
 					 />
                 
+                <!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="split"/> -->
+                
 				<!-- lowercase words, but keep ACRONYMS case ie. MOND => MOND Mond =>
 					mond Hubble Space Telescope => hubble space telescope -->
 				<filter