forked from adsabs/montysolr
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
25 changed files
with
1,073 additions
and
453 deletions.
There are no files selected for viewing
288 changes: 288 additions & 0 deletions
288
.../adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AdsSpecialCharactersFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,288 @@ | ||
package org.apache.lucene.analysis.miscellaneous; | ||
|
||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
import java.io.IOException; | ||
|
||
import org.apache.lucene.analysis.TokenFilter; | ||
import org.apache.lucene.analysis.TokenStream; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.apache.lucene.util.ArrayUtil; | ||
import org.apache.lucene.util.RamUsageEstimator; | ||
|
||
/** | ||
* | ||
* This is a modified ASCIIFoldingFilter | ||
* | ||
* It translates greek math symbols | ||
* | ||
* For example, 'γ' will be replaced by 'gamma'. | ||
*/ | ||
public final class AdsSpecialCharactersFilter extends TokenFilter { | ||
public AdsSpecialCharactersFilter(TokenStream input) | ||
{ | ||
super(input); | ||
} | ||
|
||
private char[] output = new char[512]; | ||
private int outputPos; | ||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | ||
|
||
@Override | ||
public boolean incrementToken() throws IOException { | ||
if (input.incrementToken()) { | ||
final char[] buffer = termAtt.buffer(); | ||
final int length = termAtt.length(); | ||
|
||
// If no characters actually require rewriting then we | ||
// just return token as-is: | ||
for(int i = 0 ; i < length ; ++i) { | ||
final char c = buffer[i]; | ||
if (c >= '\u0080') | ||
{ | ||
foldToASCII(buffer, length); | ||
termAtt.copyBuffer(output, 0, outputPos); | ||
break; | ||
} | ||
} | ||
return true; | ||
} else { | ||
return false; | ||
} | ||
} | ||
|
||
/** | ||
* Converts characters above ASCII to their ASCII equivalents. For example, | ||
* accents are removed from accented characters. | ||
* @param input The string to fold | ||
* @param length The number of characters in the input string | ||
*/ | ||
public void foldToASCII(char[] input, int length) | ||
{ | ||
// Worst-case length required: | ||
final int maxSizeNeeded = 4 * length; | ||
if (output.length < maxSizeNeeded) { | ||
output = new char[ArrayUtil.oversize(maxSizeNeeded, RamUsageEstimator.NUM_BYTES_CHAR)]; | ||
} | ||
|
||
outputPos = foldToASCII(input, 0, output, 0, length); | ||
} | ||
|
||
/** | ||
* Converts characters above ASCII to their ASCII equivalents. For example, | ||
* accents are removed from accented characters. | ||
* @param input The characters to fold | ||
* @param inputPos Index of the first character to fold | ||
* @param output The result of the folding. Should be of size >= {@code length * 4}. | ||
* @param outputPos Index of output where to put the result of the folding | ||
* @param length The number of characters to fold | ||
* @return length of output | ||
* @lucene.internal | ||
*/ | ||
public static final int foldToASCII(char input[], int inputPos, char output[], int outputPos, int length) | ||
{ | ||
final int end = inputPos + length; | ||
for (int pos = inputPos; pos < end ; ++pos) { | ||
final char c = input[pos]; | ||
|
||
// Quick test: if it's not in range then just keep current character | ||
if (c < '\u0080') { | ||
output[outputPos++] = c; | ||
} else { | ||
switch (c) { | ||
case '\u0391': | ||
case '\u03B1': | ||
output[outputPos++] = 'a'; | ||
output[outputPos++] = 'l'; | ||
output[outputPos++] = 'p'; | ||
output[outputPos++] = 'h'; | ||
output[outputPos++] = 'a'; | ||
break; | ||
case '\u0392': | ||
case '\u03B2': | ||
output[outputPos++] = 'b'; | ||
output[outputPos++] = 'e'; | ||
output[outputPos++] = 't'; | ||
output[outputPos++] = 'a'; | ||
break; | ||
case '\u0393': | ||
case '\u03B3': | ||
output[outputPos++] = 'g'; | ||
output[outputPos++] = 'a'; | ||
output[outputPos++] = 'm'; | ||
output[outputPos++] = 'm'; | ||
output[outputPos++] = 'a'; | ||
break; | ||
case '\u0394': | ||
case '\u03B4': | ||
output[outputPos++] = 'd'; | ||
output[outputPos++] = 'e'; | ||
output[outputPos++] = 'l'; | ||
output[outputPos++] = 't'; | ||
output[outputPos++] = 'a'; | ||
break; | ||
case '\u0395': | ||
case '\u03B5': | ||
output[outputPos++] = 'e'; | ||
output[outputPos++] = 'p'; | ||
output[outputPos++] = 's'; | ||
output[outputPos++] = 'i'; | ||
output[outputPos++] = 'l'; | ||
output[outputPos++] = 'o'; | ||
output[outputPos++] = 'n'; | ||
break; | ||
case '\u0396': | ||
case '\u03B6': | ||
output[outputPos++] = 'z'; | ||
output[outputPos++] = 'e'; | ||
output[outputPos++] = 't'; | ||
output[outputPos++] = 'a'; | ||
break; | ||
case '\u0397': | ||
case '\u03B7': | ||
output[outputPos++] = 'e'; | ||
output[outputPos++] = 't'; | ||
output[outputPos++] = 'a'; | ||
break; | ||
case '\u0398': | ||
case '\u03B8': | ||
output[outputPos++] = 't'; | ||
output[outputPos++] = 'h'; | ||
output[outputPos++] = 'e'; | ||
output[outputPos++] = 't'; | ||
output[outputPos++] = 'a'; | ||
break; | ||
case '\u0399': | ||
case '\u03B9': | ||
output[outputPos++] = 'i'; | ||
output[outputPos++] = 'o'; | ||
output[outputPos++] = 't'; | ||
output[outputPos++] = 'a'; | ||
break; | ||
case '\u039A': | ||
case '\u03BA': | ||
output[outputPos++] = 'k'; | ||
output[outputPos++] = 'a'; | ||
output[outputPos++] = 'p'; | ||
output[outputPos++] = 'p'; | ||
output[outputPos++] = 'a'; | ||
break; | ||
case '\u039B': | ||
case '\u03BB': | ||
output[outputPos++] = 'l'; | ||
output[outputPos++] = 'a'; | ||
output[outputPos++] = 'm'; | ||
output[outputPos++] = 'b'; | ||
output[outputPos++] = 'd'; | ||
output[outputPos++] = 'a'; | ||
break; | ||
case '\u039C': | ||
case '\u03BC': | ||
output[outputPos++] = 'm'; | ||
output[outputPos++] = 'u'; | ||
break; | ||
case '\u039D': | ||
case '\u03BD': | ||
output[outputPos++] = 'n'; | ||
output[outputPos++] = 'u'; | ||
break; | ||
case '\u039E': | ||
case '\u03BE': | ||
output[outputPos++] = 'x'; | ||
output[outputPos++] = 'i'; | ||
break; | ||
case '\u039F': | ||
case '\u03BF': | ||
output[outputPos++] = 'o'; | ||
output[outputPos++] = 'm'; | ||
output[outputPos++] = 'i'; | ||
output[outputPos++] = 'c'; | ||
output[outputPos++] = 'r'; | ||
output[outputPos++] = 'o'; | ||
output[outputPos++] = 'n'; | ||
break; | ||
case '\u03A0': | ||
case '\u03C0': | ||
output[outputPos++] = 'p'; | ||
output[outputPos++] = 'i'; | ||
break; | ||
case '\u03A1': | ||
case '\u03C1': | ||
output[outputPos++] = 'r'; | ||
output[outputPos++] = 'h'; | ||
output[outputPos++] = 'o'; | ||
break; | ||
case '\u03A3': | ||
case '\u03C3': | ||
output[outputPos++] = 's'; | ||
output[outputPos++] = 'i'; | ||
output[outputPos++] = 'g'; | ||
output[outputPos++] = 'm'; | ||
output[outputPos++] = 'a'; | ||
break; | ||
case '\u03A4': | ||
case '\u03C4': | ||
output[outputPos++] = 't'; | ||
output[outputPos++] = 'a'; | ||
output[outputPos++] = 'u'; | ||
break; | ||
case '\u03A5': | ||
case '\u03C5': | ||
output[outputPos++] = 'u'; | ||
output[outputPos++] = 'p'; | ||
output[outputPos++] = 's'; | ||
output[outputPos++] = 'i'; | ||
output[outputPos++] = 'l'; | ||
output[outputPos++] = 'o'; | ||
output[outputPos++] = 'n'; | ||
break; | ||
case '\u03A6': | ||
case '\u03C6': | ||
output[outputPos++] = 'p'; | ||
output[outputPos++] = 'h'; | ||
output[outputPos++] = 'i'; | ||
break; | ||
case '\u03A7': | ||
case '\u03C7': | ||
output[outputPos++] = 'c'; | ||
output[outputPos++] = 'h'; | ||
output[outputPos++] = 'i'; | ||
break; | ||
case '\u03A8': | ||
case '\u03C8': | ||
output[outputPos++] = 'p'; | ||
output[outputPos++] = 's'; | ||
output[outputPos++] = 'i'; | ||
break; | ||
case '\u03A9': | ||
case '\u03C9': | ||
output[outputPos++] = 'o'; | ||
output[outputPos++] = 'm'; | ||
output[outputPos++] = 'e'; | ||
output[outputPos++] = 'g'; | ||
output[outputPos++] = 'a'; | ||
break; | ||
default: | ||
output[outputPos++] = c; | ||
break; | ||
} | ||
} | ||
} | ||
return outputPos; | ||
} | ||
} |
56 changes: 56 additions & 0 deletions
56
.../src/java/org/apache/lucene/analysis/miscellaneous/AdsSpecialCharactersFilterFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
package org.apache.lucene.analysis.miscellaneous; | ||
|
||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
import java.util.Map; | ||
|
||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory; | ||
import org.apache.lucene.analysis.util.MultiTermAwareComponent; | ||
import org.apache.lucene.analysis.util.TokenFilterFactory; | ||
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; | ||
import org.apache.lucene.analysis.TokenStream; | ||
|
||
/** | ||
* Factory for {@link ASCIIFoldingFilter}. | ||
* <pre class="prettyprint" > | ||
* <fieldType name="text_ascii" class="solr.TextField" positionIncrementGap="100"> | ||
* <analyzer> | ||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/> | ||
* <filter class="solr.ASCIIFoldingFilterFactory"/> | ||
* </analyzer> | ||
* </fieldType></pre> | ||
* | ||
*/ | ||
public class AdsSpecialCharactersFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { | ||
public AdsSpecialCharactersFilterFactory(Map<String, String> args) { | ||
super(args); | ||
if (!args.isEmpty()) { | ||
throw new IllegalArgumentException("Unknown parameters: " + args); | ||
} | ||
} | ||
|
||
public AdsSpecialCharactersFilter create(TokenStream input) { | ||
return new AdsSpecialCharactersFilter(input); | ||
} | ||
|
||
@Override | ||
public AbstractAnalysisFactory getMultiTermComponent() { | ||
return this; | ||
} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.