Skip to content

Commit

Permalink
Merged with the master
Browse files Browse the repository at this point in the history
  • Loading branch information
romanchyla committed Nov 10, 2014
2 parents fa235b3 + 28f9d8b commit dc32c6a
Show file tree
Hide file tree
Showing 25 changed files with 1,073 additions and 453 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
package org.apache.lucene.analysis.miscellaneous;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * A token filter, modelled on ASCIIFoldingFilter, that spells out Greek
 * letters (frequently used as math symbols) with their Latin names.
 *
 * <p>For example, 'γ' (and its upper-case form 'Γ') is replaced by "gamma".
 * Every other character, ASCII or not, is passed through unchanged. Tokens
 * that contain only characters below U+0080 are returned untouched without
 * any copying.
 */
public final class AdsSpecialCharactersFilter extends TokenFilter {

  /**
   * Latin names of the Greek letters, indexed by the letter's offset from
   * capital alpha (U+0391) or small alpha (U+03B1). The {@code null} slot at
   * offset 17 corresponds to U+03A2 / U+03C2, which are deliberately left
   * unmapped and therefore copied through as-is.
   */
  private static final String[] GREEK_LETTER_NAMES = {
    "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
    "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
    null, "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega"
  };

  /**
   * Largest number of output characters a single input character can expand
   * to: the length of the longest name in {@link #GREEK_LETTER_NAMES}
   * ("epsilon", "omicron", "upsilon").
   */
  private static final int MAX_EXPANSION = 7;

  /** Scratch buffer holding the folded form of the current token; grown on demand. */
  private char[] output = new char[512];

  /** Number of valid characters in {@link #output} after the last fold. */
  private int outputPos;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * Creates a filter that rewrites the tokens produced by {@code input}.
   *
   * @param input the upstream token stream
   */
  public AdsSpecialCharactersFilter(TokenStream input) {
    super(input);
  }

  /**
   * Advances to the next token and, if it contains at least one character at
   * or above U+0080, replaces the term text with its folded form.
   *
   * @return {@code true} if a token is available, {@code false} at end of stream
   * @throws IOException if the upstream token stream fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }

    final char[] buffer = termAtt.buffer();
    final int length = termAtt.length();

    // Pure-ASCII tokens are returned as-is; the first non-ASCII character
    // triggers a single fold of the whole term.
    for (int i = 0; i < length; ++i) {
      if (buffer[i] >= '\u0080') {
        foldToASCII(buffer, length);
        termAtt.copyBuffer(output, 0, outputPos);
        break;
      }
    }
    return true;
  }

  /**
   * Folds the first {@code length} characters of {@code input} into this
   * filter's internal buffer, growing the buffer if necessary.
   *
   * @param input The characters to fold
   * @param length The number of characters in the input to fold
   */
  public void foldToASCII(char[] input, int length) {
    // Worst case every character is a Greek letter with a 7-character name.
    // (The previous factor of 4 under-allocated and could overflow the buffer.)
    final int maxSizeNeeded = MAX_EXPANSION * length;
    if (output.length < maxSizeNeeded) {
      output = new char[ArrayUtil.oversize(maxSizeNeeded, RamUsageEstimator.NUM_BYTES_CHAR)];
    }

    outputPos = foldToASCII(input, 0, output, 0, length);
  }

  /**
   * Replaces Greek letters with their Latin names and copies every other
   * character unchanged.
   *
   * @param input The characters to fold
   * @param inputPos Index of the first character to fold
   * @param output The result of the folding. Must have room for at least
   *        {@code length * 7} characters starting at {@code outputPos}.
   * @param outputPos Index of output where to put the result of the folding
   * @param length The number of characters to fold
   * @return index in {@code output} just past the last character written
   * @lucene.internal
   */
  public static final int foldToASCII(char input[], int inputPos, char output[], int outputPos, int length) {
    final int end = inputPos + length;
    for (int pos = inputPos; pos < end; ++pos) {
      final char c = input[pos];
      final String name = greekLetterName(c);
      if (name == null) {
        // Not a mapped Greek letter (this includes all ASCII): keep as-is.
        output[outputPos++] = c;
      } else {
        for (int i = 0; i < name.length(); ++i) {
          output[outputPos++] = name.charAt(i);
        }
      }
    }
    return outputPos;
  }

  /**
   * Returns the Latin name for a Greek letter, or {@code null} when the
   * character has no replacement.
   */
  private static String greekLetterName(char c) {
    final int index;
    if (c >= '\u0391' && c <= '\u03A9') {
      index = c - '\u0391';
    } else if (c >= '\u03B1' && c <= '\u03C9') {
      index = c - '\u03B1';
    } else {
      return null;
    }
    return GREEK_LETTER_NAMES[index];
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package org.apache.lucene.analysis.miscellaneous;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.util.Map;

import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.TokenStream;

/**
 * Factory for {@link AdsSpecialCharactersFilter}.
 * <pre class="prettyprint" >
 * &lt;fieldType name="text_greek" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
 *     &lt;filter class="org.apache.lucene.analysis.miscellaneous.AdsSpecialCharactersFilterFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 *
 * The filter itself takes no configuration parameters.
 */
public class AdsSpecialCharactersFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {

  /**
   * Creates a new factory.
   *
   * @param args configuration arguments; any entry still present after the
   *        superclass constructor has run is treated as unknown
   * @throws IllegalArgumentException if unknown parameters remain in {@code args}
   */
  public AdsSpecialCharactersFilterFactory(Map<String, String> args) {
    super(args);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  /**
   * Wraps {@code input} in a new {@link AdsSpecialCharactersFilter}.
   *
   * @param input the token stream to filter
   * @return the filtering token stream
   */
  @Override
  public AdsSpecialCharactersFilter create(TokenStream input) {
    return new AdsSpecialCharactersFilter(input);
  }

  /**
   * Returns this factory, so the same filter is used as the multi-term component.
   */
  @Override
  public AbstractAnalysisFactory getMultiTermComponent() {
    return this;
  }
}

Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,18 @@
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNodeImpl;
import org.apache.lucene.queryparser.flexible.core.parser.EscapeQuerySyntax;
import org.apache.lucene.search.CacheWrapper;
import org.apache.lucene.search.SolrCacheWrapper;


public class PythonQueryNode extends QueryNodeImpl implements QueryNode {

private static final long serialVersionUID = 3935454544149998076L;
private boolean useIntBits;
private SolrCacheWrapper cacheWrapper;
private boolean useIntBits = false;
@SuppressWarnings("rawtypes")
private SolrCacheWrapper cacheWrapper;
private String pythonFunctionName;

public PythonQueryNode(QueryNode query, SolrCacheWrapper cache) {
@SuppressWarnings("rawtypes")
public PythonQueryNode(QueryNode query, SolrCacheWrapper cache) {
if (query == null) {
throw new QueryNodeError(new MessageImpl(
QueryParserMessages.PARAMETER_VALUE_NOT_SUPPORTED, "query",
Expand Down Expand Up @@ -64,7 +65,7 @@ public QueryNode getChild() {
* between java and python
*/
public boolean useIntBitSet() {
return false;
return useIntBits;
}

public void setIntBitSet(boolean v) {
Expand All @@ -77,11 +78,13 @@ public void setIntBitSet(boolean v) {
* lucene ids. This wrapper should provide that functionality
* This should not be null
*/
public SolrCacheWrapper getCacheWrapper() {
@SuppressWarnings("rawtypes")
public SolrCacheWrapper getCacheWrapper() {
return cacheWrapper;
}

public void setCacheWrapper(SolrCacheWrapper cache) {
@SuppressWarnings("rawtypes")
public void setCacheWrapper(SolrCacheWrapper cache) {
cacheWrapper = cache;
}

Expand Down
Loading

0 comments on commit dc32c6a

Please sign in to comment.