Skip to content

Commit

Permalink
add ewts converter (may need some optimizations, may not)
Browse files Browse the repository at this point in the history
  • Loading branch information
eroux committed Aug 25, 2017
1 parent 65633ce commit e2b2f26
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 77 deletions.
106 changes: 106 additions & 0 deletions src/main/java/io/bdrc/lucene/bo/TibEwtsFilter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
package io.bdrc.lucene.bo;

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.charfilter.BaseCharFilter;
import org.apache.lucene.analysis.util.RollingCharBuffer;

import io.bdrc.ewtsconverter.EwtsConverter;

/**
 * A char filter that converts EWTS input into Tibetan Unicode.
 *
 * <p>The input is consumed in chunks: characters are accumulated until a
 * chunk boundary is seen (space, {@code '*'}, {@code '_'}, any character
 * above 127, or more than {@link #MAX_EWTS_LEN} characters), the chunk is
 * handed to the shared {@link #converter}, and the converted text is then
 * served one character at a time. Offset corrections are registered so
 * that output offsets can be mapped back to input offsets.
 *
 * <p>Partially inspired by Lucene 6
 * {@code org.apache.lucene.analysis.charfilter.MappingCharFilter}.
 *
 * @author Elie Roux
 **/
public class TibEwtsFilter extends BaseCharFilter {

    /** Converter shared by every filter instance. */
    public static final EwtsConverter converter = new EwtsConverter(false, false, false, true);

    /** A chunk is cut once it has grown past this many characters without hitting a delimiter. */
    private static final int MAX_EWTS_LEN = 32;

    private final RollingCharBuffer buffer = new RollingCharBuffer();
    /** Converted text of the current chunk, or {@code null} when there is none pending. */
    private String replacement = null;
    /** Index of the next character of {@link #replacement} to return. */
    private int replacementIdx = -1;
    /** Cached length of {@link #replacement}. */
    private int replacementLen = -1;
    /** Offset, in the input, of the next character to consume. */
    private int inputOff;
    /** Scratch buffer holding the raw text of the chunk being assembled; reused across chunks. */
    StringBuilder tmpEwts = new StringBuilder();

    /**
     * Creates a filter reading EWTS text from {@code in}.
     *
     * @param in the reader providing the EWTS input
     */
    public TibEwtsFilter(Reader in) {
        super(in);
        buffer.reset(in);
        inputOff = 0;
    }

    /**
     * Resets the underlying reader and discards any pending converted text.
     *
     * @throws IOException if the underlying reader cannot be reset
     */
    @Override
    public void reset() throws IOException {
        input.reset();
        buffer.reset(input);
        replacement = null;
        replacementIdx = -1;
        replacementLen = -1;
        tmpEwts.setLength(0);
        inputOff = 0;
    }

    /**
     * Returns the next converted character.
     *
     * <p>A chunk whose conversion is empty does not end the stream: its
     * offset correction is recorded and the next chunk is processed.
     *
     * @return the next character, or {@code -1} once the input is exhausted
     * @throws IOException if reading from the underlying input fails
     */
    @Override
    public int read() throws IOException {
        while (true) {
            // Serve what is left of the previously converted chunk first.
            if (replacement != null && replacementIdx < replacementLen) {
                return replacement.charAt(replacementIdx++);
            }
            replacement = null;
            replacementIdx = 0;
            replacementLen = 0;

            final int initialInputOff = inputOff;
            final boolean endOfInput = fillChunk(initialInputOff);
            buffer.freeBefore(inputOff);

            final int consumed = inputOff - initialInputOff;
            if (consumed == 0) {
                // Nothing could be read: only happens at the end of the input.
                return -1;
            }

            final String converted = converter.toUnicode(tmpEwts.toString());
            final int producedLen = converted == null ? 0 : converted.length();
            recordOffsetCorrection(consumed - producedLen);

            if (producedLen == 0) {
                if (endOfInput) {
                    return -1;
                }
                // Empty conversion in the middle of the input: move on to
                // the next chunk instead of reporting a premature end.
                continue;
            }

            replacement = converted;
            replacementLen = producedLen;
            replacementIdx = 1;
            return converted.charAt(0);
        }
    }

    /**
     * Fills {@link #tmpEwts} with the next chunk of raw input, advancing
     * {@link #inputOff} past every character consumed. The delimiter that
     * closes a chunk is part of the chunk.
     *
     * @param chunkStart value of {@link #inputOff} when the chunk began
     * @return {@code true} if the end of the input was reached
     */
    private boolean fillChunk(int chunkStart) throws IOException {
        tmpEwts.setLength(0);
        while (true) {
            final int c = buffer.get(inputOff);
            if (c == -1) {
                return true;
            }
            inputOff++;
            tmpEwts.append((char) c);
            if (isChunkDelimiter(c) || inputOff - chunkStart > MAX_EWTS_LEN) {
                return false;
            }
        }
    }

    /** Tells whether {@code c} closes the chunk currently being assembled. */
    private static boolean isChunkDelimiter(int c) {
        return c == ' ' || c == '*' || c == '_' || c > 127;
    }

    /**
     * Registers the offset correction for a chunk that has just been
     * consumed; same scheme as Lucene's MappingCharFilter.
     *
     * @param diff consumed input length minus converted output length
     */
    private void recordOffsetCorrection(int diff) {
        if (diff == 0) {
            return;
        }
        final int prevCumulativeDiff = getLastCumulativeDiff();
        if (diff > 0) {
            // Output is shorter than the input it replaces.
            addOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
        } else {
            // Output is longer than the input: one correction per extra character.
            final int outputStart = inputOff - prevCumulativeDiff;
            for (int extraIdx = 0; extraIdx < -diff; extraIdx++) {
                addOffCorrectMap(outputStart + extraIdx, prevCumulativeDiff - extraIdx - 1);
            }
        }
    }

    /**
     * Reads up to {@code len} converted characters into {@code cbuf}.
     *
     * @param cbuf destination array
     * @param off index in {@code cbuf} of the first character to write
     * @param len maximum number of characters to read
     * @return the number of characters read, {@code 0} when {@code len} is
     *         zero, or {@code -1} if the end of the input was already reached
     * @throws IOException if reading from the underlying input fails
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        if (len == 0) {
            return 0;
        }
        int numRead = 0;
        while (numRead < len) {
            final int c = read();
            if (c == -1) {
                break;
            }
            cbuf[off + numRead] = (char) c;
            numRead++;
        }
        return numRead == 0 ? -1 : numRead;
    }
}
90 changes: 13 additions & 77 deletions src/test/java/io/bdrc/lucene/bo/TibetanAnalyzerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,6 @@
import org.junit.BeforeClass;
import org.junit.Test;

import io.bdrc.lucene.stemmer.Optimizer;
import io.bdrc.lucene.stemmer.Row;
import io.bdrc.lucene.stemmer.Trie;

import static org.hamcrest.CoreMatchers.*;

/**
Expand Down Expand Up @@ -120,78 +116,6 @@ public boolean isTibLetter(int c) {
return ('\u0F40' <= c && c <= '\u0FBC');
}

/**
 * Walks the Trie along the characters of a string, remembering the last
 * position at which the current row holds a command and the following
 * character is not a Tibetan letter (or the string ends). Inspired by
 * getLastOnPath() in the stemmer's Trie.java.
 *
 * @param toAnalyze the string to analyze
 * @param startCharIndex the index from which the walk starts
 * @param t the Trie containing the data
 */
public void produceOneToken(String toAnalyze, int startCharIndex, Trie t) {
    final int length = toAnalyze.length();
    Row currentRow = t.getRow(t.getRoot());
    int matchEnd = -1;  // index in the string of the last character of the best match
    int matchCmd = -1;  // index inside the Trie of the command attached to the best match

    for (int pos = startCharIndex; pos < length; pos++) {
        Character current = toAnalyze.charAt(pos);

        // A command on this row only counts as a match on a token boundary.
        int cmd = currentRow.getCmd(current);
        if (cmd >= 0) {
            boolean followedByLetter = pos < length - 1 && isTibLetter(toAnalyze.charAt(pos + 1));
            if (!followedByLetter) {
                matchCmd = cmd;
                matchEnd = pos;
            }
        }

        // Without a reference for this character no longer match is possible.
        int nextRow = currentRow.getRef(current);
        if (nextRow < 0) {
            break;
        }
        currentRow = t.getRow(nextRow);
    }

    if (matchEnd != -1) {
        // A token spanning startCharIndex..matchEnd was found, with command
        // index matchCmd; nothing is reported for it.
    }
}

/**
 * Builds a small reduced Trie and runs produceOneToken() over a handful of
 * sample strings.
 */
@Test
public void produceOneTokenTest() throws IOException
{
    System.out.println("Testing Stemmer Trie (produceOneToken() )");

    // {key, command} pairs, inserted in this order
    final String[][] entries = {
        {"དྲོའི", ">a"},
        {"བདེ་ལེགས", "="},
        {"བདེ", "="},
        {"བཀྲ་ཤིས", "="},
        {"བཀྲ", "="},
        {"དྲོ", "="},
        {"དགའི", ">A"},
        {"དགའ", "="},
    };
    final String[] samples = {"དག", "དགའི", "བཀྲ་", "བཀྲད", "བདེ་ལེགས"};

    Trie trie = new Trie(true);
    for (String[] entry : entries) {
        trie.add(entry[0], entry[1]);
    }
    trie.reduce(new Optimizer());

    for (String sample : samples) {
        produceOneToken(sample, 0, trie);
    }
}

@Test
public void wordTokenizerLemmatizeTest() throws IOException
{
Expand Down Expand Up @@ -231,7 +155,19 @@ public void mappingCharFilterTest() throws IOException
TokenStream res = tokenize(new TibCharFilter(reader), new TibSyllableTokenizer());
assertTokenStream(res, expected);
}


/**
 * Runs a mixed EWTS / Tibetan Unicode string through TibEwtsFilter and the
 * syllable tokenizer, and compares the tokens with the expected syllables.
 */
@Test
public void ewtsFilterTest() throws IOException
{
    System.out.println("Testing TibEwtsFilter()");
    final String ewtsInput = "bod rgyal lo invalid བོད";
    final List<String> expectedSyllables = Arrays.asList("བོད", "རྒྱལ", "ལོ", "ཨིནབ", "ལིད", "བོད");
    System.out.print(ewtsInput + " => ");
    final Reader source = new StringReader(ewtsInput);
    assertTokenStream(
            tokenize(new TibEwtsFilter(source), new TibSyllableTokenizer()),
            expectedSyllables);
}

@Test
public void bugEatenSyllable() throws IOException
{
Expand Down

0 comments on commit e2b2f26

Please sign in to comment.