add ewts converter (may need some optimizations, may not)

buda-base · Aug 25, 2017 · e2b2f26 · e2b2f26
1 parent 65633ce
commit e2b2f26
Show file tree

Hide file tree

Showing 2 changed files with 119 additions and 77 deletions.
diff --git a/src/main/java/io/bdrc/lucene/bo/TibEwtsFilter.java b/src/main/java/io/bdrc/lucene/bo/TibEwtsFilter.java
@@ -0,0 +1,106 @@
+package io.bdrc.lucene.bo;
+
+import java.io.IOException;
+import java.io.Reader;
+import org.apache.lucene.analysis.charfilter.BaseCharFilter;
+import org.apache.lucene.analysis.util.RollingCharBuffer;
+
+import io.bdrc.ewtsconverter.EwtsConverter;
+
+/**
+ * A filter that converts EWTS input into Tibetan Unicode
+ * 
+ * Partially inspired by Lucene 6 org.apache.lucene.analysis.charfilter.MappingCharFilter
+ * 
+ * @author Elie Roux
+ **/
+public class TibEwtsFilter extends BaseCharFilter {
+
+	/** Converter shared by all filter instances; every chunk of input is passed through it. */
+	public static final EwtsConverter converter = new EwtsConverter(false, false, false, true);
+
+	/**
+	 * Length threshold for a chunk that contains no delimiter. The check in
+	 * {@link #fillChunk()} is a strict "greater than", so a chunk can hold up to
+	 * {@code MAX_EWTS_LEN + 1} characters before it is cut.
+	 */
+	private static final int MAX_EWTS_LEN = 32;
+
+	private final RollingCharBuffer buffer = new RollingCharBuffer();
+	/** Converted text of the current chunk, or null when nothing is pending. */
+	private String replacement = null;
+	/** Index of the next character of {@link #replacement} to hand out. */
+	private int replacementIdx = -1;
+	private int replacementLen = -1;
+	/** Position in the original (unconverted) input of the next character to consume. */
+	private int inputOff;
+	/** Accumulates the raw characters of the chunk currently being read. */
+	StringBuilder tmpEwts = new StringBuilder();
+
+	/**
+	 * Creates a filter reading EWTS text from the given reader.
+	 *
+	 * @param in the reader providing the raw input
+	 */
+	public TibEwtsFilter(Reader in) {
+		super(in);
+		buffer.reset(in);
+		inputOff = 0;
+	}
+
+	/**
+	 * Resets the underlying reader and discards any pending converted text.
+	 *
+	 * @throws IOException if the underlying reader cannot be reset
+	 */
+	@Override
+	public void reset() throws IOException {
+		input.reset();
+		buffer.reset(input);
+		replacement = null;
+		replacementIdx = -1;
+		replacementLen = -1;
+		inputOff = 0;
+	}
+
+	/**
+	 * Returns the next converted character.
+	 *
+	 * Input is consumed in chunks (see {@link #fillChunk()}); each chunk is converted as a
+	 * whole and then handed out one character at a time. A chunk whose conversion is null or
+	 * empty is skipped instead of being reported as end of stream, so the remaining input
+	 * is still processed.
+	 *
+	 * @return the next character, or -1 once the input is exhausted
+	 * @throws IOException if reading the underlying input fails
+	 */
+	@Override
+	public int read() throws IOException {
+		if (replacement != null && replacementIdx < replacementLen) {
+			return replacement.charAt(replacementIdx++);
+		}
+		replacement = null;
+		replacementIdx = 0;
+		replacementLen = 0;
+		while (true) {
+			final int chunkStart = inputOff;
+			fillChunk();
+			buffer.freeBefore(inputOff);
+			if (tmpEwts.length() == 0) {
+				// nothing left to consume: genuine end of input
+				return -1;
+			}
+			final String converted = converter.toUnicode(tmpEwts.toString());
+			final int convertedLen = converted == null ? 0 : converted.length();
+			recordOffsetCorrection((inputOff - chunkStart) - convertedLen);
+			if (convertedLen > 0) {
+				replacement = converted;
+				replacementIdx = 1;
+				replacementLen = convertedLen;
+				return converted.charAt(0);
+			}
+			// the chunk produced no output: move on to the next one
+		}
+	}
+
+	/**
+	 * Fills {@link #tmpEwts} with the next chunk of raw input and advances {@link #inputOff}
+	 * past it. A chunk ends after a space, '*', '_' or any character above 127, when it grows
+	 * beyond {@link #MAX_EWTS_LEN}, or at the end of the input. The buffer is left empty
+	 * when no input remains.
+	 */
+	private void fillChunk() throws IOException {
+		tmpEwts.setLength(0);
+		final int chunkStart = inputOff;
+		while (true) {
+			final int c = buffer.get(inputOff);
+			if (c == -1) {
+				return;
+			}
+			inputOff++;
+			tmpEwts.append((char) c);
+			if (c == ' ' || c == '*' || c == '_' || c > 127 || inputOff - chunkStart > MAX_EWTS_LEN) {
+				return;
+			}
+		}
+	}
+
+	/**
+	 * Registers an offset correction for the chunk ending at {@link #inputOff}.
+	 * Logic taken from Lucene's MappingCharFilter.
+	 *
+	 * @param diff length of the raw chunk minus length of its conversion
+	 */
+	private void recordOffsetCorrection(final int diff) {
+		if (diff == 0) {
+			return;
+		}
+		final int prevCumulativeDiff = getLastCumulativeDiff();
+		if (diff > 0) {
+			// conversion is shorter than the raw chunk
+			addOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
+		} else {
+			// conversion is longer than the raw chunk: one correction per extra output char
+			final int outputStart = inputOff - prevCumulativeDiff;
+			for (int extraIdx = 0; extraIdx < -diff; extraIdx++) {
+				addOffCorrectMap(outputStart + extraIdx, prevCumulativeDiff - extraIdx - 1);
+			}
+		}
+	}
+
+	/**
+	 * Reads up to {@code len} converted characters into {@code cbuf}, starting at {@code off}.
+	 *
+	 * @param cbuf destination array
+	 * @param off index in {@code cbuf} of the first character to write
+	 * @param len maximum number of characters to read
+	 * @return the number of characters read, 0 when {@code len} is 0, or -1 when the input
+	 *         is exhausted before any character could be read
+	 * @throws IOException if reading the underlying input fails
+	 */
+	@Override
+	public int read(char[] cbuf, int off, int len) throws IOException {
+		if (len == 0) {
+			return 0;
+		}
+		int numRead = 0;
+		while (numRead < len) {
+			final int c = read();
+			if (c == -1) {
+				break;
+			}
+			cbuf[off + numRead] = (char) c;
+			numRead++;
+		}
+		return numRead == 0 ? -1 : numRead;
+	}
+}
diff --git a/src/test/java/io/bdrc/lucene/bo/TibetanAnalyzerTest.java b/src/test/java/io/bdrc/lucene/bo/TibetanAnalyzerTest.java
@@ -36,10 +36,6 @@
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import io.bdrc.lucene.stemmer.Optimizer;
-import io.bdrc.lucene.stemmer.Row;
-import io.bdrc.lucene.stemmer.Trie;
-
 import static org.hamcrest.CoreMatchers.*;
 
 /**
@@ -120,78 +116,6 @@ public boolean isTibLetter(int c) {
 		return ('\u0F40' <= c && c <= '\u0FBC');
 	}
 
-	/**
-	 *  this function is inspired from getLastOnPath() in stemmer's Trie.java
-	 * @param toAnalyze the string to analyse
-	 * @param startCharIndex the index from which we want to analyze
-	 * @param t the Trie containing the data
-	 */
-	// 
-	public void produceOneToken(String toAnalyze, int startCharIndex, Trie t) {
-		// getting the root of the tree
-//		System.out.println(toAnalyze);
-		Row now = t.getRow(t.getRoot());
-		int w; // temporary index variable
-		int lastCharIndex = -1; // the index of the last match in the string we analyze
-		int lastCmdIndex = -1; // the index (inside the Trie) of the cmd corresponding to the last match
-
-		int i = startCharIndex; // the current index in the string
-		while (i < toAnalyze.length()) {
-			Character ch = toAnalyze.charAt(i); // get the current character
-//			System.out.println("moving to index "+i+": "+ch);
-			w = now.getCmd(ch); // get the command associated with the current character at next step in the Trie
-			if (w >= 0) {
-				if (i >= toAnalyze.length()-1 || !isTibLetter(toAnalyze.charAt(i+1))) {
-//						System.out.println("current row has an command for it, so it's a match");
-						lastCmdIndex = w;
-						lastCharIndex = i;
-					}
-            } else {
-//            	System.out.println("current row does not have a command for it, no match");
-            }
-			w = now.getRef(ch); // get the next row if there is one
-			if (w >= 0) {
-//				System.out.println("current row does have a reference for this char, further matches are possible, moving one row forward in the Trie");
-                now = t.getRow(w);
-            } else {
-//            	System.out.println("current row does not have a reference to this char, so there's no further possible match, breaking the loop");
-                break; // no more steps possible in our research
-            }
-			i++;
-		}
-		//w = now.getCmd(toAnalyze.charAt(i));
-		if (lastCharIndex == -1) {
-//			System.out.println("I have found nothing");
-			return;
-		}
-//		System.out.println("I have found a token that goes from "+startCharIndex+" to "
-//				+ lastCharIndex);
-//		System.out.println("the substring is: "+toAnalyze.substring(startCharIndex, lastCharIndex+1));
-//		System.out.println("the command associated with this token in the Trie is: "+t.getCommandVal(lastCmdIndex));
-	}
-
-	@Test
-	public void produceOneTokenTest() throws IOException
-	{
-		System.out.println("Testing Stemmer Trie (produceOneToken() )");
-		Trie test = new Trie(true);
-		test.add("དྲོའི",">a");
-		test.add("བདེ་ལེགས","=");
-		test.add("བདེ", "=");
-		test.add("བཀྲ་ཤིས","=");
-		test.add("བཀྲ", "=");
-		test.add("དྲོ","=");
-		test.add("དགའི", ">A");
-		test.add("དགའ","=");
-		Optimizer opt = new Optimizer();
-		test.reduce(opt);
-		produceOneToken("དག", 0, test);
-		produceOneToken("དགའི", 0, test);
-		produceOneToken("བཀྲ་", 0, test);
-		produceOneToken("བཀྲད", 0, test);
-		produceOneToken("བདེ་ལེགས", 0, test);
-	}
-
 	@Test
 	public void wordTokenizerLemmatizeTest() throws IOException
 	{
@@ -231,7 +155,19 @@ public void mappingCharFilterTest() throws IOException
 		TokenStream res = tokenize(new TibCharFilter(reader), new TibSyllableTokenizer());
 		assertTokenStream(res, expected);
 	}
-
+
+	/**
+	 * Sends a string mixing EWTS and Tibetan Unicode through TibEwtsFilter and
+	 * TibSyllableTokenizer, then compares the resulting tokens with the expected syllables.
+	 */
+	@Test
+	public void ewtsFilterTest() throws IOException
+	{
+		System.out.println("Testing TibEwtsFilter()");
+		final String ewtsText = "bod rgyal lo invalid བོད";
+		// per the expectation below, the Latin word "invalid" is still converted and comes out as two syllables
+		final List<String> expectedSyllables = Arrays.asList("བོད", "རྒྱལ", "ལོ",  "ཨིནབ",  "ལིད",  "བོད");
+		System.out.print(ewtsText + " => ");
+		final TibEwtsFilter ewtsFilter = new TibEwtsFilter(new StringReader(ewtsText));
+		final TokenStream syllables = tokenize(ewtsFilter, new TibSyllableTokenizer());
+		assertTokenStream(syllables, expectedSyllables);
+	}
+
 	@Test
 	public void bugEatenSyllable() throws IOException
 	{