Skip to content

Commit

Permalink
include stopword list, fix issues with stopword filter
Browse files (browse the repository at this point in the history)
  • Loading branch information
eroux committed Jan 19, 2018
1 parent 3e548b0 commit 6edb146
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 13 deletions.
6 changes: 6 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,12 @@
<include>total_lexicon.txt</include>
</includes>
</resource>
<resource>
<directory>src/main/resources/</directory>
<includes>
<include>*</include>
</includes>
</resource>
</resources>
<plugins>
<plugin>
Expand Down
31 changes: 18 additions & 13 deletions src/main/java/io/bdrc/lucene/bo/TibetanAnalyzer.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.IOUtils;

Expand Down Expand Up @@ -76,7 +76,7 @@ public TibetanAnalyzer(boolean segmentInWords, boolean lemmatize, boolean filter
if (stopFilename != null) {
if (stopFilename.isEmpty()) {
InputStream stream = null;
stream = TibetanAnalyzer.class.getResourceAsStream("/bo-stopwords.txt");
stream = TibetanAnalyzer.class.getResourceAsStream("bo-stopwords.txt");
if (stream == null) { // we're not using the jar, there is no resource, assuming we're running the code
this.tibStopSet = null;
} else {
Expand Down Expand Up @@ -149,29 +149,34 @@ protected Reader initReader(String fieldName, Reader reader) {
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
Tokenizer source = null;
TokenStream filter = null;
TokenFilter filter = null;

if (segmentInWords) {
try {
source = new TibWordTokenizer();
if (lemmatize) {
((TibWordTokenizer) source).setLemmatize(lemmatize);
}
((TibWordTokenizer) source).setLemmatize(lemmatize);
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
filter = new StopFilter(source, tibStopSet);

} else {
source = new TibSyllableTokenizer();
if (lemmatize) {
filter = (TibAffixedFilter) new TibAffixedFilter(source);
filter = new TibAffixedFilter(source);
}
filter = new StopFilter(filter, tibStopSet);
}

return new TokenStreamComponents(source, filter);
}
if (tibStopSet != null) {
if (filter != null) {
filter = new StopFilter(filter, tibStopSet);
} else {
filter = new StopFilter(source, tibStopSet);
}
}
if (filter != null) {
return new TokenStreamComponents(source, filter);
} else {
return new TokenStreamComponents(source);
}
}
}

0 comments on commit 6edb146

Please sign in to comment.