Skip to content

Commit

Permalink
Optionally analyze compound word parts
Browse files Browse the repository at this point in the history
  • Loading branch information
mortterna authored and komu committed Jan 31, 2023
1 parent 3c540b0 commit 6a1365a
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 23 deletions.
13 changes: 7 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,13 @@ Include `finnish` tokenizer and `raudikko` filter in your analyzer, for example:

You can use the following filter options to customize the behaviour of the filter:

| Parameter | Default value | Description |
|-------------------|------------------|--------------------------------------------------|
| analyzeAll | true | Use all analysis possibilities or just the first |
| minimumWordSize | 3 | minimum length of words to analyze |
| maximumWordSize | 100 | maximum length of words to analyze |
| analysisCacheSize | 1024 | number of analysis results to cache |
| Parameter | Default value | Description |
|---------------------|---------------|--------------------------------------------------|
| analyzeAll | true | Use all analysis possibilities or just the first |
| splitCompoundWords  | false         | Split analysed compound words into their parts   |
| minimumWordSize | 3 | minimum length of words to analyze |
| maximumWordSize | 100 | maximum length of words to analyze |
| analysisCacheSize | 1024 | number of analysis results to cache |

## Compatibility with elasticsearch-analysis-voikko

Expand Down
2 changes: 1 addition & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ repositories {

dependencies {
compileOnly("org.elasticsearch:elasticsearch:$elasticsearchVersion")
implementation("fi.evident.raudikko:raudikko:0.1.1")
implementation("fi.evident.raudikko:raudikko:0.1.2")

testImplementation("org.elasticsearch:elasticsearch:$elasticsearchVersion")
testImplementation(platform("org.junit:junit-bom:5.7.0"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

package fi.evident.elasticsearch.raudikko.analysis;

import fi.evident.raudikko.Analysis;
import fi.evident.raudikko.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
Expand Down Expand Up @@ -71,16 +72,16 @@ private void analyzeToken() {
if (!isCandidateForAnalysis(charTermAttribute))
return;

List<String> baseForms = analyze(charTermAttribute);
if (baseForms.isEmpty())
List<String> analysis = analyze(charTermAttribute);
if (analysis.isEmpty())
return;

charTermAttribute.setEmpty().append(baseForms.get(0));
charTermAttribute.setEmpty().append(analysis.get(0));

if (cfg.analyzeAll && baseForms.size() > 1) {
if ((cfg.analyzeAll || cfg.splitCompoundWords) && analysis.size() > 1) {
current = captureState();

alternatives.addAll(baseForms.subList(1, baseForms.size()));
alternatives.addAll(analysis.subList(1, analysis.size()));
}
}

Expand All @@ -95,16 +96,31 @@ private List<String> analyze(CharSequence wordSeq) {
}

private List<String> analyzeUncached(String word) {
List<String> results = raudikkoAnalyzer.baseForms(word);

switch (results.size()) {
case 0:
return Collections.emptyList();
case 1:
return Collections.singletonList(results.get(0));
default:
return new ArrayList<>(results);

List<Analysis> analysisResults = raudikkoAnalyzer.analyze(word);

if (analysisResults.isEmpty())
return Collections.emptyList();

List<String> results = new ArrayList<>();

for (Analysis analysis : analysisResults) {
String baseForm = analysis.getBaseForm();
if (baseForm != null && !results.contains(baseForm)) {
results.add(baseForm);
}

List<String> baseFormParts = analysis.getBaseFormParts();
if (baseFormParts != null) {
for (String baseFormPart : baseFormParts) {
if (!results.contains(baseFormPart)) {
results.add(baseFormPart);
}
}
}
}

return results;
}

private void outputAlternative(String token) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ public class RaudikkoTokenFilterConfiguration {
/** If true, use analysis candidates returned by Raudikko, otherwise use only the first result. */
boolean analyzeAll = true;

/** If true, split analysed compound words into their parts. */
boolean splitCompoundWords = false;

/** Words shorter than this threshold are ignored */
int minimumWordSize = 3;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package fi.evident.elasticsearch.raudikko.analysis;

import fi.evident.raudikko.AnalyzerConfiguration;
import fi.evident.raudikko.Morphology;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.settings.Settings;
Expand All @@ -39,6 +40,7 @@ public RaudikkoTokenFilterFactory(IndexSettings indexSettings,
super(name, settings);

cfg.analyzeAll = settings.getAsBoolean("analyzeAll", cfg.analyzeAll);
cfg.splitCompoundWords = settings.getAsBoolean("splitCompoundWords", cfg.splitCompoundWords);
cfg.minimumWordSize = settings.getAsInt("minimumWordSize", cfg.minimumWordSize);
cfg.maximumWordSize = settings.getAsInt("maximumWordSize", cfg.maximumWordSize);

Expand All @@ -50,6 +52,21 @@ public RaudikkoTokenFilterFactory(IndexSettings indexSettings,

@Override
public TokenStream create(TokenStream tokenStream) {
    // Delegate to the static helper so the Raudikko analyzer configuration
    // derived from the filter settings is built in exactly one place.
    return createTokenFilter(tokenStream, cfg, analysisCache, morphology);
}

/**
 * Builds a {@link RaudikkoTokenFilter} backed by a Raudikko analyzer that is
 * configured to produce only the data the filter consumes: base forms, plus
 * compound-word parts when {@code filterCfg.splitCompoundWords} is enabled.
 *
 * @param tokenStream   upstream token stream to wrap
 * @param filterCfg     filter options controlling the analysis
 * @param analysisCache cache shared between filter instances
 * @param morphology    morphology used to create the analyzer
 * @return the configured token filter
 */
public static RaudikkoTokenFilter createTokenFilter(TokenStream tokenStream,
                                                    RaudikkoTokenFilterConfiguration filterCfg,
                                                    AnalysisCache analysisCache,
                                                    Morphology morphology) {
    AnalyzerConfiguration configuration = new AnalyzerConfiguration();

    // Enable only what the filter reads; everything else is switched off
    // to keep each analysis as cheap as possible.
    configuration.setIncludeBaseForm(true);
    configuration.setIncludeBaseFormParts(filterCfg.splitCompoundWords);
    configuration.setIncludeStructure(false);
    configuration.setIncludeBasicAttributes(false);
    configuration.setIncludeFstOutput(false);
    configuration.setIncludeOrganizationNameAnalysis(false);

    return new RaudikkoTokenFilter(tokenStream, morphology.newAnalyzer(configuration), analysisCache, filterCfg);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,26 @@ public void testCompoundWordsWithHyphens() {
assertTokens("rippi-isälle", token("rippi-isälle", "rippi-isä", 1));
}

@Test
public void testCompoundWordsWithParts() {
    // With splitCompoundWords enabled, each compound token should emit the
    // whole-word base form first, followed by its parts at the same position
    // (position increment 0 marks a token stacked on the previous one).
    configuration.splitCompoundWords = true;
    assertTokens("lammasfarmeineen",
        token("lammasfarmeineen", "lammasfarmi", 1),
        token("lammasfarmeineen", "lammas", 0),
        token("lammasfarmeineen", "farmi", 0)
    );
    // Non-compound words ("tämä", "on") produce only themselves; compounds
    // ("moniosainen", "lammasfarmi") additionally produce their parts.
    assertTokens("tämä on moniosainen lammasfarmi",
        token("tämä", "tämä", 1),
        token("on", "on", 1),
        token("moniosainen", "moniosainen", 1),
        token("moniosainen", "moni", 0),
        token("moniosainen", "osa", 0),
        token("lammasfarmi", "lammasfarmi", 1),
        token("lammasfarmi", "lammas", 0),
        token("lammasfarmi", "farmi", 0)
    );
}

/**
 * Shorthand factory for an expected token: the original input text, the
 * emitted token and its position increment.
 */
private static TokenData token(String original, String token, int positionIncrement) {
    return new TokenData(original, token, positionIncrement);
}
Expand All @@ -110,7 +130,7 @@ private List<TokenData> parse(String text) {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new FinnishTokenizer();
    // Build the filter through the factory helper so the test uses the same
    // analyzer configuration (including splitCompoundWords) as production code.
    TokenStream filter = RaudikkoTokenFilterFactory.createTokenFilter(source, configuration, new AnalysisCache(100), MorphologyFactory.getInstance());
    return new TokenStreamComponents(source, filter);
}
};
Expand Down

0 comments on commit 6a1365a

Please sign in to comment.