Skip to content

Commit

Permalink
fvh boundary scanner customization (#580)
Browse files Browse the repository at this point in the history
 fvh boundary scanner customization
  • Loading branch information
waziqi89 authored Jun 22, 2023
1 parent 26d68fa commit 0aec087
Show file tree
Hide file tree
Showing 10 changed files with 510 additions and 72 deletions.
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ sourceCompatibility = 1.14
targetCompatibility = 1.14

allprojects {
version = '0.25.0'
version = '0.26.0'
group = 'com.yelp.nrtsearch'
}

Expand Down
8 changes: 8 additions & 0 deletions clientlib/src/main/proto/yelp/nrtsearch/search.proto
Original file line number Diff line number Diff line change
Expand Up @@ -901,6 +901,14 @@ message Highlight {
string custom_highlighter_name = 11;
// Optional Custom parameters for custom highlighters. If a field overriding is present, the global setting will be omitted for this field, and no merge will happen.
google.protobuf.Struct custom_highlighter_params = 12;
// Define the boundary decision when creating fragments. Options are "simple" (default in fast vector highlighter), "word" or "sentence".
google.protobuf.StringValue boundary_scanner = 13;
// Terminating chars when using "simple" boundary_scanner. The default is ".,!? \t\n".
google.protobuf.StringValue boundary_chars = 14;
// Number of chars to scan before finding the boundary_chars if using "simple" boundary scanner; If "boundary_chars" is not found after max scan, fragments will start/end at the original place. Default is 20.
google.protobuf.UInt32Value boundary_max_scan = 15;
// Locale used in boundary scanner when using "word" or "sentence" boundary_scanner. Examples: "en-US", "zh-CN".
google.protobuf.StringValue boundary_scanner_locale = 16;
}

// Highlight settings
Expand Down
8 changes: 8 additions & 0 deletions docs/highlighting.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,14 @@ This is the proto definition for Highlight message which can be specified in Sea
string custom_highlighter_name = 11;
// Optional Custom parameters for custom highlighters. If a field overriding is present, the global setting will be omitted for this field, and no merge will happen.
google.protobuf.Struct custom_highlighter_params = 12;
// Define the boundary decision when creating fragments. Options are "simple" (default in fast vector highlighter), "word" or "sentence".
google.protobuf.StringValue boundary_scanner = 13;
// Terminating chars when using "simple" boundary_scanner. The default is ".,!? \t\n".
google.protobuf.StringValue boundary_chars = 14;
// Number of chars to scan before finding the boundary_chars if using "simple" boundary scanner; If "boundary_chars" is not found after max scan, fragments will start/end at the original place. Default is 20.
google.protobuf.UInt32Value boundary_max_scan = 15;
// Locale used in boundary scanner when using "word" or "sentence" boundary_scanner. Examples: "en-US", "zh-CN".
google.protobuf.StringValue boundary_scanner_locale = 16;
}
// Highlight settings
Expand Down
17 changes: 17 additions & 0 deletions grpc-gateway/luceneserver.swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -1696,6 +1696,23 @@
"custom_highlighter_params": {
"type": "object",
"description": "Optional Custom parameters for custom highlighters. If a field overriding is present, the global setting will be omitted for this field, and no merge will happen."
},
"boundary_scanner": {
"type": "string",
"description": "Define the boundary decision when creating fragments. Options are \"simple\" (default in fast vector highlighter), \"word\" or \"sentence\"."
},
"boundary_chars": {
"type": "string",
"description": "Terminating chars when using \"simple\" boundary_scanner. The default is \".,!? \\t\\n\"."
},
"boundary_max_scan": {
"type": "integer",
"format": "int64",
"description": "Number of chars to scan before finding the boundary_chars if using \"simple\" boundary scanner; If \"boundary_chars\" is not found after max scan, fragments will start/end at the original place. Default is 20."
},
"boundary_scanner_locale": {
"type": "string",
"description": "Locale used in boundary scanner when using \"word\" or \"sentence\" boundary_scanner. Examples: \"en-US\", \"zh-CN\"."
}
}
},
Expand Down
197 changes: 128 additions & 69 deletions grpc-gateway/search.pb.go

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
package com.yelp.nrtsearch.server.luceneserver.highlights;

import java.util.Arrays;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.search.Query;

Expand All @@ -33,6 +34,10 @@ public class HighlightSettings {
private final String fragmenter;
private final boolean discreteMultivalue;
private final Map<String, Object> customHighlighterParams;
private final String boundaryScanner;
private final Character[] boundaryChars;
private final int boundaryMaxScan;
private final Locale boundaryScannerLocale;

public HighlightSettings(
Highlighter highlighter,
Expand All @@ -45,6 +50,10 @@ public HighlightSettings(
boolean scoreOrdered,
String fragmenter,
boolean discreteMultivalue,
String boundaryScanner,
Character[] boundaryChars,
int boundaryMaxScan,
Locale boundaryScannerLocale,
Map<String, Object> customHighlighterParams) {
this.highlighter = highlighter;
this.preTags = preTags;
Expand All @@ -56,6 +65,10 @@ public HighlightSettings(
this.scoreOrdered = scoreOrdered;
this.fragmenter = fragmenter;
this.discreteMultivalue = discreteMultivalue;
this.boundaryScanner = boundaryScanner;
this.boundaryChars = boundaryChars;
this.boundaryMaxScan = boundaryMaxScan;
this.boundaryScannerLocale = boundaryScannerLocale;
this.customHighlighterParams = customHighlighterParams;
}

Expand All @@ -71,6 +84,10 @@ public Builder toBuilder() {
.withScoreOrdered(this.scoreOrdered)
.withFragmenter(this.fragmenter)
.withDiscreteMultivalue(this.discreteMultivalue)
.withBoundaryScanner(this.boundaryScanner)
.withBoundaryChars(this.boundaryChars)
.withBoundaryMaxScan(this.boundaryMaxScan)
.withBoundaryScannerLocale(this.boundaryScannerLocale)
.withCustomHighlighterParams(this.customHighlighterParams);
}

Expand Down Expand Up @@ -114,6 +131,22 @@ public boolean getDiscreteMultivalue() {
return discreteMultivalue;
}

/**
 * Returns the boundary scanner name held by these settings.
 *
 * @return the configured boundary scanner name; may be {@code null} when none was set
 */
public String getBoundaryScanner() {
  return this.boundaryScanner;
}

/**
 * Returns the boundary characters held by these settings.
 *
 * <p>The stored array itself is returned (no copy is made), so callers should treat it as
 * read-only.
 *
 * @return the configured boundary characters
 */
public Character[] getBoundaryChars() {
  return this.boundaryChars;
}

/**
 * Returns the boundary max-scan value held by these settings.
 *
 * @return the configured boundary max scan
 */
public int getBoundaryMaxScan() {
  return this.boundaryMaxScan;
}

/**
 * Returns the locale held by these settings for the boundary scanner.
 *
 * @return the configured boundary scanner locale
 */
public Locale getBoundaryScannerLocale() {
  return this.boundaryScannerLocale;
}

public Map<String, Object> getCustomHighlighterParams() {
return customHighlighterParams;
}
Expand Down Expand Up @@ -144,6 +177,15 @@ public String toString() {
+ discreteMultivalue
+ ", customHighlighterParams="
+ customHighlighterParams
+ ", boundaryScanner='"
+ boundaryScanner
+ '\''
+ ", boundaryChars="
+ Arrays.toString(boundaryChars)
+ ", boundaryCharsMaxScan="
+ boundaryMaxScan
+ ", boundaryScannerLocale="
+ boundaryScannerLocale.toLanguageTag()
+ '}';
}

Expand All @@ -159,6 +201,10 @@ public static final class Builder {
private boolean scoreOrdered;
private String fragmenter;
private boolean discreteMultivalue;
private String boundaryScanner;
private Character[] boundaryChars;
private int boundaryMaxScan;
private Locale boundaryScannerLocale;
private Map<String, Object> customHighlighterParams;

public Builder() {}
Expand Down Expand Up @@ -213,6 +259,26 @@ public Builder withDiscreteMultivalue(boolean discreteMultivalue) {
return this;
}

/**
 * Sets the boundary scanner name to use for the settings being built.
 *
 * @param boundaryScanner boundary scanner name to store
 * @return this builder, to allow call chaining
 */
public Builder withBoundaryScanner(String boundaryScanner) {
this.boundaryScanner = boundaryScanner;
return this;
}

/**
 * Sets the boundary characters to use for the settings being built.
 *
 * <p>The array reference is stored as given; no copy is made.
 *
 * @param boundaryChars boundary characters to store
 * @return this builder, to allow call chaining
 */
public Builder withBoundaryChars(Character[] boundaryChars) {
this.boundaryChars = boundaryChars;
return this;
}

/**
 * Sets the boundary max-scan value to use for the settings being built.
 *
 * @param boundaryMaxScan boundary max scan to store
 * @return this builder, to allow call chaining
 */
public Builder withBoundaryMaxScan(int boundaryMaxScan) {
this.boundaryMaxScan = boundaryMaxScan;
return this;
}

/**
 * Sets the boundary scanner locale to use for the settings being built.
 *
 * @param boundaryScannerLocale locale to store
 * @return this builder, to allow call chaining
 */
public Builder withBoundaryScannerLocale(Locale boundaryScannerLocale) {
this.boundaryScannerLocale = boundaryScannerLocale;
return this;
}

public Builder withCustomHighlighterParams(Map<String, Object> customHighlighterParams) {
this.customHighlighterParams = customHighlighterParams;
return this;
Expand All @@ -230,6 +296,10 @@ public HighlightSettings build() {
scoreOrdered,
fragmenter,
discreteMultivalue,
boundaryScanner,
boundaryChars,
boundaryMaxScan,
boundaryScannerLocale,
customHighlighterParams);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@
import com.yelp.nrtsearch.server.utils.StructValueTransformer;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner;

/** Helper class to create {@link HighlightSettings} from a search request. */
public class HighlightUtils {
Expand All @@ -40,6 +42,10 @@ public class HighlightUtils {
private static final boolean DEFAULT_SCORE_ORDERED = true;
private static final boolean DEFAULT_FIELD_MATCH = false;
private static final boolean DEFAULT_DISCRETE_MULTIVALUE = false;
private static final Character[] DEFAULT_BOUNDARY_CHARS =
SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS;
private static final int DEFAULT_BOUNDARY_MAX_SCAN = SimpleBoundaryScanner.DEFAULT_MAX_SCAN;
private static final Locale DEFAULT_BOUNDARY_SCANNER_LOCALE = Locale.ROOT;
private static final QueryNodeMapper QUERY_NODE_MAPPER = QueryNodeMapper.getInstance();

/**
Expand Down Expand Up @@ -105,6 +111,27 @@ static Map<String, HighlightSettings> createPerFieldSettings(
settings.hasDiscreteMultivalue()
? settings.getDiscreteMultivalue().getValue()
: globalSettings.getDiscreteMultivalue())
.withBoundaryScanner(
settings.hasBoundaryScanner()
? settings.getBoundaryScanner().getValue()
: globalSettings.getBoundaryScanner())
.withBoundaryChars(
settings.hasBoundaryChars() && !settings.getBoundaryChars().getValue().isEmpty()
? settings
.getBoundaryChars()
.getValue()
.chars()
.mapToObj(c -> Character.valueOf((char) c))
.toArray(Character[]::new)
: globalSettings.getBoundaryChars())
.withBoundaryMaxScan(
settings.hasBoundaryMaxScan()
? settings.getBoundaryMaxScan().getValue()
: globalSettings.getBoundaryMaxScan())
.withBoundaryScannerLocale(
settings.hasBoundaryScannerLocale()
? Locale.forLanguageTag(settings.getBoundaryScannerLocale().getValue())
: globalSettings.getBoundaryScannerLocale())
.withCustomHighlighterParams(
settings.hasCustomHighlighterParams()
? StructValueTransformer.transformStruct(
Expand Down Expand Up @@ -166,6 +193,25 @@ private static HighlightSettings createGlobalFieldSettings(
settings.hasFragmentSize()
? settings.getFragmentSize().getValue()
: DEFAULT_FRAGMENT_SIZE)
.withBoundaryScanner(
settings.hasBoundaryScanner() ? settings.getBoundaryScanner().getValue() : null)
.withBoundaryChars(
settings.hasBoundaryChars() && !settings.getBoundaryChars().getValue().isEmpty()
? settings
.getBoundaryChars()
.getValue()
.chars()
.mapToObj(c -> Character.valueOf((char) c))
.toArray(Character[]::new)
: DEFAULT_BOUNDARY_CHARS)
.withBoundaryMaxScan(
settings.hasBoundaryMaxScan()
? settings.getBoundaryMaxScan().getValue()
: DEFAULT_BOUNDARY_MAX_SCAN)
.withBoundaryScannerLocale(
settings.hasBoundaryScannerLocale()
? Locale.forLanguageTag(settings.getBoundaryScannerLocale().getValue())
: DEFAULT_BOUNDARY_SCANNER_LOCALE)
.withCustomHighlighterParams(
settings.hasCustomHighlighterParams()
? StructValueTransformer.transformStruct(settings.getCustomHighlighterParams())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,19 @@
import com.yelp.nrtsearch.server.luceneserver.field.TextBaseFieldDef;
import com.yelp.nrtsearch.server.luceneserver.search.SearchContext;
import java.io.IOException;
import java.text.BreakIterator;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.FragListBuilder;
import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner;
import org.apache.lucene.search.vectorhighlight.SimpleFragListBuilder;
import org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.SingleFragListBuilder;
Expand Down Expand Up @@ -97,11 +101,29 @@ public String[] getHighlights(
fragListBuilder = SIMPLE_FRAG_LIST_BUILDER;
}

BoundaryScanner boundaryScanner;
if (settings.getBoundaryScanner() == null
|| settings.getBoundaryScanner().equalsIgnoreCase("simple")) {
boundaryScanner =
new SimpleBoundaryScanner(settings.getBoundaryMaxScan(), settings.getBoundaryChars());
} else if (settings.getBoundaryScanner().equalsIgnoreCase("word")) {
boundaryScanner =
new BreakIteratorBoundaryScanner(
BreakIterator.getWordInstance(settings.getBoundaryScannerLocale()));
} else if (settings.getBoundaryScanner().equalsIgnoreCase("sentence")) {
boundaryScanner =
new BreakIteratorBoundaryScanner(
BreakIterator.getSentenceInstance(settings.getBoundaryScannerLocale()));
} else {
throw new IllegalArgumentException(
"Unknown boundary scanner: " + settings.getBoundaryScanner());
}

BaseFragmentsBuilder fragmentsBuilder;
if (settings.isScoreOrdered()) {
fragmentsBuilder = new ScoreOrderFragmentsBuilder();
fragmentsBuilder = new ScoreOrderFragmentsBuilder(boundaryScanner);
} else {
fragmentsBuilder = new SimpleFragmentsBuilder();
fragmentsBuilder = new SimpleFragmentsBuilder(boundaryScanner);
}
fragmentsBuilder.setDiscreteMultiValueHighlighting(settings.getDiscreteMultivalue());

Expand Down
Loading

0 comments on commit 0aec087

Please sign in to comment.