From 0aec087cc083d07ea39802d1574b3ae2e19732d1 Mon Sep 17 00:00:00 2001 From: waziqi89 <89210409+waziqi89@users.noreply.github.com> Date: Thu, 22 Jun 2023 17:27:44 -0400 Subject: [PATCH] fvh boundary scanner customization (#580) fvh boundary scanner customization --- build.gradle | 2 +- .../main/proto/yelp/nrtsearch/search.proto | 8 + docs/highlighting.rst | 8 + grpc-gateway/luceneserver.swagger.json | 17 ++ grpc-gateway/search.pb.go | 197 +++++++++++------ .../highlights/HighlightSettings.java | 70 ++++++ .../highlights/HighlightUtils.java | 46 ++++ .../highlights/NRTFastVectorHighlighter.java | 26 ++- .../NRTFastVectorHighlighterTest.java | 199 ++++++++++++++++++ .../register_fields_highlights.json | 9 + 10 files changed, 510 insertions(+), 72 deletions(-) diff --git a/build.gradle b/build.gradle index 3c3368556..2e436bfb7 100644 --- a/build.gradle +++ b/build.gradle @@ -24,7 +24,7 @@ sourceCompatibility = 1.14 targetCompatibility = 1.14 allprojects { - version = '0.25.0' + version = '0.26.0' group = 'com.yelp.nrtsearch' } diff --git a/clientlib/src/main/proto/yelp/nrtsearch/search.proto b/clientlib/src/main/proto/yelp/nrtsearch/search.proto index 59f30fdf0..1fb05ea15 100644 --- a/clientlib/src/main/proto/yelp/nrtsearch/search.proto +++ b/clientlib/src/main/proto/yelp/nrtsearch/search.proto @@ -901,6 +901,14 @@ message Highlight { string custom_highlighter_name = 11; // Optional Custom parameters for custom highlighters. If a field overriding is present, the global setting will be omitted for this field, and no merge will happen. google.protobuf.Struct custom_highlighter_params = 12; + // Define the boundary decision when creating fragments. Options are "simple" (default in fast vector highlighter), "word" or "sentence". + google.protobuf.StringValue boundary_scanner = 13; + // Terminating chars when using "simple" boundary_scanner. The default is ".,!? \t\n". 
+ google.protobuf.StringValue boundary_chars = 14; + // Number of chars to scan before finding the boundary_chars if using "simple" boundary scanner; If "boundary_chars" is not found after max scan, fragments will start/end at the original place. Default is 20. + google.protobuf.UInt32Value boundary_max_scan = 15; + // Locale used in boundary scanner when using "word" or "sentence" boundary_scanner. Examples: "en-US", "zh-CN". + google.protobuf.StringValue boundary_scanner_locale = 16; } // Highlight settings diff --git a/docs/highlighting.rst b/docs/highlighting.rst index 8d106e86c..b4f0f3fba 100644 --- a/docs/highlighting.rst +++ b/docs/highlighting.rst @@ -66,6 +66,14 @@ This is the proto definition for Highlight message which can be specified in Sea string custom_highlighter_name = 11; // Optional Custom parameters for custom highlighters. If a field overriding is present, the global setting will be omitted for this field, and no merge will happen. google.protobuf.Struct custom_highlighter_params = 12; + // Define the boundary decision when creating fragments. Options are "simple" (default in fast vector highlighter), "word" or "sentence". + google.protobuf.StringValue boundary_scanner = 13; + // Terminating chars when using "simple" boundary_scanner. The default is ".,!? \t\n". + google.protobuf.StringValue boundary_chars = 14; + // Number of chars to scan before finding the boundary_chars if using "simple" boundary scanner; If "boundary_chars" is not found after max scan, fragments will start/end at the original place. Default is 20. + google.protobuf.UInt32Value boundary_max_scan = 15; + // Locale used in boundary scanner when using "word" or "sentence" boundary_scanner. Examples: "en-US", "zh-CN". 
+ google.protobuf.StringValue boundary_scanner_locale = 16; } // Highlight settings diff --git a/grpc-gateway/luceneserver.swagger.json b/grpc-gateway/luceneserver.swagger.json index d9aec16ff..2c7cb5caa 100644 --- a/grpc-gateway/luceneserver.swagger.json +++ b/grpc-gateway/luceneserver.swagger.json @@ -1696,6 +1696,23 @@ "custom_highlighter_params": { "type": "object", "description": "Optional Custom parameters for custom highlighters. If a field overriding is present, the global setting will be omitted for this field, and no merge will happen." + }, + "boundary_scanner": { + "type": "string", + "description": "Define the boundary decision when creating fragments. Options are \"simple\" (default in fast vector highlighter), \"word\" or \"sentence\"." + }, + "boundary_chars": { + "type": "string", + "description": "Terminating chars when using \"simple\" boundary_scanner. The default is \".,!? \\t\\n\"." + }, + "boundary_max_scan": { + "type": "integer", + "format": "int64", + "description": "Number of chars to scan before finding the boundary_chars if using \"simple\" boundary scanner; If \"boundary_chars\" is not found after max scan, fragments will start/end at the original place. Default is 20." + }, + "boundary_scanner_locale": { + "type": "string", + "description": "Locale used in boundary scanner when using \"word\" or \"sentence\" boundary_scanner. Examples: \"en-US\", \"ch-ZH\"." } } }, diff --git a/grpc-gateway/search.pb.go b/grpc-gateway/search.pb.go index 04aaa607e..ee356acad 100644 --- a/grpc-gateway/search.pb.go +++ b/grpc-gateway/search.pb.go @@ -7091,6 +7091,14 @@ type Highlight_Settings struct { CustomHighlighterName string `protobuf:"bytes,11,opt,name=custom_highlighter_name,json=customHighlighterName,proto3" json:"custom_highlighter_name,omitempty"` // Optional Custom parameters for custom highlighters. If a field overriding is present, the global setting will be omitted for this field, and no merge will happen. 
CustomHighlighterParams *structpb.Struct `protobuf:"bytes,12,opt,name=custom_highlighter_params,json=customHighlighterParams,proto3" json:"custom_highlighter_params,omitempty"` + // Define the boundary decision when creating fragments. Options are "simple" (default in fast vector highlighter), "word" or "sentence". + BoundaryScanner *wrapperspb.StringValue `protobuf:"bytes,13,opt,name=boundary_scanner,json=boundaryScanner,proto3" json:"boundary_scanner,omitempty"` + // Terminating chars when using "simple" boundary_scanner. The default is ".,!? \t\n". + BoundaryChars *wrapperspb.StringValue `protobuf:"bytes,14,opt,name=boundary_chars,json=boundaryChars,proto3" json:"boundary_chars,omitempty"` + // Number of chars to scan before finding the boundary_chars if using "simple" boundary scanner; If "boundary_chars" is not found after max scan, fragments will start/end at the original place. Default is 20. + BoundaryMaxScan *wrapperspb.UInt32Value `protobuf:"bytes,15,opt,name=boundary_max_scan,json=boundaryMaxScan,proto3" json:"boundary_max_scan,omitempty"` + // Locale used in boundary scanner when using "word" or "sentence" boundary_scanner. Examples: "en-US", "ch-ZH". 
+ BoundaryScannerLocale *wrapperspb.StringValue `protobuf:"bytes,16,opt,name=boundary_scanner_locale,json=boundaryScannerLocale,proto3" json:"boundary_scanner_locale,omitempty"` } func (x *Highlight_Settings) Reset() { @@ -7209,6 +7217,34 @@ func (x *Highlight_Settings) GetCustomHighlighterParams() *structpb.Struct { return nil } +func (x *Highlight_Settings) GetBoundaryScanner() *wrapperspb.StringValue { + if x != nil { + return x.BoundaryScanner + } + return nil +} + +func (x *Highlight_Settings) GetBoundaryChars() *wrapperspb.StringValue { + if x != nil { + return x.BoundaryChars + } + return nil +} + +func (x *Highlight_Settings) GetBoundaryMaxScan() *wrapperspb.UInt32Value { + if x != nil { + return x.BoundaryMaxScan + } + return nil +} + +func (x *Highlight_Settings) GetBoundaryScannerLocale() *wrapperspb.StringValue { + if x != nil { + return x.BoundaryScannerLocale + } + return nil +} + var File_yelp_nrtsearch_search_proto protoreflect.FileDescriptor var file_yelp_nrtsearch_search_proto_rawDesc = []byte{ @@ -8348,7 +8384,7 @@ var file_yelp_nrtsearch_search_proto_rawDesc = []byte{ 0x65, 0x79, 0x12, 0x33, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1d, 0x2e, 0x6c, 0x75, 0x63, 0x65, 0x6e, 0x65, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x43, 0x6f, 0x6c, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, - 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0xcf, 0x08, 0x0a, 0x09, + 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0xfd, 0x0a, 0x0a, 0x09, 0x48, 0x69, 0x67, 0x68, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x12, 0x3c, 0x0a, 0x08, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x20, 0x2e, 0x6c, 0x75, 0x63, 0x65, 0x6e, 0x65, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x48, 0x69, 0x67, 0x68, 0x6c, @@ -8360,7 +8396,7 @@ var file_yelp_nrtsearch_search_proto_rawDesc = []byte{ 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 
0x48, 0x69, 0x67, 0x68, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x0d, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, - 0x67, 0x73, 0x1a, 0xf7, 0x05, 0x0a, 0x08, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x12, + 0x67, 0x73, 0x1a, 0xa5, 0x08, 0x0a, 0x08, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x12, 0x47, 0x0a, 0x10, 0x68, 0x69, 0x67, 0x68, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x65, 0x72, 0x5f, 0x74, 0x79, 0x70, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x1c, 0x2e, 0x6c, 0x75, 0x63, 0x65, 0x6e, 0x65, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x48, 0x69, 0x67, 0x68, 0x6c, 0x69, 0x67, @@ -8407,67 +8443,86 @@ var file_yelp_nrtsearch_search_proto_rawDesc = []byte{ 0x72, 0x61, 0x6d, 0x73, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x17, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x53, 0x74, 0x72, 0x75, 0x63, 0x74, 0x52, 0x17, 0x63, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x48, 0x69, 0x67, 0x68, 0x6c, - 0x69, 0x67, 0x68, 0x74, 0x65, 0x72, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x1a, 0x62, 0x0a, 0x12, - 0x46, 0x69, 0x65, 0x6c, 0x64, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x45, 0x6e, 0x74, - 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, - 0x03, 0x6b, 0x65, 0x79, 0x12, 0x36, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, - 0x01, 0x28, 0x0b, 0x32, 0x20, 0x2e, 0x6c, 0x75, 0x63, 0x65, 0x6e, 0x65, 0x73, 0x65, 0x72, 0x76, - 0x65, 0x72, 0x2e, 0x48, 0x69, 0x67, 0x68, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x53, 0x65, 0x74, - 0x74, 0x69, 0x6e, 0x67, 0x73, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, - 0x22, 0x3b, 0x0a, 0x04, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x44, 0x45, 0x46, 0x41, - 0x55, 0x4c, 0x54, 0x10, 0x00, 0x12, 0x0f, 0x0a, 0x0b, 0x46, 0x41, 0x53, 0x54, 0x5f, 0x56, 0x45, - 0x43, 0x54, 0x4f, 
0x52, 0x10, 0x01, 0x12, 0x09, 0x0a, 0x05, 0x50, 0x4c, 0x41, 0x49, 0x4e, 0x10, - 0x02, 0x12, 0x0a, 0x0a, 0x06, 0x43, 0x55, 0x53, 0x54, 0x4f, 0x4d, 0x10, 0x03, 0x2a, 0x25, 0x0a, - 0x0d, 0x4d, 0x61, 0x74, 0x63, 0x68, 0x4f, 0x70, 0x65, 0x72, 0x61, 0x74, 0x6f, 0x72, 0x12, 0x0a, - 0x0a, 0x06, 0x53, 0x48, 0x4f, 0x55, 0x4c, 0x44, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x4d, 0x55, - 0x53, 0x54, 0x10, 0x01, 0x2a, 0x95, 0x01, 0x0a, 0x0d, 0x52, 0x65, 0x77, 0x72, 0x69, 0x74, 0x65, - 0x4d, 0x65, 0x74, 0x68, 0x6f, 0x64, 0x12, 0x12, 0x0a, 0x0e, 0x43, 0x4f, 0x4e, 0x53, 0x54, 0x41, - 0x4e, 0x54, 0x5f, 0x53, 0x43, 0x4f, 0x52, 0x45, 0x10, 0x00, 0x12, 0x1a, 0x0a, 0x16, 0x43, 0x4f, - 0x4e, 0x53, 0x54, 0x41, 0x4e, 0x54, 0x5f, 0x53, 0x43, 0x4f, 0x52, 0x45, 0x5f, 0x42, 0x4f, 0x4f, - 0x4c, 0x45, 0x41, 0x4e, 0x10, 0x01, 0x12, 0x13, 0x0a, 0x0f, 0x53, 0x43, 0x4f, 0x52, 0x49, 0x4e, - 0x47, 0x5f, 0x42, 0x4f, 0x4f, 0x4c, 0x45, 0x41, 0x4e, 0x10, 0x02, 0x12, 0x1b, 0x0a, 0x17, 0x54, - 0x4f, 0x50, 0x5f, 0x54, 0x45, 0x52, 0x4d, 0x53, 0x5f, 0x42, 0x4c, 0x45, 0x4e, 0x44, 0x45, 0x44, - 0x5f, 0x46, 0x52, 0x45, 0x51, 0x53, 0x10, 0x03, 0x12, 0x13, 0x0a, 0x0f, 0x54, 0x4f, 0x50, 0x5f, - 0x54, 0x45, 0x52, 0x4d, 0x53, 0x5f, 0x42, 0x4f, 0x4f, 0x53, 0x54, 0x10, 0x04, 0x12, 0x0d, 0x0a, - 0x09, 0x54, 0x4f, 0x50, 0x5f, 0x54, 0x45, 0x52, 0x4d, 0x53, 0x10, 0x05, 0x2a, 0x38, 0x0a, 0x13, - 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, - 0x79, 0x70, 0x65, 0x12, 0x10, 0x0a, 0x0c, 0x50, 0x52, 0x45, 0x46, 0x49, 0x58, 0x5f, 0x51, 0x55, - 0x45, 0x52, 0x59, 0x10, 0x00, 0x12, 0x0f, 0x0a, 0x0b, 0x46, 0x55, 0x5a, 0x5a, 0x59, 0x5f, 0x51, - 0x55, 0x45, 0x52, 0x59, 0x10, 0x01, 0x2a, 0x85, 0x03, 0x0a, 0x09, 0x51, 0x75, 0x65, 0x72, 0x79, - 0x54, 0x79, 0x70, 0x65, 0x12, 0x08, 0x0a, 0x04, 0x4e, 0x4f, 0x4e, 0x45, 0x10, 0x00, 0x12, 0x11, - 0x0a, 0x0d, 0x42, 0x4f, 0x4f, 0x4c, 0x45, 0x41, 0x4e, 0x5f, 0x51, 0x55, 0x45, 0x52, 0x59, 0x10, - 0x01, 0x12, 0x10, 0x0a, 0x0c, 0x50, 0x48, 0x52, 0x41, 
0x53, 0x45, 0x5f, 0x51, 0x55, 0x45, 0x52, - 0x59, 0x10, 0x02, 0x12, 0x18, 0x0a, 0x14, 0x46, 0x55, 0x4e, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, - 0x53, 0x43, 0x4f, 0x52, 0x45, 0x5f, 0x51, 0x55, 0x45, 0x52, 0x59, 0x10, 0x03, 0x12, 0x0e, 0x0a, - 0x0a, 0x54, 0x45, 0x52, 0x4d, 0x5f, 0x51, 0x55, 0x45, 0x52, 0x59, 0x10, 0x04, 0x12, 0x15, 0x0a, - 0x11, 0x54, 0x45, 0x52, 0x4d, 0x5f, 0x49, 0x4e, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x51, 0x55, 0x45, - 0x52, 0x59, 0x10, 0x05, 0x12, 0x13, 0x0a, 0x0f, 0x44, 0x49, 0x53, 0x4a, 0x55, 0x4e, 0x43, 0x54, - 0x49, 0x4f, 0x4e, 0x5f, 0x4d, 0x41, 0x58, 0x10, 0x06, 0x12, 0x09, 0x0a, 0x05, 0x4d, 0x41, 0x54, - 0x43, 0x48, 0x10, 0x07, 0x12, 0x10, 0x0a, 0x0c, 0x4d, 0x41, 0x54, 0x43, 0x48, 0x5f, 0x50, 0x48, - 0x52, 0x41, 0x53, 0x45, 0x10, 0x08, 0x12, 0x0f, 0x0a, 0x0b, 0x4d, 0x55, 0x4c, 0x54, 0x49, 0x5f, - 0x4d, 0x41, 0x54, 0x43, 0x48, 0x10, 0x09, 0x12, 0x09, 0x0a, 0x05, 0x52, 0x41, 0x4e, 0x47, 0x45, - 0x10, 0x0a, 0x12, 0x14, 0x0a, 0x10, 0x47, 0x45, 0x4f, 0x5f, 0x42, 0x4f, 0x55, 0x4e, 0x44, 0x49, - 0x4e, 0x47, 0x5f, 0x42, 0x4f, 0x58, 0x10, 0x0b, 0x12, 0x0d, 0x0a, 0x09, 0x47, 0x45, 0x4f, 0x5f, - 0x50, 0x4f, 0x49, 0x4e, 0x54, 0x10, 0x0c, 0x12, 0x0a, 0x0a, 0x06, 0x4e, 0x45, 0x53, 0x54, 0x45, - 0x44, 0x10, 0x0d, 0x12, 0x0a, 0x0a, 0x06, 0x45, 0x58, 0x49, 0x53, 0x54, 0x53, 0x10, 0x0e, 0x12, - 0x0e, 0x0a, 0x0a, 0x47, 0x45, 0x4f, 0x5f, 0x52, 0x41, 0x44, 0x49, 0x55, 0x53, 0x10, 0x0f, 0x12, - 0x0e, 0x0a, 0x0a, 0x43, 0x4f, 0x4d, 0x50, 0x4c, 0x45, 0x54, 0x49, 0x4f, 0x4e, 0x10, 0x10, 0x12, - 0x1e, 0x0a, 0x1a, 0x4d, 0x55, 0x4c, 0x54, 0x49, 0x5f, 0x46, 0x55, 0x4e, 0x43, 0x54, 0x49, 0x4f, - 0x4e, 0x5f, 0x53, 0x43, 0x4f, 0x52, 0x45, 0x5f, 0x51, 0x55, 0x45, 0x52, 0x59, 0x10, 0x11, 0x12, - 0x17, 0x0a, 0x13, 0x4d, 0x41, 0x54, 0x43, 0x48, 0x5f, 0x50, 0x48, 0x52, 0x41, 0x53, 0x45, 0x5f, - 0x50, 0x52, 0x45, 0x46, 0x49, 0x58, 0x10, 0x12, 0x12, 0x0a, 0x0a, 0x06, 0x50, 0x52, 0x45, 0x46, - 0x49, 0x58, 0x10, 0x13, 0x12, 0x18, 0x0a, 0x14, 0x43, 0x4f, 0x4e, 0x53, 0x54, 0x41, 0x4e, 
0x54, - 0x5f, 0x53, 0x43, 0x4f, 0x52, 0x45, 0x5f, 0x51, 0x55, 0x45, 0x52, 0x59, 0x10, 0x14, 0x2a, 0x3c, - 0x0a, 0x08, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x12, 0x07, 0x0a, 0x03, 0x4d, 0x49, - 0x4e, 0x10, 0x00, 0x12, 0x07, 0x0a, 0x03, 0x4d, 0x41, 0x58, 0x10, 0x01, 0x12, 0x0e, 0x0a, 0x0a, - 0x4d, 0x49, 0x44, 0x44, 0x4c, 0x45, 0x5f, 0x4d, 0x49, 0x4e, 0x10, 0x02, 0x12, 0x0e, 0x0a, 0x0a, - 0x4d, 0x49, 0x44, 0x44, 0x4c, 0x45, 0x5f, 0x4d, 0x41, 0x58, 0x10, 0x03, 0x42, 0x58, 0x0a, 0x1e, - 0x63, 0x6f, 0x6d, 0x2e, 0x79, 0x65, 0x6c, 0x70, 0x2e, 0x6e, 0x72, 0x74, 0x73, 0x65, 0x61, 0x72, - 0x63, 0x68, 0x2e, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x67, 0x72, 0x70, 0x63, 0x42, 0x13, - 0x53, 0x65, 0x61, 0x72, 0x63, 0x68, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x50, 0x72, - 0x6f, 0x74, 0x6f, 0x50, 0x01, 0x5a, 0x19, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, - 0x6d, 0x2f, 0x59, 0x65, 0x6c, 0x70, 0x2f, 0x6e, 0x72, 0x74, 0x73, 0x65, 0x61, 0x72, 0x63, 0x68, - 0xa2, 0x02, 0x03, 0x48, 0x4c, 0x57, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x69, 0x67, 0x68, 0x74, 0x65, 0x72, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x12, 0x47, 0x0a, 0x10, + 0x62, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x5f, 0x73, 0x63, 0x61, 0x6e, 0x6e, 0x65, 0x72, + 0x18, 0x0d, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1c, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, + 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x56, + 0x61, 0x6c, 0x75, 0x65, 0x52, 0x0f, 0x62, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x53, 0x63, + 0x61, 0x6e, 0x6e, 0x65, 0x72, 0x12, 0x43, 0x0a, 0x0e, 0x62, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, + 0x79, 0x5f, 0x63, 0x68, 0x61, 0x72, 0x73, 0x18, 0x0e, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1c, 0x2e, + 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, + 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x52, 0x0d, 0x62, 0x6f, 0x75, + 0x6e, 0x64, 0x61, 0x72, 0x79, 0x43, 0x68, 
0x61, 0x72, 0x73, 0x12, 0x48, 0x0a, 0x11, 0x62, 0x6f, + 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x5f, 0x6d, 0x61, 0x78, 0x5f, 0x73, 0x63, 0x61, 0x6e, 0x18, + 0x0f, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1c, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x55, 0x49, 0x6e, 0x74, 0x33, 0x32, 0x56, 0x61, + 0x6c, 0x75, 0x65, 0x52, 0x0f, 0x62, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x4d, 0x61, 0x78, + 0x53, 0x63, 0x61, 0x6e, 0x12, 0x54, 0x0a, 0x17, 0x62, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, + 0x5f, 0x73, 0x63, 0x61, 0x6e, 0x6e, 0x65, 0x72, 0x5f, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x65, 0x18, + 0x10, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1c, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x56, 0x61, + 0x6c, 0x75, 0x65, 0x52, 0x15, 0x62, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x53, 0x63, 0x61, + 0x6e, 0x6e, 0x65, 0x72, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x65, 0x1a, 0x62, 0x0a, 0x12, 0x46, 0x69, + 0x65, 0x6c, 0x64, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, + 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, + 0x65, 0x79, 0x12, 0x36, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, + 0x0b, 0x32, 0x20, 0x2e, 0x6c, 0x75, 0x63, 0x65, 0x6e, 0x65, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, + 0x2e, 0x48, 0x69, 0x67, 0x68, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x53, 0x65, 0x74, 0x74, 0x69, + 0x6e, 0x67, 0x73, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x3b, + 0x0a, 0x04, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x44, 0x45, 0x46, 0x41, 0x55, 0x4c, + 0x54, 0x10, 0x00, 0x12, 0x0f, 0x0a, 0x0b, 0x46, 0x41, 0x53, 0x54, 0x5f, 0x56, 0x45, 0x43, 0x54, + 0x4f, 0x52, 0x10, 0x01, 0x12, 0x09, 0x0a, 0x05, 0x50, 0x4c, 0x41, 0x49, 0x4e, 0x10, 0x02, 0x12, + 0x0a, 0x0a, 0x06, 0x43, 0x55, 0x53, 0x54, 0x4f, 0x4d, 0x10, 0x03, 0x2a, 0x25, 
0x0a, 0x0d, 0x4d, + 0x61, 0x74, 0x63, 0x68, 0x4f, 0x70, 0x65, 0x72, 0x61, 0x74, 0x6f, 0x72, 0x12, 0x0a, 0x0a, 0x06, + 0x53, 0x48, 0x4f, 0x55, 0x4c, 0x44, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x4d, 0x55, 0x53, 0x54, + 0x10, 0x01, 0x2a, 0x95, 0x01, 0x0a, 0x0d, 0x52, 0x65, 0x77, 0x72, 0x69, 0x74, 0x65, 0x4d, 0x65, + 0x74, 0x68, 0x6f, 0x64, 0x12, 0x12, 0x0a, 0x0e, 0x43, 0x4f, 0x4e, 0x53, 0x54, 0x41, 0x4e, 0x54, + 0x5f, 0x53, 0x43, 0x4f, 0x52, 0x45, 0x10, 0x00, 0x12, 0x1a, 0x0a, 0x16, 0x43, 0x4f, 0x4e, 0x53, + 0x54, 0x41, 0x4e, 0x54, 0x5f, 0x53, 0x43, 0x4f, 0x52, 0x45, 0x5f, 0x42, 0x4f, 0x4f, 0x4c, 0x45, + 0x41, 0x4e, 0x10, 0x01, 0x12, 0x13, 0x0a, 0x0f, 0x53, 0x43, 0x4f, 0x52, 0x49, 0x4e, 0x47, 0x5f, + 0x42, 0x4f, 0x4f, 0x4c, 0x45, 0x41, 0x4e, 0x10, 0x02, 0x12, 0x1b, 0x0a, 0x17, 0x54, 0x4f, 0x50, + 0x5f, 0x54, 0x45, 0x52, 0x4d, 0x53, 0x5f, 0x42, 0x4c, 0x45, 0x4e, 0x44, 0x45, 0x44, 0x5f, 0x46, + 0x52, 0x45, 0x51, 0x53, 0x10, 0x03, 0x12, 0x13, 0x0a, 0x0f, 0x54, 0x4f, 0x50, 0x5f, 0x54, 0x45, + 0x52, 0x4d, 0x53, 0x5f, 0x42, 0x4f, 0x4f, 0x53, 0x54, 0x10, 0x04, 0x12, 0x0d, 0x0a, 0x09, 0x54, + 0x4f, 0x50, 0x5f, 0x54, 0x45, 0x52, 0x4d, 0x53, 0x10, 0x05, 0x2a, 0x38, 0x0a, 0x13, 0x43, 0x6f, + 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x79, 0x70, + 0x65, 0x12, 0x10, 0x0a, 0x0c, 0x50, 0x52, 0x45, 0x46, 0x49, 0x58, 0x5f, 0x51, 0x55, 0x45, 0x52, + 0x59, 0x10, 0x00, 0x12, 0x0f, 0x0a, 0x0b, 0x46, 0x55, 0x5a, 0x5a, 0x59, 0x5f, 0x51, 0x55, 0x45, + 0x52, 0x59, 0x10, 0x01, 0x2a, 0x85, 0x03, 0x0a, 0x09, 0x51, 0x75, 0x65, 0x72, 0x79, 0x54, 0x79, + 0x70, 0x65, 0x12, 0x08, 0x0a, 0x04, 0x4e, 0x4f, 0x4e, 0x45, 0x10, 0x00, 0x12, 0x11, 0x0a, 0x0d, + 0x42, 0x4f, 0x4f, 0x4c, 0x45, 0x41, 0x4e, 0x5f, 0x51, 0x55, 0x45, 0x52, 0x59, 0x10, 0x01, 0x12, + 0x10, 0x0a, 0x0c, 0x50, 0x48, 0x52, 0x41, 0x53, 0x45, 0x5f, 0x51, 0x55, 0x45, 0x52, 0x59, 0x10, + 0x02, 0x12, 0x18, 0x0a, 0x14, 0x46, 0x55, 0x4e, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x53, 0x43, + 0x4f, 0x52, 0x45, 
0x5f, 0x51, 0x55, 0x45, 0x52, 0x59, 0x10, 0x03, 0x12, 0x0e, 0x0a, 0x0a, 0x54, + 0x45, 0x52, 0x4d, 0x5f, 0x51, 0x55, 0x45, 0x52, 0x59, 0x10, 0x04, 0x12, 0x15, 0x0a, 0x11, 0x54, + 0x45, 0x52, 0x4d, 0x5f, 0x49, 0x4e, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x51, 0x55, 0x45, 0x52, 0x59, + 0x10, 0x05, 0x12, 0x13, 0x0a, 0x0f, 0x44, 0x49, 0x53, 0x4a, 0x55, 0x4e, 0x43, 0x54, 0x49, 0x4f, + 0x4e, 0x5f, 0x4d, 0x41, 0x58, 0x10, 0x06, 0x12, 0x09, 0x0a, 0x05, 0x4d, 0x41, 0x54, 0x43, 0x48, + 0x10, 0x07, 0x12, 0x10, 0x0a, 0x0c, 0x4d, 0x41, 0x54, 0x43, 0x48, 0x5f, 0x50, 0x48, 0x52, 0x41, + 0x53, 0x45, 0x10, 0x08, 0x12, 0x0f, 0x0a, 0x0b, 0x4d, 0x55, 0x4c, 0x54, 0x49, 0x5f, 0x4d, 0x41, + 0x54, 0x43, 0x48, 0x10, 0x09, 0x12, 0x09, 0x0a, 0x05, 0x52, 0x41, 0x4e, 0x47, 0x45, 0x10, 0x0a, + 0x12, 0x14, 0x0a, 0x10, 0x47, 0x45, 0x4f, 0x5f, 0x42, 0x4f, 0x55, 0x4e, 0x44, 0x49, 0x4e, 0x47, + 0x5f, 0x42, 0x4f, 0x58, 0x10, 0x0b, 0x12, 0x0d, 0x0a, 0x09, 0x47, 0x45, 0x4f, 0x5f, 0x50, 0x4f, + 0x49, 0x4e, 0x54, 0x10, 0x0c, 0x12, 0x0a, 0x0a, 0x06, 0x4e, 0x45, 0x53, 0x54, 0x45, 0x44, 0x10, + 0x0d, 0x12, 0x0a, 0x0a, 0x06, 0x45, 0x58, 0x49, 0x53, 0x54, 0x53, 0x10, 0x0e, 0x12, 0x0e, 0x0a, + 0x0a, 0x47, 0x45, 0x4f, 0x5f, 0x52, 0x41, 0x44, 0x49, 0x55, 0x53, 0x10, 0x0f, 0x12, 0x0e, 0x0a, + 0x0a, 0x43, 0x4f, 0x4d, 0x50, 0x4c, 0x45, 0x54, 0x49, 0x4f, 0x4e, 0x10, 0x10, 0x12, 0x1e, 0x0a, + 0x1a, 0x4d, 0x55, 0x4c, 0x54, 0x49, 0x5f, 0x46, 0x55, 0x4e, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, + 0x53, 0x43, 0x4f, 0x52, 0x45, 0x5f, 0x51, 0x55, 0x45, 0x52, 0x59, 0x10, 0x11, 0x12, 0x17, 0x0a, + 0x13, 0x4d, 0x41, 0x54, 0x43, 0x48, 0x5f, 0x50, 0x48, 0x52, 0x41, 0x53, 0x45, 0x5f, 0x50, 0x52, + 0x45, 0x46, 0x49, 0x58, 0x10, 0x12, 0x12, 0x0a, 0x0a, 0x06, 0x50, 0x52, 0x45, 0x46, 0x49, 0x58, + 0x10, 0x13, 0x12, 0x18, 0x0a, 0x14, 0x43, 0x4f, 0x4e, 0x53, 0x54, 0x41, 0x4e, 0x54, 0x5f, 0x53, + 0x43, 0x4f, 0x52, 0x45, 0x5f, 0x51, 0x55, 0x45, 0x52, 0x59, 0x10, 0x14, 0x2a, 0x3c, 0x0a, 0x08, + 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x12, 
0x07, 0x0a, 0x03, 0x4d, 0x49, 0x4e, 0x10, + 0x00, 0x12, 0x07, 0x0a, 0x03, 0x4d, 0x41, 0x58, 0x10, 0x01, 0x12, 0x0e, 0x0a, 0x0a, 0x4d, 0x49, + 0x44, 0x44, 0x4c, 0x45, 0x5f, 0x4d, 0x49, 0x4e, 0x10, 0x02, 0x12, 0x0e, 0x0a, 0x0a, 0x4d, 0x49, + 0x44, 0x44, 0x4c, 0x45, 0x5f, 0x4d, 0x41, 0x58, 0x10, 0x03, 0x42, 0x58, 0x0a, 0x1e, 0x63, 0x6f, + 0x6d, 0x2e, 0x79, 0x65, 0x6c, 0x70, 0x2e, 0x6e, 0x72, 0x74, 0x73, 0x65, 0x61, 0x72, 0x63, 0x68, + 0x2e, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x67, 0x72, 0x70, 0x63, 0x42, 0x13, 0x53, 0x65, + 0x61, 0x72, 0x63, 0x68, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x50, 0x72, 0x6f, 0x74, + 0x6f, 0x50, 0x01, 0x5a, 0x19, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, + 0x59, 0x65, 0x6c, 0x70, 0x2f, 0x6e, 0x72, 0x74, 0x73, 0x65, 0x61, 0x72, 0x63, 0x68, 0xa2, 0x02, + 0x03, 0x48, 0x4c, 0x57, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( @@ -8766,12 +8821,16 @@ var file_yelp_nrtsearch_search_proto_depIdxs = []int32{ 116, // 159: luceneserver.Highlight.Settings.fragmenter:type_name -> google.protobuf.StringValue 115, // 160: luceneserver.Highlight.Settings.discrete_multivalue:type_name -> google.protobuf.BoolValue 111, // 161: luceneserver.Highlight.Settings.custom_highlighter_params:type_name -> google.protobuf.Struct - 107, // 162: luceneserver.Highlight.FieldSettingsEntry.value:type_name -> luceneserver.Highlight.Settings - 163, // [163:163] is the sub-list for method output_type - 163, // [163:163] is the sub-list for method input_type - 163, // [163:163] is the sub-list for extension type_name - 163, // [163:163] is the sub-list for extension extendee - 0, // [0:163] is the sub-list for field type_name + 116, // 162: luceneserver.Highlight.Settings.boundary_scanner:type_name -> google.protobuf.StringValue + 116, // 163: luceneserver.Highlight.Settings.boundary_chars:type_name -> google.protobuf.StringValue + 114, // 164: luceneserver.Highlight.Settings.boundary_max_scan:type_name -> 
google.protobuf.UInt32Value + 116, // 165: luceneserver.Highlight.Settings.boundary_scanner_locale:type_name -> google.protobuf.StringValue + 107, // 166: luceneserver.Highlight.FieldSettingsEntry.value:type_name -> luceneserver.Highlight.Settings + 167, // [167:167] is the sub-list for method output_type + 167, // [167:167] is the sub-list for method input_type + 167, // [167:167] is the sub-list for extension type_name + 167, // [167:167] is the sub-list for extension extendee + 0, // [0:167] is the sub-list for field type_name } func init() { file_yelp_nrtsearch_search_proto_init() } diff --git a/src/main/java/com/yelp/nrtsearch/server/luceneserver/highlights/HighlightSettings.java b/src/main/java/com/yelp/nrtsearch/server/luceneserver/highlights/HighlightSettings.java index c71619cea..eafd4dea6 100644 --- a/src/main/java/com/yelp/nrtsearch/server/luceneserver/highlights/HighlightSettings.java +++ b/src/main/java/com/yelp/nrtsearch/server/luceneserver/highlights/HighlightSettings.java @@ -16,6 +16,7 @@ package com.yelp.nrtsearch.server.luceneserver.highlights; import java.util.Arrays; +import java.util.Locale; import java.util.Map; import org.apache.lucene.search.Query; @@ -33,6 +34,10 @@ public class HighlightSettings { private final String fragmenter; private final boolean discreteMultivalue; private final Map customHighlighterParams; + private final String boundaryScanner; + private final Character[] boundaryChars; + private final int boundaryMaxScan; + private final Locale boundaryScannerLocale; public HighlightSettings( Highlighter highlighter, @@ -45,6 +50,10 @@ public HighlightSettings( boolean scoreOrdered, String fragmenter, boolean discreteMultivalue, + String boundaryScanner, + Character[] boundaryChars, + int boundaryMaxScan, + Locale boundaryScannerLocale, Map customHighlighterParams) { this.highlighter = highlighter; this.preTags = preTags; @@ -56,6 +65,10 @@ public HighlightSettings( this.scoreOrdered = scoreOrdered; this.fragmenter = fragmenter; 
this.discreteMultivalue = discreteMultivalue; + this.boundaryScanner = boundaryScanner; + this.boundaryChars = boundaryChars; + this.boundaryMaxScan = boundaryMaxScan; + this.boundaryScannerLocale = boundaryScannerLocale; this.customHighlighterParams = customHighlighterParams; } @@ -71,6 +84,10 @@ public Builder toBuilder() { .withScoreOrdered(this.scoreOrdered) .withFragmenter(this.fragmenter) .withDiscreteMultivalue(this.discreteMultivalue) + .withBoundaryScanner(this.boundaryScanner) + .withBoundaryChars(this.boundaryChars) + .withBoundaryMaxScan(this.boundaryMaxScan) + .withBoundaryScannerLocale(this.boundaryScannerLocale) .withCustomHighlighterParams(this.customHighlighterParams); } @@ -114,6 +131,22 @@ public boolean getDiscreteMultivalue() { return discreteMultivalue; } + public String getBoundaryScanner() { + return boundaryScanner; + } + + public Character[] getBoundaryChars() { + return boundaryChars; + } + + public int getBoundaryMaxScan() { + return boundaryMaxScan; + } + + public Locale getBoundaryScannerLocale() { + return boundaryScannerLocale; + } + public Map getCustomHighlighterParams() { return customHighlighterParams; } @@ -144,6 +177,15 @@ public String toString() { + discreteMultivalue + ", customHighlighterParams=" + customHighlighterParams + + ", boundaryScanner='" + + boundaryScanner + + '\'' + + ", boundaryChars=" + + Arrays.toString(boundaryChars) + + ", boundaryCharsMaxScan=" + + boundaryMaxScan + + ", boundaryScannerLocale=" + + boundaryScannerLocale.toLanguageTag() + '}'; } @@ -159,6 +201,10 @@ public static final class Builder { private boolean scoreOrdered; private String fragmenter; private boolean discreteMultivalue; + private String boundaryScanner; + private Character[] boundaryChars; + private int boundaryMaxScan; + private Locale boundaryScannerLocale; private Map customHighlighterParams; public Builder() {} @@ -213,6 +259,26 @@ public Builder withDiscreteMultivalue(boolean discreteMultivalue) { return this; } + public Builder 
withBoundaryScanner(String boundaryScanner) { + this.boundaryScanner = boundaryScanner; + return this; + } + + public Builder withBoundaryChars(Character[] boundaryChars) { + this.boundaryChars = boundaryChars; + return this; + } + + public Builder withBoundaryMaxScan(int boundaryMaxScan) { + this.boundaryMaxScan = boundaryMaxScan; + return this; + } + + public Builder withBoundaryScannerLocale(Locale boundaryScannerLocale) { + this.boundaryScannerLocale = boundaryScannerLocale; + return this; + } + public Builder withCustomHighlighterParams(Map customHighlighterParams) { this.customHighlighterParams = customHighlighterParams; return this; @@ -230,6 +296,10 @@ public HighlightSettings build() { scoreOrdered, fragmenter, discreteMultivalue, + boundaryScanner, + boundaryChars, + boundaryMaxScan, + boundaryScannerLocale, customHighlighterParams); } } diff --git a/src/main/java/com/yelp/nrtsearch/server/luceneserver/highlights/HighlightUtils.java b/src/main/java/com/yelp/nrtsearch/server/luceneserver/highlights/HighlightUtils.java index 49d5819dc..51d50e894 100644 --- a/src/main/java/com/yelp/nrtsearch/server/luceneserver/highlights/HighlightUtils.java +++ b/src/main/java/com/yelp/nrtsearch/server/luceneserver/highlights/HighlightUtils.java @@ -24,8 +24,10 @@ import com.yelp.nrtsearch.server.utils.StructValueTransformer; import java.util.Collections; import java.util.HashMap; +import java.util.Locale; import java.util.Map; import org.apache.lucene.search.Query; +import org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner; /** Helper class to create {@link HighlightSettings} from a search request. 
*/ public class HighlightUtils { @@ -40,6 +42,10 @@ public class HighlightUtils { private static final boolean DEFAULT_SCORE_ORDERED = true; private static final boolean DEFAULT_FIELD_MATCH = false; private static final boolean DEFAULT_DISCRETE_MULTIVALUE = false; + private static final Character[] DEFAULT_BOUNDARY_CHARS = + SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS; + private static final int DEFAULT_BOUNDARY_MAX_SCAN = SimpleBoundaryScanner.DEFAULT_MAX_SCAN; + private static final Locale DEFAULT_BOUNDARY_SCANNER_LOCALE = Locale.ROOT; private static final QueryNodeMapper QUERY_NODE_MAPPER = QueryNodeMapper.getInstance(); /** @@ -105,6 +111,27 @@ static Map createPerFieldSettings( settings.hasDiscreteMultivalue() ? settings.getDiscreteMultivalue().getValue() : globalSettings.getDiscreteMultivalue()) + .withBoundaryScanner( + settings.hasBoundaryScanner() + ? settings.getBoundaryScanner().getValue() + : globalSettings.getBoundaryScanner()) + .withBoundaryChars( + settings.hasBoundaryChars() && !settings.getBoundaryChars().getValue().isEmpty() + ? settings + .getBoundaryChars() + .getValue() + .chars() + .mapToObj(c -> Character.valueOf((char) c)) + .toArray(Character[]::new) + : globalSettings.getBoundaryChars()) + .withBoundaryMaxScan( + settings.hasBoundaryMaxScan() + ? settings.getBoundaryMaxScan().getValue() + : globalSettings.getBoundaryMaxScan()) + .withBoundaryScannerLocale( + settings.hasBoundaryScannerLocale() + ? Locale.forLanguageTag(settings.getBoundaryScannerLocale().getValue()) + : globalSettings.getBoundaryScannerLocale()) .withCustomHighlighterParams( settings.hasCustomHighlighterParams() ? StructValueTransformer.transformStruct( @@ -166,6 +193,25 @@ private static HighlightSettings createGlobalFieldSettings( settings.hasFragmentSize() ? settings.getFragmentSize().getValue() : DEFAULT_FRAGMENT_SIZE) + .withBoundaryScanner( + settings.hasBoundaryScanner() ? 
settings.getBoundaryScanner().getValue() : null) + .withBoundaryChars( + settings.hasBoundaryChars() && !settings.getBoundaryChars().getValue().isEmpty() + ? settings + .getBoundaryChars() + .getValue() + .chars() + .mapToObj(c -> Character.valueOf((char) c)) + .toArray(Character[]::new) + : DEFAULT_BOUNDARY_CHARS) + .withBoundaryMaxScan( + settings.hasBoundaryMaxScan() + ? settings.getBoundaryMaxScan().getValue() + : DEFAULT_BOUNDARY_MAX_SCAN) + .withBoundaryScannerLocale( + settings.hasBoundaryScannerLocale() + ? Locale.forLanguageTag(settings.getBoundaryScannerLocale().getValue()) + : DEFAULT_BOUNDARY_SCANNER_LOCALE) .withCustomHighlighterParams( settings.hasCustomHighlighterParams() ? StructValueTransformer.transformStruct(settings.getCustomHighlighterParams()) diff --git a/src/main/java/com/yelp/nrtsearch/server/luceneserver/highlights/NRTFastVectorHighlighter.java b/src/main/java/com/yelp/nrtsearch/server/luceneserver/highlights/NRTFastVectorHighlighter.java index b57c29916..b26d26f24 100644 --- a/src/main/java/com/yelp/nrtsearch/server/luceneserver/highlights/NRTFastVectorHighlighter.java +++ b/src/main/java/com/yelp/nrtsearch/server/luceneserver/highlights/NRTFastVectorHighlighter.java @@ -18,15 +18,19 @@ import com.yelp.nrtsearch.server.luceneserver.field.TextBaseFieldDef; import com.yelp.nrtsearch.server.luceneserver.search.SearchContext; import java.io.IOException; +import java.text.BreakIterator; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.Query; import org.apache.lucene.search.highlight.DefaultEncoder; import org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder; +import org.apache.lucene.search.vectorhighlight.BoundaryScanner; +import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner; import org.apache.lucene.search.vectorhighlight.FieldQuery; import 
org.apache.lucene.search.vectorhighlight.FragListBuilder; import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder; +import org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner; import org.apache.lucene.search.vectorhighlight.SimpleFragListBuilder; import org.apache.lucene.search.vectorhighlight.SimpleFragmentsBuilder; import org.apache.lucene.search.vectorhighlight.SingleFragListBuilder; @@ -97,11 +101,29 @@ public String[] getHighlights( fragListBuilder = SIMPLE_FRAG_LIST_BUILDER; } + BoundaryScanner boundaryScanner; + if (settings.getBoundaryScanner() == null + || settings.getBoundaryScanner().equalsIgnoreCase("simple")) { + boundaryScanner = + new SimpleBoundaryScanner(settings.getBoundaryMaxScan(), settings.getBoundaryChars()); + } else if (settings.getBoundaryScanner().equalsIgnoreCase("word")) { + boundaryScanner = + new BreakIteratorBoundaryScanner( + BreakIterator.getWordInstance(settings.getBoundaryScannerLocale())); + } else if (settings.getBoundaryScanner().equalsIgnoreCase("sentence")) { + boundaryScanner = + new BreakIteratorBoundaryScanner( + BreakIterator.getSentenceInstance(settings.getBoundaryScannerLocale())); + } else { + throw new IllegalArgumentException( + "Unknown boundary scanner: " + settings.getBoundaryScanner()); + } + BaseFragmentsBuilder fragmentsBuilder; if (settings.isScoreOrdered()) { - fragmentsBuilder = new ScoreOrderFragmentsBuilder(); + fragmentsBuilder = new ScoreOrderFragmentsBuilder(boundaryScanner); } else { - fragmentsBuilder = new SimpleFragmentsBuilder(); + fragmentsBuilder = new SimpleFragmentsBuilder(boundaryScanner); } fragmentsBuilder.setDiscreteMultiValueHighlighting(settings.getDiscreteMultivalue()); diff --git a/src/test/java/com/yelp/nrtsearch/server/luceneserver/highlights/NRTFastVectorHighlighterTest.java b/src/test/java/com/yelp/nrtsearch/server/luceneserver/highlights/NRTFastVectorHighlighterTest.java index 377efb2f7..de02c5fb1 100644 --- 
a/src/test/java/com/yelp/nrtsearch/server/luceneserver/highlights/NRTFastVectorHighlighterTest.java +++ b/src/test/java/com/yelp/nrtsearch/server/luceneserver/highlights/NRTFastVectorHighlighterTest.java @@ -20,6 +20,7 @@ import static org.assertj.core.api.Assertions.fail; import com.google.protobuf.BoolValue; +import com.google.protobuf.StringValue; import com.google.protobuf.UInt32Value; import com.yelp.nrtsearch.server.grpc.AddDocumentRequest; import com.yelp.nrtsearch.server.grpc.AddDocumentRequest.MultiValuedField; @@ -33,6 +34,7 @@ import com.yelp.nrtsearch.server.grpc.SearchRequest; import com.yelp.nrtsearch.server.grpc.SearchResponse; import com.yelp.nrtsearch.server.grpc.SearchResponse.Hit; +import com.yelp.nrtsearch.server.grpc.TermQuery; import com.yelp.nrtsearch.server.luceneserver.ServerTestCase; import io.grpc.StatusRuntimeException; import io.grpc.testing.GrpcCleanupRule; @@ -79,6 +81,12 @@ protected void initIndex(String name) throws Exception { "I personally don't like the staff at this place", "Not all food are good.")) .build()) + .putFields( + "boundary_scanner_field", + MultiValuedField.newBuilder() + .addValue( + "This is a super longWordICouldEverImagineAndTheBoundaryScannerShouldProperlyHandle-it-in a very decent way and stops at.") + .build()) .build(); docs.add(request); request = @@ -449,6 +457,197 @@ public void testMaxFragmentSize() { .containsExactly("High quality food. 
Fresh and delicious!"); } + @Test + public void testBasicHighlightWithExplicitBoundaryScanner() { + Highlight highlight = + Highlight.newBuilder() + .addFields("comment") + .setSettings( + Settings.newBuilder() + .setScoreOrdered(BoolValue.of(true)) + .setBoundaryScanner(StringValue.of("simple"))) + .build(); + SearchResponse response = doHighlightQuery(highlight); + + assertFields(response); + + assertThat(response.getHits(0).getHighlightsMap().get("comment").getFragments(0)) + .isEqualTo("the food here is amazing, service was good"); + assertThat(response.getHits(1).getHighlightsMap().get("comment").getFragments(0)) + .isEqualTo( + "restaurant. The food here is pretty good, the service could be better. My favorite food was chilly chicken"); + assertThat(response.getDiagnostics().getHighlightTimeMs()).isGreaterThan(0); + } + + @Test + public void testBasicHighlightWithWrongBoundaryScanner() { + Highlight highlight = + Highlight.newBuilder() + .addFields("comment") + .setSettings( + Settings.newBuilder() + .setScoreOrdered(BoolValue.of(true)) + .setBoundaryScanner(StringValue.of("doesnt_exist"))) + .build(); + + assertThatThrownBy(() -> doHighlightQuery(highlight)) + .isInstanceOf(StatusRuntimeException.class) + .hasMessageContaining("Unknown boundary scanner"); + } + + @Test + public void testBasicHighlightWithCustomCharsBoundaryScanner() { + Highlight highlight = + Highlight.newBuilder() + .addFields("boundary_scanner_field") + .setSettings( + Settings.newBuilder() + .setHighlightQuery( + Query.newBuilder() + .setTermQuery( + TermQuery.newBuilder() + .setField("boundary_scanner_field") + .setTextValue("super") + .build()) + .build()) + .setScoreOrdered(BoolValue.of(true)) + .setBoundaryScanner(StringValue.of("simple")) + .setFragmentSize(UInt32Value.of(75)) + .setBoundaryChars(StringValue.of("-"))) + .build(); + SearchResponse response = doHighlightQuery(highlight); + + assertFields(response); + + 
assertThat(response.getHits(0).getHighlightsMap().get("boundary_scanner_field").getFragments(0)) + .isEqualTo( + "This is a super longWordICouldEverImagineAndTheBoundaryScannerShouldProperlyHandle"); + assertThat(response.getHits(1).getHighlightsCount()).isEqualTo(0); + assertThat(response.getDiagnostics().getHighlightTimeMs()).isGreaterThan(0); + } + + @Test + public void testBasicHighlightWithBoundaryScannerAndMaxScan() { + Highlight highlight = + Highlight.newBuilder() + .addFields("boundary_scanner_field") + .setSettings( + Settings.newBuilder() + .setHighlightQuery( + Query.newBuilder() + .setTermQuery( + TermQuery.newBuilder() + .setField("boundary_scanner_field") + .setTextValue("super") + .build()) + .build()) + .setScoreOrdered(BoolValue.of(true)) + .setBoundaryScanner(StringValue.of("simple")) + .setFragmentSize(UInt32Value.of(75)) + .setBoundaryMaxScan(UInt32Value.of(100))) + .build(); + SearchResponse response = doHighlightQuery(highlight); + + assertFields(response); + + assertThat(response.getHits(0).getHighlightsMap().get("boundary_scanner_field").getFragments(0)) + .isEqualTo( + "This is a super longWordICouldEverImagineAndTheBoundaryScannerShouldProperlyHandle-it-in"); + assertThat(response.getHits(1).getHighlightsCount()).isEqualTo(0); + assertThat(response.getDiagnostics().getHighlightTimeMs()).isGreaterThan(0); + } + + @Test + public void testBasicHighlightWithWordBoundaryScanner() { + Highlight highlight = + Highlight.newBuilder() + .addFields("boundary_scanner_field") + .setSettings( + Settings.newBuilder() + .setHighlightQuery( + Query.newBuilder() + .setTermQuery( + TermQuery.newBuilder() + .setField("boundary_scanner_field") + .setTextValue("super") + .build()) + .build()) + .setScoreOrdered(BoolValue.of(true)) + .setBoundaryScanner(StringValue.of("word")) + .setFragmentSize(UInt32Value.of(75))) + .build(); + SearchResponse response = doHighlightQuery(highlight); + + assertFields(response); + + 
assertThat(response.getHits(0).getHighlightsMap().get("boundary_scanner_field").getFragments(0)) + .isEqualTo( + "This is a super longWordICouldEverImagineAndTheBoundaryScannerShouldProperlyHandle-it-in"); + assertThat(response.getHits(1).getHighlightsCount()).isEqualTo(0); + assertThat(response.getDiagnostics().getHighlightTimeMs()).isGreaterThan(0); + } + + @Test + public void testBasicHighlightWithSentenceBoundaryScanner() { + Highlight highlight = + Highlight.newBuilder() + .addFields("boundary_scanner_field") + .setSettings( + Settings.newBuilder() + .setHighlightQuery( + Query.newBuilder() + .setTermQuery( + TermQuery.newBuilder() + .setField("boundary_scanner_field") + .setTextValue("super") + .build()) + .build()) + .setScoreOrdered(BoolValue.of(true)) + .setBoundaryScanner(StringValue.of("sentence")) + .setFragmentSize(UInt32Value.of(75))) + .build(); + SearchResponse response = doHighlightQuery(highlight); + + assertFields(response); + + assertThat(response.getHits(0).getHighlightsMap().get("boundary_scanner_field").getFragments(0)) + .isEqualTo( + "This is a super longWordICouldEverImagineAndTheBoundaryScannerShouldProperlyHandle-it-in a very decent way and stops at. 
"); + assertThat(response.getHits(1).getHighlightsCount()).isEqualTo(0); + assertThat(response.getDiagnostics().getHighlightTimeMs()).isGreaterThan(0); + } + + @Test + public void testBasicHighlightWithSentenceBoundaryScannerAndExplicitLocale() { + Highlight highlight = + Highlight.newBuilder() + .addFields("boundary_scanner_field") + .setSettings( + Settings.newBuilder() + .setHighlightQuery( + Query.newBuilder() + .setTermQuery( + TermQuery.newBuilder() + .setField("boundary_scanner_field") + .setTextValue("super") + .build()) + .build()) + .setScoreOrdered(BoolValue.of(true)) + .setBoundaryScanner(StringValue.of("sentence")) + .setBoundaryScannerLocale(StringValue.of("en-US")) + .setFragmentSize(UInt32Value.of(75))) + .build(); + SearchResponse response = doHighlightQuery(highlight); + + assertFields(response); + + assertThat(response.getHits(0).getHighlightsMap().get("boundary_scanner_field").getFragments(0)) + .isEqualTo( + "This is a super longWordICouldEverImagineAndTheBoundaryScannerShouldProperlyHandle-it-in a very decent way and stops at. "); + assertThat(response.getHits(1).getHighlightsCount()).isEqualTo(0); + assertThat(response.getDiagnostics().getHighlightTimeMs()).isGreaterThan(0); + } + private String indexName() { return getIndices().get(0); } diff --git a/src/test/resources/highlights/register_fields_highlights.json b/src/test/resources/highlights/register_fields_highlights.json index e1fe4cd8e..467ed09d8 100644 --- a/src/test/resources/highlights/register_fields_highlights.json +++ b/src/test/resources/highlights/register_fields_highlights.json @@ -60,6 +60,15 @@ "storeDocValues": true, "multiValued": true, "termVectors": "TERMS_POSITIONS_OFFSETS" + }, + { + "name": "boundary_scanner_field", + "type": "TEXT", + "search": true, + "store": true, + "tokenize": true, + "storeDocValues": true, + "termVectors": "TERMS_POSITIONS_OFFSETS" } ] }