diff --git a/go/mysql/icuregex/compiler.go b/go/mysql/icuregex/compiler.go index 5efbea8654d..971cd439fb3 100644 --- a/go/mysql/icuregex/compiler.go +++ b/go/mysql/icuregex/compiler.go @@ -185,7 +185,7 @@ func (c *compiler) nextChar(ch *reChar) { // // We are in free-spacing and comments mode. // Scan through any white space and comments, until we - // reach a significant character or the end of inut. + // reach a significant character or the end of input. for { if ch.char == -1 { break // End of Input @@ -2049,7 +2049,7 @@ func (c *compiler) matchStartType() { currentLen = safeIncrement(currentLen, 1) atStart = false - case urxBackslashX, // Grahpeme Cluster. Minimum is 1, max unbounded. + case urxBackslashX, // Grapheme Cluster. Minimum is 1, max unbounded. urxDotanyAll, // . matches one or two. urxDotany, urxDotanyUnix: @@ -2893,7 +2893,7 @@ func (c *compiler) minMatchLength(start, end int) int32 { urxBackslashR, urxBackslashV, urcOnecharI, - urxBackslashX, // Grahpeme Cluster. Minimum is 1, max unbounded. + urxBackslashX, // Grapheme Cluster. Minimum is 1, max unbounded. urxDotanyAll, // . matches one or two. urxDotany, urxDotanyUnix: @@ -2983,7 +2983,7 @@ func (c *compiler) minMatchLength(start, end int) int32 { loc++ op = c.out.compiledPat[loc] if op.typ() == urxLaStart { - // The boilerplate for look-ahead includes two LA_END insturctions, + // The boilerplate for look-ahead includes two LA_END instructions, // Depth will be decremented by each one when it is seen. depth += 2 } @@ -3086,7 +3086,7 @@ func (c *compiler) maxMatchLength(start, end int) int32 { // Call the max length unbounded, and stop further checking. case urxBackref, // BackRef. Must assume that it might be a zero length match urxBackrefI, - urxBackslashX: // Grahpeme Cluster. Minimum is 1, max unbounded. + urxBackslashX: // Grapheme Cluster. Minimum is 1, max unbounded. currentLen = math.MaxInt32 // Ops that match a max of one character (possibly two 16 bit code units.) diff --git a/go/mysql/icuregex/error.go b/go/mysql/icuregex/error.go index 219ddcf602b..39c92399aa9 100644 --- a/go/mysql/icuregex/error.go +++ b/go/mysql/icuregex/error.go @@ -90,6 +90,8 @@ func (e *MatchError) Error() string { out.WriteString("Stack overflow") case TimeOut: out.WriteString("Timeout") + case InternalMatchError: + out.WriteString("Internal error") } input := e.Input @@ -144,6 +146,7 @@ const ( type MatchErrorCode int32 const ( - StackOverflow MatchErrorCode = iota /**< Regular expression backtrack stack overflow. */ - TimeOut /**< Maximum allowed match time exceeded */ + StackOverflow MatchErrorCode = iota /**< Regular expression backtrack stack overflow. */ + TimeOut /**< Maximum allowed match time exceeded */ + InternalMatchError /**< Internal error (bug) was detected. */ ) diff --git a/go/mysql/icuregex/icu_test.go b/go/mysql/icuregex/icu_test.go index 42c98dde5db..9e9be505df7 100644 --- a/go/mysql/icuregex/icu_test.go +++ b/go/mysql/icuregex/icu_test.go @@ -181,9 +181,7 @@ func (tp *TestPattern) parseMatch(orig string) error { func ParseTestFile(t testing.TB, filename string) []TestPattern { f, err := os.Open(filename) - if err != nil { - t.Fatalf("failed to open test data: %v", err) - } + require.NoError(t, err) defer f.Close() scanner := bufio.NewScanner(f) @@ -229,9 +227,8 @@ func ParseTestFile(t testing.TB, filename string) []TestPattern { patterns = append(patterns, tp) } - if err := scanner.Err(); err != nil { - t.Fatal(err) - } + err = scanner.Err() + require.NoError(t, err) return patterns } @@ -394,9 +391,7 @@ func TestCornerCases(t *testing.T) { for _, tc := range cases { t.Run(tc.Pattern, func(t *testing.T) { _, err := icuregex.CompileString(tc.Pattern, tc.Flags) - if err != nil { - t.Fatal(err) - } + require.NoError(t, err) }) } } @@ -407,9 +402,7 @@ func TestOne(t *testing.T) { const Flags = 0 re, err := icuregex.CompileString(Pattern, Flags) - if err != nil { - t.Fatalf("compilation failed: %v", err) - } + require.NoError(t, err) re.Dump(os.Stderr) diff --git a/go/mysql/icuregex/internal/icudata/embed.go b/go/mysql/icuregex/internal/icudata/embed.go index bc3b62b5db6..12dbd5d0322 100644 --- a/go/mysql/icuregex/internal/icudata/embed.go +++ b/go/mysql/icuregex/internal/icudata/embed.go @@ -42,6 +42,11 @@ var UBidi []byte //go:embed ucase.icu var UCase []byte +// UEmoji is the list of Emoji properties. +// +//go:embed uemoji.icu +var UEmoji []byte + // ULayout is used for property checks agains the InPC, InSC // and VO properties. // diff --git a/go/mysql/icuregex/internal/icudata/nfc.nrm b/go/mysql/icuregex/internal/icudata/nfc.nrm index a1254c0aa75..2b0e972807e 100644 Binary files a/go/mysql/icuregex/internal/icudata/nfc.nrm and b/go/mysql/icuregex/internal/icudata/nfc.nrm differ diff --git a/go/mysql/icuregex/internal/icudata/nfkc.nrm b/go/mysql/icuregex/internal/icudata/nfkc.nrm index 2e6e3dda074..deffa3daa81 100644 Binary files a/go/mysql/icuregex/internal/icudata/nfkc.nrm and b/go/mysql/icuregex/internal/icudata/nfkc.nrm differ diff --git a/go/mysql/icuregex/internal/icudata/nfkc_cf.nrm b/go/mysql/icuregex/internal/icudata/nfkc_cf.nrm index a3a40833a91..3f8d756a0f4 100644 Binary files a/go/mysql/icuregex/internal/icudata/nfkc_cf.nrm and b/go/mysql/icuregex/internal/icudata/nfkc_cf.nrm differ diff --git a/go/mysql/icuregex/internal/icudata/pnames.icu b/go/mysql/icuregex/internal/icudata/pnames.icu index 58af6c0157a..c960dc00b49 100644 Binary files a/go/mysql/icuregex/internal/icudata/pnames.icu and b/go/mysql/icuregex/internal/icudata/pnames.icu differ diff --git a/go/mysql/icuregex/internal/icudata/ubidi.icu b/go/mysql/icuregex/internal/icudata/ubidi.icu index bc85f3d3502..cfde07406cc 100644 Binary files a/go/mysql/icuregex/internal/icudata/ubidi.icu and b/go/mysql/icuregex/internal/icudata/ubidi.icu differ diff --git a/go/mysql/icuregex/internal/icudata/ucase.icu b/go/mysql/icuregex/internal/icudata/ucase.icu index 011e6053f79..670b0827d55 100644 Binary files a/go/mysql/icuregex/internal/icudata/ucase.icu and b/go/mysql/icuregex/internal/icudata/ucase.icu differ diff --git a/go/mysql/icuregex/internal/icudata/uemoji.icu b/go/mysql/icuregex/internal/icudata/uemoji.icu new file mode 100644 index 00000000000..11fdf50ff18 Binary files /dev/null and b/go/mysql/icuregex/internal/icudata/uemoji.icu differ diff --git a/go/mysql/icuregex/internal/icudata/ulayout.icu b/go/mysql/icuregex/internal/icudata/ulayout.icu index 598d347cc1e..ca6d0013c08 100644 Binary files a/go/mysql/icuregex/internal/icudata/ulayout.icu and b/go/mysql/icuregex/internal/icudata/ulayout.icu differ diff --git a/go/mysql/icuregex/internal/icudata/unames.icu b/go/mysql/icuregex/internal/icudata/unames.icu index 55a2267fd5b..e271e78619f 100644 Binary files a/go/mysql/icuregex/internal/icudata/unames.icu and b/go/mysql/icuregex/internal/icudata/unames.icu differ diff --git a/go/mysql/icuregex/internal/icudata/uprops.icu b/go/mysql/icuregex/internal/icudata/uprops.icu index 245db9a0584..0cdd8dea636 100644 Binary files a/go/mysql/icuregex/internal/icudata/uprops.icu and b/go/mysql/icuregex/internal/icudata/uprops.icu differ diff --git a/go/mysql/icuregex/internal/ucase/loader.go b/go/mysql/icuregex/internal/ucase/loader.go index 83a6b6c59a7..2ac25cc0f6f 100644 --- a/go/mysql/icuregex/internal/ucase/loader.go +++ b/go/mysql/icuregex/internal/ucase/loader.go @@ -34,7 +34,6 @@ var ucaseOnce sync.Once var ucase struct { trie *utrie.UTrie2 exceptions []uint16 - unfold []uint16 } func trie() *utrie.UTrie2 { @@ -47,11 +46,6 @@ func exceptions() []uint16 { return ucase.exceptions } -func unfold() []uint16 { - loadUCase() - return ucase.unfold -} - func loadUCase() { ucaseOnce.Do(func() { b := udata.NewBytes(icudata.UCase) @@ -102,9 +96,6 @@ func readData(bytes *udata.Bytes) error { if n := indexes[ixExcLength]; n > 0 { ucase.exceptions = bytes.Uint16Slice(n) } - if n := indexes[ixUnfoldLength]; n > 0 { - ucase.unfold = bytes.Uint16Slice(n) - } return nil } diff --git a/go/mysql/icuregex/internal/uemoji/loader.go b/go/mysql/icuregex/internal/uemoji/loader.go new file mode 100644 index 00000000000..7015491d069 --- /dev/null +++ b/go/mysql/icuregex/internal/uemoji/loader.go @@ -0,0 +1,69 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uemoji + +import ( + "sync" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +var uemojiOnce sync.Once +var uemoji struct { + trie *utrie.UcpTrie +} + +func loadUEmoji() { + uemojiOnce.Do(func() { + b := udata.NewBytes(icudata.UEmoji) + if err := readData(b); err != nil { + panic(err) + } + }) +} + +func trie() *utrie.UcpTrie { + loadUEmoji() + return uemoji.trie +} + +func readData(bytes *udata.Bytes) error { + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.DataFormat[0] == 0x45 && + info.DataFormat[1] == 0x6d && + info.DataFormat[2] == 0x6f && + info.DataFormat[3] == 0x6a && + info.FormatVersion[0] == 1 + }) + if err != nil { + return err + } + + bytes.Skip(bytes.Int32() - 4) + uemoji.trie, err = utrie.UcpTrieFromBytes(bytes) + if err != nil { + return err + } + return nil +} diff --git a/go/mysql/icuregex/internal/uemoji/uemoji.go b/go/mysql/icuregex/internal/uemoji/uemoji.go new file mode 100644 index 00000000000..5cc89acd69a --- /dev/null +++ b/go/mysql/icuregex/internal/uemoji/uemoji.go @@ -0,0 +1,82 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uemoji + +import ( + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +type propertySet interface { + AddRune(ch rune) + AddRuneRange(from rune, to rune) +} + +func AddPropertyStarts(sa propertySet) { + // Add the start code point of each same-value range of the trie. + var start, end rune + for { + end, _ = trie().GetRange(start, utrie.UcpMapRangeNormal, 0, nil) + if end < 0 { + break + } + sa.AddRune(start) + start = end + 1 + } +} + +const ( + bitEmoji = 0 + bitEmojiPresentation = 1 + bitEmojiModifier = 2 + bitEmojiModifierBase = 3 + bitEmojiComponent = 4 + bitExtendedPictographic = 5 + bitBasicEmoji = 6 +) + +// Note: REGIONAL_INDICATOR is a single, hardcoded range implemented elsewhere. +var bitFlags = []int8{ + bitEmoji, + bitEmojiPresentation, + bitEmojiModifier, + bitEmojiModifierBase, + bitEmojiComponent, + -1, + -1, + bitExtendedPictographic, + bitBasicEmoji, + -1, + -1, + -1, + -1, + -1, + bitBasicEmoji, +} + +func HasBinaryProperty(c rune, which int) bool { + bit := bitFlags[which] + if bit < 0 { + return false // not a property that we support in this function + } + bits := trie().Get(c) + return ((bits >> bit) & 1) != 0 +} diff --git a/go/mysql/icuregex/internal/uprops/constants.go b/go/mysql/icuregex/internal/uprops/constants.go index 3cfe250599a..4cdf1ef8a0b 100644 --- a/go/mysql/icuregex/internal/uprops/constants.go +++ b/go/mysql/icuregex/internal/uprops/constants.go @@ -315,6 +315,56 @@ const ( */ UCharExtendedPictographic Property = 64 + /** + * Binary property of strings Basic_Emoji. + * See https://www.unicode.org/reports/tr51/#Emoji_Sets + * + * @stable ICU 70 + */ + UCharBasicEmoji Property = 65 + /** + * Binary property of strings Emoji_Keycap_Sequence. + * See https://www.unicode.org/reports/tr51/#Emoji_Sets + * + * @stable ICU 70 + */ + UCharEmojiKeycapSequence Property = 66 + /** + * Binary property of strings RGI_Emoji_Modifier_Sequence. + * See https://www.unicode.org/reports/tr51/#Emoji_Sets + * + * @stable ICU 70 + */ + UCharRgiEmojiModifierSequence Property = 67 + /** + * Binary property of strings RGI_Emoji_Flag_Sequence. + * See https://www.unicode.org/reports/tr51/#Emoji_Sets + * + * @stable ICU 70 + */ + UCharRgiEmojiFlagSequence Property = 68 + /** + * Binary property of strings RGI_Emoji_Tag_Sequence. + * See https://www.unicode.org/reports/tr51/#Emoji_Sets + * + * @stable ICU 70 + */ + UCharRgiEmojiTagSequence Property = 69 + /** + * Binary property of strings RGI_Emoji_ZWJ_Sequence. + * See https://www.unicode.org/reports/tr51/#Emoji_Sets + * + * @stable ICU 70 + */ + UCharRgiEmojiZwjSequence Property = 70 + /** + * Binary property of strings RGI_Emoji. + * See https://www.unicode.org/reports/tr51/#Emoji_Sets + * + * @stable ICU 70 + */ + UCharRgiEmoji Property = 71 + /** Enumerated property Bidi_Class. Same as u_charDirection, returns UCharDirection values. @stable ICU 2.2 */ UCharBidiClass Property = 0x1000 @@ -492,7 +542,7 @@ const ( ) const ( - uCharBinaryLimit = 65 + uCharBinaryLimit = 72 uCharIntLimit = 0x1019 uCharMaskLimit = 0x2001 uCharStringLimit = 0x400E @@ -595,6 +645,7 @@ const ( srcInpc srcInsc srcVo + srcEmoji ) const ( diff --git a/go/mysql/icuregex/internal/uprops/properties.go b/go/mysql/icuregex/internal/uprops/properties.go index d951cdc117a..954fc920f6c 100644 --- a/go/mysql/icuregex/internal/uprops/properties.go +++ b/go/mysql/icuregex/internal/uprops/properties.go @@ -32,6 +32,7 @@ import ( "vitess.io/vitess/go/mysql/icuregex/internal/ubidi" "vitess.io/vitess/go/mysql/icuregex/internal/ucase" "vitess.io/vitess/go/mysql/icuregex/internal/uchar" + "vitess.io/vitess/go/mysql/icuregex/internal/uemoji" "vitess.io/vitess/go/mysql/icuregex/internal/ulayout" "vitess.io/vitess/go/mysql/icuregex/internal/unames" "vitess.io/vitess/go/mysql/icuregex/internal/uset" @@ -74,6 +75,8 @@ func getInclusionsForSource(src propertySource) (*uset.UnicodeSet, error) { ubidi.AddPropertyStarts(u) case srcInpc, srcInsc, srcVo: AddULayoutPropertyStarts(src, u) + case srcEmoji: + uemoji.AddPropertyStarts(u) default: return nil, errors.ErrUnsupported } diff --git a/go/mysql/icuregex/internal/uprops/uprops_binary.go b/go/mysql/icuregex/internal/uprops/uprops_binary.go index c85a24346c6..5d4aaaec1b5 100644 --- a/go/mysql/icuregex/internal/uprops/uprops_binary.go +++ b/go/mysql/icuregex/internal/uprops/uprops_binary.go @@ -28,6 +28,7 @@ import ( "vitess.io/vitess/go/mysql/icuregex/internal/ubidi" "vitess.io/vitess/go/mysql/icuregex/internal/ucase" "vitess.io/vitess/go/mysql/icuregex/internal/uchar" + "vitess.io/vitess/go/mysql/icuregex/internal/uemoji" ) type binaryProperty struct { @@ -108,14 +109,21 @@ var binProps = [uCharBinaryLimit]*binaryProperty{ {srcCaseAndNorm, 0, changesWhenCasefolded}, {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_CASEMAPPED {srcNfkcCf, 0, nil}, // Changes_When_NFKC_Casefolded is currently unsupported - {2, uchar.Mask(p2Emoji), defaultContains}, - {2, uchar.Mask(p2EmojiPresentation), defaultContains}, - {2, uchar.Mask(p2EmojiModifier), defaultContains}, - {2, uchar.Mask(p2EmojiModifierBase), defaultContains}, - {2, uchar.Mask(p2EmojiComponent), defaultContains}, + {srcEmoji, 0, hasEmojiProperty}, // UCHAR_EMOJI + {srcEmoji, 0, hasEmojiProperty}, // UCHAR_EMOJI_PRESENTATION + {srcEmoji, 0, hasEmojiProperty}, // UCHAR_EMOJI_MODIFIER + {srcEmoji, 0, hasEmojiProperty}, // UCHAR_EMOJI_MODIFIER_BASE + {srcEmoji, 0, hasEmojiProperty}, // UCHAR_EMOJI_COMPONENT {2, 0, isRegionalIndicator}, {1, uchar.Mask(pPrependedConcatenationMark), defaultContains}, - {2, uchar.Mask(p2ExtendedPictographic), defaultContains}, + {srcEmoji, 0, hasEmojiProperty}, // UCHAR_EXTENDED_PICTOGRAPHIC + {srcEmoji, 0, hasEmojiProperty}, // UCHAR_BASIC_EMOJI + {srcEmoji, 0, hasEmojiProperty}, // UCHAR_EMOJI_KEYCAP_SEQUENCE + {srcEmoji, 0, hasEmojiProperty}, // UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE + {srcEmoji, 0, hasEmojiProperty}, // UCHAR_RGI_EMOJI_FLAG_SEQUENCE + {srcEmoji, 0, hasEmojiProperty}, // UCHAR_RGI_EMOJI_TAG_SEQUENCE + {srcEmoji, 0, hasEmojiProperty}, // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE + {srcEmoji, 0, hasEmojiProperty}, // UCHAR_RGI_EMOJI } func isBidiControl(_ *binaryProperty, c rune, _ Property) bool { @@ -232,3 +240,10 @@ func HasBinaryProperty(c rune, which Property) bool { } return prop.contains(prop, c, which) } + +func hasEmojiProperty(_ *binaryProperty, c rune, which Property) bool { + if which < UCharEmoji || UCharRgiEmoji < which { + return false + } + return uemoji.HasBinaryProperty(c, int(which-UCharEmoji)) +} diff --git a/go/mysql/icuregex/matcher.go b/go/mysql/icuregex/matcher.go index 11fbc152d73..1b5495f495f 100644 --- a/go/mysql/icuregex/matcher.go +++ b/go/mysql/icuregex/matcher.go @@ -1194,7 +1194,15 @@ func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { default: // Trouble. The compiled pattern contains an entry with an // unrecognized type tag. - panic("unreachable") + // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have + // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT. + // See ICU-21669. + return &MatchError{ + Code: InternalMatchError, + Pattern: m.pattern.pattern, + Position: *fp.inputIdx(), + Input: m.input, + } } } @@ -1549,7 +1557,15 @@ func (m *Matcher) Find() (bool, error) { } } default: - panic("unreachable") + // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But + // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT. + // See ICU-21669. + return false, &MatchError{ + Code: InternalMatchError, + Pattern: m.pattern.pattern, + Position: startPos, + Input: m.input, + } } } diff --git a/go/mysql/icuregex/pattern.go b/go/mysql/icuregex/pattern.go index f0823a213d4..90e69b3f55d 100644 --- a/go/mysql/icuregex/pattern.go +++ b/go/mysql/icuregex/pattern.go @@ -59,14 +59,6 @@ func NewPattern(flags RegexpFlag) *Pattern { } } -func MustCompileString(in string, flags RegexpFlag) *Pattern { - pat, err := CompileString(in, flags) - if err != nil { - panic(err) - } - return pat -} - func Compile(in []rune, flags RegexpFlag) (*Pattern, error) { pat := NewPattern(flags) cmp := newCompiler(pat) @@ -77,12 +69,7 @@ func Compile(in []rune, flags RegexpFlag) (*Pattern, error) { } func CompileString(in string, flags RegexpFlag) (*Pattern, error) { - pat := NewPattern(flags) - cmp := newCompiler(pat) - if err := cmp.compile([]rune(in)); err != nil { - return nil, err - } - return pat, nil + return Compile([]rune(in), flags) } func (p *Pattern) Match(input string) *Matcher { diff --git a/go/mysql/icuregex/perl_test.go b/go/mysql/icuregex/perl_test.go index 0e7beda9fbd..e8dfc95d6b0 100644 --- a/go/mysql/icuregex/perl_test.go +++ b/go/mysql/icuregex/perl_test.go @@ -27,22 +27,25 @@ import ( "strconv" "strings" "testing" + + "github.com/stretchr/testify/require" ) func TestPerl(t *testing.T) { f, err := os.Open("testdata/re_tests.txt") - if err != nil { - t.Fatalf("failed to open test data: %v", err) - } + require.NoError(t, err) defer f.Close() - flagPat := MustCompileString(`('?)(.*)\1(.*)`, 0) + flagPat, err := CompileString(`('?)(.*)\1(.*)`, 0) + require.NoError(t, err) flagMat := NewMatcher(flagPat) - groupsPat := MustCompileString(`\$([+\-])\[(\d+)\]`, 0) + groupsPat, err := CompileString(`\$([+\-])\[(\d+)\]`, 0) + require.NoError(t, err) groupsMat := NewMatcher(groupsPat) - cgPat := MustCompileString(`\$(\d+)`, 0) + cgPat, err := CompileString(`\$(\d+)`, 0) + require.NoError(t, err) cgMat := NewMatcher(cgPat) group := func(m *Matcher, idx int) string { @@ -52,9 +55,7 @@ func TestPerl(t *testing.T) { lookingAt := func(m *Matcher) bool { ok, err := m.LookingAt() - if err != nil { - t.Fatalf("failed to match with LookingAt(): %v", err) - } + require.NoError(t, err) return ok } @@ -73,9 +74,7 @@ func TestPerl(t *testing.T) { flagMat.ResetString(fields[0]) ok, _ := flagMat.Matches() - if !ok { - t.Fatalf("could not match pattern+flags (line %d)", lineno) - } + require.Truef(t, ok, "could not match pattern+flags (line %d)", lineno) pattern, _ := flagMat.Group(2) pattern = replacer.Replace(pattern) @@ -142,9 +141,7 @@ func TestPerl(t *testing.T) { case lookingAt(groupsMat): groupNum, err := strconv.ParseInt(group(groupsMat, 2), 10, 32) - if err != nil { - t.Fatalf("failed to parse Perl pattern: %v", err) - } + require.NoError(t, err) var matchPosition int if group(groupsMat, 1) == "+" { @@ -160,9 +157,7 @@ func TestPerl(t *testing.T) { case lookingAt(cgMat): groupNum, err := strconv.ParseInt(group(cgMat, 1), 10, 32) - if err != nil { - t.Fatalf("failed to parse Perl pattern: %v", err) - } + require.NoError(t, err) result = append(result, group(testMat, int(groupNum))...) perlExpr = perlExpr[cgMat.EndForGroup(0):] diff --git a/go/mysql/icuregex/sets_test.go b/go/mysql/icuregex/sets_test.go index d33552732f2..58da9882701 100644 --- a/go/mysql/icuregex/sets_test.go +++ b/go/mysql/icuregex/sets_test.go @@ -23,18 +23,20 @@ package icuregex import ( "testing" + + "github.com/stretchr/testify/assert" ) func TestStaticSetContents(t *testing.T) { - // These are the number of codepoints contained in each of the static sets as of ICU69-1, + // These are the number of codepoints contained in each of the static sets as of ICU73-2, // as to sanity check that we're re-creating the sets properly. // This table must be re-created when updating Unicode versions. var ExpectedSetSizes = map[int]int{ - 1: 134564, + 1: 139612, 4: 25, - 5: 1102451, - 6: 1979, - 7: 131, + 5: 1102442, + 6: 2125, + 7: 140, 8: 125, 9: 399, 10: 10773, @@ -43,9 +45,7 @@ func TestStaticSetContents(t *testing.T) { } for setid, expected := range ExpectedSetSizes { - if got := staticPropertySets[setid].Len(); got != expected { - t.Fatalf("static set [%d] has wrong size: got %d, expected %d", setid, got, expected) - } + assert.Equalf(t, expected, staticPropertySets[setid].Len(), "static set [%d] has wrong size", setid) } } diff --git a/go/mysql/icuregex/testdata/regextst_extended.txt b/go/mysql/icuregex/testdata/regextst_extended.txt index 841e5e46092..c6b567931e3 100644 --- a/go/mysql/icuregex/testdata/regextst_extended.txt +++ b/go/mysql/icuregex/testdata/regextst_extended.txt @@ -123,4 +123,6 @@ # Currently unsupported property classes below. They require # significant additional code to support. "\p{Changes_When_NFKC_Casefolded}" E "foo<0>\uFB03bar" -"\p{Segment_Starter}" E "<0>\uFB03Goodbye" \ No newline at end of file +"\p{Segment_Starter}" E "<0>\uFB03Goodbye" + +"\p{Emoji}" "foo<0>😀bar" \ No newline at end of file