Skip to content

Commit

Permalink
icuregex: Update to ICU 73 (vitessio#13912)
Browse files Browse the repository at this point in the history
Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>
  • Loading branch information
dbussink authored Sep 4, 2023
1 parent 1126480 commit 245670f
Show file tree
Hide file tree
Showing 25 changed files with 290 additions and 78 deletions.
10 changes: 5 additions & 5 deletions go/mysql/icuregex/compiler.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ func (c *compiler) nextChar(ch *reChar) {
//
// We are in free-spacing and comments mode.
// Scan through any white space and comments, until we
// reach a significant character or the end of inut.
// reach a significant character or the end of input.
for {
if ch.char == -1 {
break // End of Input
Expand Down Expand Up @@ -2049,7 +2049,7 @@ func (c *compiler) matchStartType() {
currentLen = safeIncrement(currentLen, 1)
atStart = false

case urxBackslashX, // Grahpeme Cluster. Minimum is 1, max unbounded.
case urxBackslashX, // Grapheme Cluster. Minimum is 1, max unbounded.
urxDotanyAll, // . matches one or two.
urxDotany,
urxDotanyUnix:
Expand Down Expand Up @@ -2893,7 +2893,7 @@ func (c *compiler) minMatchLength(start, end int) int32 {
urxBackslashR,
urxBackslashV,
urcOnecharI,
urxBackslashX, // Grahpeme Cluster. Minimum is 1, max unbounded.
urxBackslashX, // Grapheme Cluster. Minimum is 1, max unbounded.
urxDotanyAll, // . matches one or two.
urxDotany,
urxDotanyUnix:
Expand Down Expand Up @@ -2983,7 +2983,7 @@ func (c *compiler) minMatchLength(start, end int) int32 {
loc++
op = c.out.compiledPat[loc]
if op.typ() == urxLaStart {
// The boilerplate for look-ahead includes two LA_END insturctions,
// The boilerplate for look-ahead includes two LA_END instructions,
// Depth will be decremented by each one when it is seen.
depth += 2
}
Expand Down Expand Up @@ -3086,7 +3086,7 @@ func (c *compiler) maxMatchLength(start, end int) int32 {
// Call the max length unbounded, and stop further checking.
case urxBackref, // BackRef. Must assume that it might be a zero length match
urxBackrefI,
urxBackslashX: // Grahpeme Cluster. Minimum is 1, max unbounded.
urxBackslashX: // Grapheme Cluster. Minimum is 1, max unbounded.
currentLen = math.MaxInt32

// Ops that match a max of one character (possibly two 16 bit code units.)
Expand Down
7 changes: 5 additions & 2 deletions go/mysql/icuregex/error.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ func (e *MatchError) Error() string {
out.WriteString("Stack overflow")
case TimeOut:
out.WriteString("Timeout")
case InternalMatchError:
out.WriteString("Internal error")
}

input := e.Input
Expand Down Expand Up @@ -144,6 +146,7 @@ const (
type MatchErrorCode int32

const (
StackOverflow MatchErrorCode = iota /**< Regular expression backtrack stack overflow. */
TimeOut /**< Maximum allowed match time exceeded */
StackOverflow MatchErrorCode = iota /**< Regular expression backtrack stack overflow. */
TimeOut /**< Maximum allowed match time exceeded */
InternalMatchError /**< Internal error (bug) was detected. */
)
17 changes: 5 additions & 12 deletions go/mysql/icuregex/icu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,9 +181,7 @@ func (tp *TestPattern) parseMatch(orig string) error {

func ParseTestFile(t testing.TB, filename string) []TestPattern {
f, err := os.Open(filename)
if err != nil {
t.Fatalf("failed to open test data: %v", err)
}
require.NoError(t, err)

defer f.Close()
scanner := bufio.NewScanner(f)
Expand Down Expand Up @@ -229,9 +227,8 @@ func ParseTestFile(t testing.TB, filename string) []TestPattern {
patterns = append(patterns, tp)
}

if err := scanner.Err(); err != nil {
t.Fatal(err)
}
err = scanner.Err()
require.NoError(t, err)
return patterns
}

Expand Down Expand Up @@ -394,9 +391,7 @@ func TestCornerCases(t *testing.T) {
for _, tc := range cases {
t.Run(tc.Pattern, func(t *testing.T) {
_, err := icuregex.CompileString(tc.Pattern, tc.Flags)
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
})
}
}
Expand All @@ -407,9 +402,7 @@ func TestOne(t *testing.T) {
const Flags = 0

re, err := icuregex.CompileString(Pattern, Flags)
if err != nil {
t.Fatalf("compilation failed: %v", err)
}
require.NoError(t, err)

re.Dump(os.Stderr)

Expand Down
5 changes: 5 additions & 0 deletions go/mysql/icuregex/internal/icudata/embed.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ var UBidi []byte
//go:embed ucase.icu
var UCase []byte

// UEmoji is the list of Emoji properties.
//
//go:embed uemoji.icu
var UEmoji []byte

// ULayout is used for property checks against the InPC, InSC
// and VO properties.
//
Expand Down
Binary file modified go/mysql/icuregex/internal/icudata/nfc.nrm
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/nfkc.nrm
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/nfkc_cf.nrm
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/pnames.icu
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/ubidi.icu
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/ucase.icu
Binary file not shown.
Binary file added go/mysql/icuregex/internal/icudata/uemoji.icu
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/ulayout.icu
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/unames.icu
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/uprops.icu
Binary file not shown.
9 changes: 0 additions & 9 deletions go/mysql/icuregex/internal/ucase/loader.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ var ucaseOnce sync.Once
var ucase struct {
trie *utrie.UTrie2
exceptions []uint16
unfold []uint16
}

func trie() *utrie.UTrie2 {
Expand All @@ -47,11 +46,6 @@ func exceptions() []uint16 {
return ucase.exceptions
}

func unfold() []uint16 {
loadUCase()
return ucase.unfold
}

func loadUCase() {
ucaseOnce.Do(func() {
b := udata.NewBytes(icudata.UCase)
Expand Down Expand Up @@ -102,9 +96,6 @@ func readData(bytes *udata.Bytes) error {
if n := indexes[ixExcLength]; n > 0 {
ucase.exceptions = bytes.Uint16Slice(n)
}
if n := indexes[ixUnfoldLength]; n > 0 {
ucase.unfold = bytes.Uint16Slice(n)
}

return nil
}
69 changes: 69 additions & 0 deletions go/mysql/icuregex/internal/uemoji/loader.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
© 2016 and later: Unicode, Inc. and others.
Copyright (C) 2004-2015, International Business Machines Corporation and others.
Copyright 2023 The Vitess Authors.
This file contains code derived from the Unicode Project's ICU library.
License & terms of use for the original code: http://www.unicode.org/copyright.html
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package uemoji

import (
"sync"

"vitess.io/vitess/go/mysql/icuregex/internal/icudata"
"vitess.io/vitess/go/mysql/icuregex/internal/udata"
"vitess.io/vitess/go/mysql/icuregex/internal/utrie"
)

// uemojiOnce guards the one-time decode performed by loadUEmoji.
var uemojiOnce sync.Once
// uemoji holds the package-level state decoded from the embedded emoji data.
var uemoji struct {
// trie is assigned by readData from the result of utrie.UcpTrieFromBytes.
trie *utrie.UcpTrie
}

// loadUEmoji decodes the embedded emoji data (icudata.UEmoji) into the
// package-level uemoji state. The decode is wrapped in uemojiOnce, so only
// the first call does the work. A decode failure panics with the error
// returned by readData.
func loadUEmoji() {
	uemojiOnce.Do(func() {
		err := readData(udata.NewBytes(icudata.UEmoji))
		if err != nil {
			panic(err)
		}
	})
}

// trie returns the emoji property trie, calling loadUEmoji first so the
// embedded data has been decoded before the trie is handed out.
func trie() *utrie.UcpTrie {
loadUEmoji()
return uemoji.trie
}

// readData parses the emoji data file held in bytes and stores the decoded
// code point trie in uemoji.trie.
//
// The header is accepted only when its data format is the four bytes "Emoj"
// and the first format version byte is 1. Any error from reading the header
// or from decoding the trie is returned unchanged.
func readData(bytes *udata.Bytes) error {
	acceptHeader := func(info *udata.DataInfo) bool {
		return info.DataFormat[0] == 'E' &&
			info.DataFormat[1] == 'm' &&
			info.DataFormat[2] == 'o' &&
			info.DataFormat[3] == 'j' &&
			info.FormatVersion[0] == 1
	}
	if err := bytes.ReadHeader(acceptHeader); err != nil {
		return err
	}

	// The first int32 after the header gives how far to skip before the trie
	// begins; the 4 bytes just consumed by reading it are subtracted.
	// NOTE(review): presumably this is the trie's byte offset from the start
	// of the data section — confirm against the file format.
	trieOffset := bytes.Int32()
	bytes.Skip(trieOffset - 4)

	var err error
	uemoji.trie, err = utrie.UcpTrieFromBytes(bytes)
	return err
}
82 changes: 82 additions & 0 deletions go/mysql/icuregex/internal/uemoji/uemoji.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
© 2016 and later: Unicode, Inc. and others.
Copyright (C) 2004-2015, International Business Machines Corporation and others.
Copyright 2023 The Vitess Authors.
This file contains code derived from the Unicode Project's ICU library.
License & terms of use for the original code: http://www.unicode.org/copyright.html
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package uemoji

import (
"vitess.io/vitess/go/mysql/icuregex/internal/utrie"
)

// propertySet is the set-building interface this package expects from its
// caller. AddPropertyStarts only calls AddRune on it.
type propertySet interface {
// AddRune adds the single code point ch to the set.
AddRune(ch rune)
// AddRuneRange adds the code points between from and to.
// NOTE(review): presumably inclusive of both ends — confirm against the implementation.
AddRuneRange(from rune, to rune)
}

// AddPropertyStarts adds to sa the first code point of every same-value range
// in the emoji trie. It walks the trie range by range starting at code point
// 0 and stops as soon as GetRange reports a negative range end.
func AddPropertyStarts(sa propertySet) {
	cpTrie := trie()
	start := rune(0)
	for {
		end, _ := cpTrie.GetRange(start, utrie.UcpMapRangeNormal, 0, nil)
		if end < 0 {
			return
		}
		sa.AddRune(start)
		// The next range begins right after the end of this one.
		start = end + 1
	}
}

// Bit positions within the value stored in the emoji trie for a code point.
// HasBinaryProperty tests a property by shifting the trie value right by one
// of these positions and checking the lowest bit.
const (
bitEmoji = 0
bitEmojiPresentation = 1
bitEmojiModifier = 2
bitEmojiModifierBase = 3
bitEmojiComponent = 4
bitExtendedPictographic = 5
bitBasicEmoji = 6
)

// bitFlags maps the `which` argument of HasBinaryProperty to the trie bit that
// answers it. An entry of -1 makes HasBinaryProperty return false without
// consulting the trie.
// NOTE(review): the layout appears to follow the binary properties from Emoji
// (index 0) through RGI_Emoji (index 14), i.e. the property value minus that
// of the Emoji property — confirm against the caller.
// Note: REGIONAL_INDICATOR is a single, hardcoded range implemented elsewhere.
var bitFlags = []int8{
bitEmoji,
bitEmojiPresentation,
bitEmojiModifier,
bitEmojiModifierBase,
bitEmojiComponent,
-1,
-1,
bitExtendedPictographic,
bitBasicEmoji,
-1,
-1,
-1,
-1,
-1,
bitBasicEmoji,
}

// HasBinaryProperty reports whether code point c has the emoji-related binary
// property selected by which.
//
// which is used as an index into bitFlags. It is range-checked here so that
// an index outside the table yields false instead of an index-out-of-range
// panic; an in-range index whose table entry is -1 also yields false.
// NOTE(review): callers presumably pass the property value rebased so that
// the Emoji property is index 0 — confirm at the call site.
func HasBinaryProperty(c rune, which int) bool {
	if which < 0 || which >= len(bitFlags) {
		return false // outside the table: not a property handled here
	}
	bit := bitFlags[which]
	if bit < 0 {
		return false // not a property that we support in this function
	}
	// Each supported property is a single bit of the per-code-point trie value.
	bits := trie().Get(c)
	return ((bits >> bit) & 1) != 0
}
53 changes: 52 additions & 1 deletion go/mysql/icuregex/internal/uprops/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,56 @@ const (
*/
UCharExtendedPictographic Property = 64

/**
* Binary property of strings Basic_Emoji.
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
*
* @stable ICU 70
*/
UCharBasicEmoji Property = 65
/**
* Binary property of strings Emoji_Keycap_Sequence.
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
*
* @stable ICU 70
*/
UCharEmojiKeycapSequence Property = 66
/**
* Binary property of strings RGI_Emoji_Modifier_Sequence.
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
*
* @stable ICU 70
*/
UCharRgiEmojiModifierSequence Property = 67
/**
* Binary property of strings RGI_Emoji_Flag_Sequence.
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
*
* @stable ICU 70
*/
UCharRgiEmojiFlagSequence Property = 68
/**
* Binary property of strings RGI_Emoji_Tag_Sequence.
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
*
* @stable ICU 70
*/
UCharRgiEmojiTagSequence Property = 69
/**
* Binary property of strings RGI_Emoji_ZWJ_Sequence.
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
*
* @stable ICU 70
*/
UCharRgiEmojiZwjSequence Property = 70
/**
* Binary property of strings RGI_Emoji.
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
*
* @stable ICU 70
*/
UCharRgiEmoji Property = 71

/** Enumerated property Bidi_Class.
Same as u_charDirection, returns UCharDirection values. @stable ICU 2.2 */
UCharBidiClass Property = 0x1000
Expand Down Expand Up @@ -492,7 +542,7 @@ const (
)

const (
uCharBinaryLimit = 65
uCharBinaryLimit = 72
uCharIntLimit = 0x1019
uCharMaskLimit = 0x2001
uCharStringLimit = 0x400E
Expand Down Expand Up @@ -595,6 +645,7 @@ const (
srcInpc
srcInsc
srcVo
srcEmoji
)

const (
Expand Down
3 changes: 3 additions & 0 deletions go/mysql/icuregex/internal/uprops/properties.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"vitess.io/vitess/go/mysql/icuregex/internal/ubidi"
"vitess.io/vitess/go/mysql/icuregex/internal/ucase"
"vitess.io/vitess/go/mysql/icuregex/internal/uchar"
"vitess.io/vitess/go/mysql/icuregex/internal/uemoji"
"vitess.io/vitess/go/mysql/icuregex/internal/ulayout"
"vitess.io/vitess/go/mysql/icuregex/internal/unames"
"vitess.io/vitess/go/mysql/icuregex/internal/uset"
Expand Down Expand Up @@ -74,6 +75,8 @@ func getInclusionsForSource(src propertySource) (*uset.UnicodeSet, error) {
ubidi.AddPropertyStarts(u)
case srcInpc, srcInsc, srcVo:
AddULayoutPropertyStarts(src, u)
case srcEmoji:
uemoji.AddPropertyStarts(u)
default:
return nil, errors.ErrUnsupported
}
Expand Down
Loading

0 comments on commit 245670f

Please sign in to comment.