Skip to content

Commit

Permalink
mysql: introduce icuregex package (vitessio#13391)
Browse files Browse the repository at this point in the history
* mysql: introduce icuregex package

Co-authored-by: Dirkjan Bussink <d.bussink@gmail.com>
Signed-off-by: Vicent Marti <vmg@strn.cat>

* icuregex: implement freeze set optimization

Signed-off-by: Vicent Marti <vmg@strn.cat>

* evalengine: wire up regex

Signed-off-by: Vicent Marti <vmg@strn.cat>

* Fix remaining TODOs and fix a bunch of bugs

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

* Update sizegen

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

* icuregex: Fix invalid slice creation

Parse the structure so we can create buffers with the proper size and
never with infinite sizes. While this was not the immediate cause of the
race error, creating slices with the right size is better in itself and
also helps when debugging.

The real fix here is that the size of `algorithmicRange` includes the
size of the struct itself, so if we want to get the remaining slice size
it needs to subtract this value.

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

* icuregex: Create valid slice length for algorithmicRange

We also want to create a valid slice length for the additional data,
this was too long if an offset was given and would read into the next
entry.

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

* icuregex: Clean up more unsafe usage

This reduces unsafe usage to just udata, with no use anywhere outside
of it, which makes the code more idiomatic Go.

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

* icuregex: Use more Go-like naming and reduce the exposed API

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

* regex: Implement additional regular expression functions

This implements the additional MySQL regular expression functions in the
evalengine. The evaluator is only implementing this for now, but the
compiler is up next.

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

* regexp: Update generated data

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

* Revert accidentally committed test

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

* evalengine: Add compilation for regular expressions

Also fixes a whole slew of bugs identified.

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

* icuregex: Allow for setting explicit dumper

Remove the usage of a global variable here.

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

* evalengine: Add a whole bunch of regex tests

This adds a bunch of tests and fixes the bugs exposed through them.

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

* Fix license

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

* More license fixes

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

* evalengine: Improve handling of constant expression regexps

Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>

---------

Signed-off-by: Vicent Marti <vmg@strn.cat>
Signed-off-by: Dirkjan Bussink <d.bussink@gmail.com>
Co-authored-by: Dirkjan Bussink <d.bussink@gmail.com>
  • Loading branch information
vmg and dbussink committed Jul 6, 2023
1 parent a7903d1 commit 83761ad
Show file tree
Hide file tree
Showing 76 changed files with 22,460 additions and 20 deletions.
74 changes: 74 additions & 0 deletions go/mysql/collations/charset/convert.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ package charset
import (
"fmt"
"unicode/utf8"

"vitess.io/vitess/go/hack"
)

func failedConversionError(from, to Charset, input []byte) error {
Expand Down Expand Up @@ -126,6 +128,78 @@ func Convert(dst []byte, dstCharset Charset, src []byte, srcCharset Charset) ([]
}
}

// Expand decodes src, which is encoded in srcCharset, into individual code
// points and appends them to dst, returning the resulting slice.
//
// When dst is nil a new slice is allocated for the result. When dst is
// non-nil its backing storage is reused where capacity allows, so callers can
// pass a scratch buffer (typically buf[:0]) to avoid an allocation.
//
// Decoding depends on the charset:
//   - utf8mb3/utf8mb4: src is ranged over as a Go string, so every invalid
//     UTF-8 byte yields utf8.RuneError.
//   - binary: every byte becomes one rune with the same numeric value.
//   - anything else: the charset's DecodeRune is called repeatedly and src is
//     advanced by the width it reports.
func Expand(dst []rune, src []byte, srcCharset Charset) []rune {
	switch srcCharset := srcCharset.(type) {
	case Charset_utf8mb3, Charset_utf8mb4:
		if dst == nil {
			return []rune(string(src))
		}
		// Append into the caller's buffer, exactly like the other branches do.
		// Previously a fresh slice was allocated here, which silently threw
		// away the buffer the caller passed in.
		for _, cp := range string(src) {
			dst = append(dst, cp)
		}
		return dst
	case Charset_binary:
		if dst == nil {
			dst = make([]rune, 0, len(src))
		}
		for _, c := range src {
			dst = append(dst, rune(c))
		}
		return dst
	default:
		if dst == nil {
			// One rune per input byte is an upper bound on the output length.
			dst = make([]rune, 0, len(src))
		}
		for len(src) > 0 {
			cp, width := srcCharset.DecodeRune(src)
			// NOTE(review): this relies on DecodeRune always reporting a
			// width of at least 1; a zero width would never terminate.
			src = src[width:]
			dst = append(dst, cp)
		}
		return dst
	}
}

// Collapse encodes the code points in src using dstCharset and returns the
// encoded bytes.
//
// When dst is nil a new buffer is allocated. When dst is non-nil its storage
// is reused:
//   - utf8mb3/utf8mb4 and binary append to dst.
//   - every other charset writes starting at index 0 of dst's backing array
//     (up to its capacity), growing into a new array when needed.
//
// NOTE(review): the append-vs-overwrite difference between the branches is
// only invisible when callers pass an empty dst (e.g. buf[:0]) — confirm that
// all callers do.
//
// For the binary charset each rune is truncated to its low 8 bits. For other
// non-UTF-8 charsets, a rune that EncodeRune rejects (negative width) is
// replaced by '?'; if '?' cannot be encoded either, encoding stops and the
// bytes written so far are returned.
func Collapse(dst []byte, src []rune, dstCharset Charset) []byte {
	switch dstCharset := dstCharset.(type) {
	case Charset_utf8mb3, Charset_utf8mb4:
		if dst == nil {
			return hack.StringBytes(string(src))
		}
		// A string can be appended to a byte slice directly.
		return append(dst, string(src)...)
	case Charset_binary:
		if dst == nil {
			dst = make([]byte, 0, len(src))
		}
		for _, b := range src {
			dst = append(dst, byte(b))
		}
		return dst
	default:
		nDst := 0
		if dst == nil {
			dst = make([]byte, len(src)*dstCharset.MaxWidth())
		} else {
			// Expose the whole backing array as scratch space.
			dst = dst[:cap(dst)]
		}
		for _, c := range src {
			// Keep at least 4 bytes of room for the next encoded rune.
			if len(dst)-nDst < 4 {
				// Doubling alone is not enough: a buffer of length 0 or 1
				// would stay too small (0*2 == 0) and every rune would then
				// fail to encode. Always grow to at least nDst+4.
				grown := len(dst) * 2
				if grown < nDst+4 {
					grown = nDst + 4
				}
				newDst := make([]byte, grown)
				copy(newDst, dst[:nDst])
				dst = newDst
			}
			w := dstCharset.EncodeRune(dst[nDst:], c)
			if w < 0 {
				// Unencodable rune: substitute '?'. If even that fails, give
				// up and return what has been written so far.
				if w = dstCharset.EncodeRune(dst[nDst:], '?'); w < 0 {
					break
				}
			}
			nDst += w
		}
		return dst[:nDst]
	}
}

// ConvertFromUTF8 converts src into dstCharset by delegating to Convert with
// utf8mb4 as the source charset. dst and the returned values are passed to and
// from Convert unchanged.
func ConvertFromUTF8(dst []byte, dstCharset Charset, src []byte) ([]byte, error) {
	var utf8Source Charset_utf8mb4
	return Convert(dst, dstCharset, src, utf8Source)
}
Expand Down
9 changes: 5 additions & 4 deletions go/mysql/collations/env.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,10 +194,11 @@ func makeEnv(version collver) *Environment {
// A few interesting character set values.
// See http://dev.mysql.com/doc/internals/en/character-set.html#packet-Protocol::CharacterSet
//
// NOTE(review): going by the names, these are presumably the numeric MySQL
// collation IDs for utf8, utf8mb4, binary, utf8mb4_bin and latin1_swedish_ci
// respectively — TODO confirm each value against the MySQL collation table.
const (
CollationUtf8ID = 33
CollationUtf8mb4ID = 255
CollationBinaryID = 63
CollationUtf8mb4BinID = 46
CollationUtf8ID = 33
CollationUtf8mb4ID = 255
CollationBinaryID = 63
CollationUtf8mb4BinID = 46
CollationLatin1Swedish = 8
)

// Binary is the default Binary collation
Expand Down
25 changes: 25 additions & 0 deletions go/mysql/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -565,6 +565,31 @@ const (
ERJSONValueTooBig = ErrorCode(3150)
ERJSONDocumentTooDeep = ErrorCode(3157)

ERRegexpStringNotTerminated = ErrorCode(3684)
ERRegexpBufferOverflow = ErrorCode(3684)
ERRegexpIllegalArgument = ErrorCode(3685)
ERRegexpIndexOutOfBounds = ErrorCode(3686)
ERRegexpInternal = ErrorCode(3687)
ERRegexpRuleSyntax = ErrorCode(3688)
ERRegexpBadEscapeSequence = ErrorCode(3689)
ERRegexpUnimplemented = ErrorCode(3690)
ERRegexpMismatchParen = ErrorCode(3691)
ERRegexpBadInterval = ErrorCode(3692)
ERRRegexpMaxLtMin = ErrorCode(3693)
ERRegexpInvalidBackRef = ErrorCode(3694)
ERRegexpLookBehindLimit = ErrorCode(3695)
ERRegexpMissingCloseBracket = ErrorCode(3696)
ERRegexpInvalidRange = ErrorCode(3697)
ERRegexpStackOverflow = ErrorCode(3698)
ERRegexpTimeOut = ErrorCode(3699)
ERRegexpPatternTooBig = ErrorCode(3700)
ERRegexpInvalidCaptureGroup = ErrorCode(3887)
ERRegexpInvalidFlag = ErrorCode(3900)

ERCharacterSetMismatch = ErrorCode(3995)

ERWrongParametersToNativeFct = ErrorCode(1583)

// max execution time exceeded
ERQueryTimeout = ErrorCode(3024)

Expand Down
Loading

0 comments on commit 83761ad

Please sign in to comment.