diff --git a/go/mysql/collations/charset/convert.go b/go/mysql/collations/charset/convert.go
index 1c0ced27e4e..6054ae33559 100644
--- a/go/mysql/collations/charset/convert.go
+++ b/go/mysql/collations/charset/convert.go
@@ -19,6 +19,8 @@ package charset
 import (
 	"fmt"
 	"unicode/utf8"
+
+	"vitess.io/vitess/go/hack"
 )
 
 func failedConversionError(from, to Charset, input []byte) error {
@@ -126,6 +128,78 @@ func Convert(dst []byte, dstCharset Charset, src []byte, srcCharset Charset) ([]
 	}
 }
 
+func Expand(dst []rune, src []byte, srcCharset Charset) []rune {
+	switch srcCharset := srcCharset.(type) {
+	case Charset_utf8mb3, Charset_utf8mb4:
+		if dst == nil {
+			return []rune(string(src))
+		}
+		// Append to the caller-provided buffer instead of discarding it,
+		// matching the behavior of the other branches.
+		for _, cp := range string(src) {
+			dst = append(dst, cp)
+		}
+		return dst
+	case Charset_binary:
+		if dst == nil {
+			dst = make([]rune, 0, len(src))
+		}
+		for _, c := range src {
+			dst = append(dst, rune(c))
+		}
+		return dst
+	default:
+		if dst == nil {
+			dst = make([]rune, 0, len(src))
+		}
+		for len(src) > 0 {
+			cp, width := srcCharset.DecodeRune(src)
+			src = src[width:]
+			dst = append(dst, cp)
+		}
+		return dst
+	}
+}
+
+func Collapse(dst []byte, src []rune, dstCharset Charset) []byte {
+	switch dstCharset := dstCharset.(type) {
+	case Charset_utf8mb3, Charset_utf8mb4:
+		if dst == nil {
+			return hack.StringBytes(string(src))
+		}
+		return append(dst, hack.StringBytes(string(src))...)
+	case Charset_binary:
+		if dst == nil {
+			dst = make([]byte, 0, len(src))
+		}
+		for _, b := range src {
+			dst = append(dst, byte(b))
+		}
+		return dst
+	default:
+		nDst := 0
+		if dst == nil {
+			dst = make([]byte, len(src)*dstCharset.MaxWidth())
+		} else {
+			dst = dst[:cap(dst)]
+		}
+		for _, c := range src {
+			if len(dst)-nDst < 4 {
+				newDst := make([]byte, len(dst)*2)
+				copy(newDst, dst[:nDst])
+				dst = newDst
+			}
+			w := dstCharset.EncodeRune(dst[nDst:], c)
+			if w < 0 {
+				if w = dstCharset.EncodeRune(dst[nDst:], '?'); w < 0 {
+					break
+				}
+			}
+			nDst += w
+		}
+		return dst[:nDst]
+	}
+}
+
 func ConvertFromUTF8(dst []byte, dstCharset Charset, src []byte) ([]byte, error) {
 	return Convert(dst, dstCharset, src, Charset_utf8mb4{})
 }
diff --git a/go/mysql/collations/env.go b/go/mysql/collations/env.go
index 52a255b6f41..0c063e140d5 100644
--- a/go/mysql/collations/env.go
+++ b/go/mysql/collations/env.go
@@ -194,10 +194,11 @@ func makeEnv(version collver) *Environment {
 // A few interesting character set values.
// See http://dev.mysql.com/doc/internals/en/character-set.html#packet-Protocol::CharacterSet const ( - CollationUtf8ID = 33 - CollationUtf8mb4ID = 255 - CollationBinaryID = 63 - CollationUtf8mb4BinID = 46 + CollationUtf8ID = 33 + CollationUtf8mb4ID = 255 + CollationBinaryID = 63 + CollationUtf8mb4BinID = 46 + CollationLatin1Swedish = 8 ) // Binary is the default Binary collation diff --git a/go/mysql/constants.go b/go/mysql/constants.go index b2c9b4d49a5..bedc9871426 100644 --- a/go/mysql/constants.go +++ b/go/mysql/constants.go @@ -565,6 +565,31 @@ const ( ERJSONValueTooBig = ErrorCode(3150) ERJSONDocumentTooDeep = ErrorCode(3157) + ERRegexpStringNotTerminated = ErrorCode(3684) + ERRegexpBufferOverflow = ErrorCode(3684) + ERRegexpIllegalArgument = ErrorCode(3685) + ERRegexpIndexOutOfBounds = ErrorCode(3686) + ERRegexpInternal = ErrorCode(3687) + ERRegexpRuleSyntax = ErrorCode(3688) + ERRegexpBadEscapeSequence = ErrorCode(3689) + ERRegexpUnimplemented = ErrorCode(3690) + ERRegexpMismatchParen = ErrorCode(3691) + ERRegexpBadInterval = ErrorCode(3692) + ERRRegexpMaxLtMin = ErrorCode(3693) + ERRegexpInvalidBackRef = ErrorCode(3694) + ERRegexpLookBehindLimit = ErrorCode(3695) + ERRegexpMissingCloseBracket = ErrorCode(3696) + ERRegexpInvalidRange = ErrorCode(3697) + ERRegexpStackOverflow = ErrorCode(3698) + ERRegexpTimeOut = ErrorCode(3699) + ERRegexpPatternTooBig = ErrorCode(3700) + ERRegexpInvalidCaptureGroup = ErrorCode(3887) + ERRegexpInvalidFlag = ErrorCode(3900) + + ERCharacterSetMismatch = ErrorCode(3995) + + ERWrongParametersToNativeFct = ErrorCode(1583) + // max execution time exceeded ERQueryTimeout = ErrorCode(3024) diff --git a/go/mysql/icuregex/compiler.go b/go/mysql/icuregex/compiler.go new file mode 100644 index 00000000000..eba297d0f21 --- /dev/null +++ b/go/mysql/icuregex/compiler.go @@ -0,0 +1,3647 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package icuregex + +import ( + "math" + "strings" + "unicode/utf8" + + "golang.org/x/exp/slices" + + "vitess.io/vitess/go/mysql/icuregex/internal/pattern" + "vitess.io/vitess/go/mysql/icuregex/internal/ucase" + "vitess.io/vitess/go/mysql/icuregex/internal/uchar" + "vitess.io/vitess/go/mysql/icuregex/internal/unames" + "vitess.io/vitess/go/mysql/icuregex/internal/uprops" + "vitess.io/vitess/go/mysql/icuregex/internal/uset" + "vitess.io/vitess/go/mysql/icuregex/internal/utf16" +) + +const BreakIteration = false +const stackSize = 100 + +type reChar struct { + char rune + quoted bool +} + +const ( + parenPlain = -1 + parenCapturing = -2 + parenAtomic = -3 + parenLookahead = -4 + parenNegLookahead = -5 + parenFlags = -6 + parenLookBehind = -7 + parenLookBehindN = -8 +) + +type setOperation uint32 + +const ( + setStart setOperation = 0<<16 | 1 + setEnd setOperation = 1<<16 | 2 + setNegation setOperation = 2<<16 | 3 + setCaseClose setOperation = 2<<16 | 9 + setDifference2 setOperation = 3<<16 | 4 // '--' set difference operator + setIntersection2 setOperation = 3<<16 | 5 // '&&' set intersection operator + setUnion setOperation = 4<<16 | 6 // implicit union of adjacent items + setDifference1 setOperation = 4<<16 | 7 // '-', single dash difference op, for compatibility with old UnicodeSet. + setIntersection1 setOperation = 4<<16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. +) + +type compiler struct { + err error + out *Pattern + p []rune + + scanIndex int + quoteMode bool + inBackslashQuote bool + eolComments bool + + lineNum int + charNum int + lastChar rune + peekChar rune + + c reChar + stack [stackSize]uint16 + stackPtr int + + modeFlags RegexpFlag + newModeFlags RegexpFlag + setModeFlag bool + + literalChars []rune + + parenStack []int + matchOpenParen int + matchCloseParen int + + intervalLow int + intervalUpper int + + setStack []*uset.UnicodeSet + setOpStack []setOperation + + lastSetLiteral rune + captureName *strings.Builder +} + +func newCompiler(pat *Pattern) *compiler { + return &compiler{ + out: pat, + scanIndex: 0, + eolComments: true, + lineNum: 1, + charNum: 0, + lastChar: -1, + peekChar: -1, + modeFlags: RegexpFlag(uint32(pat.flags) | 0x80000000), + matchOpenParen: -1, + matchCloseParen: -1, + lastSetLiteral: -1, + } +} + +func (c *compiler) nextCharLL() (ch rune) { + if c.peekChar != -1 { + ch, c.peekChar = c.peekChar, -1 + return + } + if len(c.p) == 0 { + return -1 + } + + ch = c.p[0] + c.p = c.p[1:] + if ch == utf8.RuneError { + return -1 + } + + if ch == chCR || ch == chNEL || ch == chLS || (ch == chLF && c.lastChar != chCR) { + c.lineNum++ + c.charNum = 0 + } else { + if ch != chLF { + c.charNum++ + } + } + c.lastChar = ch + return +} + +func (c *compiler) peekCharLL() rune { + if c.peekChar == -1 { + c.peekChar = c.nextCharLL() + } + return c.peekChar +} + +func (c *compiler) nextChar(ch *reChar) { + c.scanIndex++ + ch.char = c.nextCharLL() + ch.quoted = false + + if c.quoteMode { + ch.quoted = true + if (ch.char == chBackSlash && c.peekCharLL() == chE && ((c.modeFlags & Literal) == 0)) || + ch.char == -1 { + c.quoteMode = false // Exit quote mode, + c.nextCharLL() // discard the E + c.nextChar(ch) + return + } + } else if c.inBackslashQuote { + // The current character immediately follows a '\' + // Don't check for any further escapes, just return it as-is. + // Don't set c.fQuoted, because that would prevent the state machine from + // dispatching on the character. 
+ c.inBackslashQuote = false + } else { + // We are not in a \Q quoted region \E of the source. + // + if (c.modeFlags & Comments) != 0 { + // + // We are in free-spacing and comments mode. + // Scan through any white space and comments, until we + // reach a significant character or the end of inut. + for { + if ch.char == -1 { + break // End of Input + } + if ch.char == chPound && c.eolComments { + // Start of a comment. Consume the rest of it, until EOF or a new line + for { + ch.char = c.nextCharLL() + if ch.char == -1 || // EOF + ch.char == chCR || + ch.char == chLF || + ch.char == chNEL || + ch.char == chLS { + break + } + } + } + // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061. + if !pattern.IsWhitespace(ch.char) { + break + } + ch.char = c.nextCharLL() + } + } + + // + // check for backslash escaped characters. + // + if ch.char == chBackSlash { + beforeEscape := c.p + if staticSetUnescape.ContainsRune(c.peekCharLL()) { + // + // A '\' sequence that is handled by ICU's standard unescapeAt function. + // Includes \uxxxx, \n, \r, many others. + // Return the single equivalent character. + // + c.nextCharLL() // get & discard the peeked char. + ch.quoted = true + + ch.char, c.p = pattern.UnescapeAtRunes(beforeEscape) + if ch.char < 0 { + c.error(BadEscapeSequence) + } + c.charNum += len(beforeEscape) - len(c.p) + } else if c.peekCharLL() == chDigit0 { + // Octal Escape, using Java Regexp Conventions + // which are \0 followed by 1-3 octal digits. + // Different from ICU Unescape handling of Octal, which does not + // require the leading 0. + // Java also has the convention of only consuming 2 octal digits if + // the three digit number would be > 0xff + // + ch.char = 0 + c.nextCharLL() // Consume the initial 0. + for index := 0; index < 3; index++ { + ch2 := c.peekCharLL() + if ch2 < chDigit0 || ch2 > chDigit7 { + if index == 0 { + // \0 is not followed by any octal digits. + c.error(BadEscapeSequence) + } + break + } + ch.char <<= 3 + ch.char += ch2 & 7 + if ch.char <= 255 { + c.nextCharLL() + } else { + // The last digit made the number too big. Forget we saw it. + ch.char >>= 3 + } + } + ch.quoted = true + } else if c.peekCharLL() == chQ { + // "\Q" enter quote mode, which will continue until "\E" + c.quoteMode = true + c.nextCharLL() // discard the 'Q'. + c.nextChar(ch) // recurse to get the real next char. + return + } else { + // We are in a '\' escape that will be handled by the state table scanner. + // Just return the backslash, but remember that the following char is to + // be taken literally. + c.inBackslashQuote = true + } + } + } + + // re-enable # to end-of-line comments, in case they were disabled. + // They are disabled by the parser upon seeing '(?', but this lasts for + // the fetching of the next character only. + c.eolComments = true +} + +const ( + chCR = 0x0d // New lines, for terminating comments. + chLF = 0x0a // Line Feed + chPound = 0x23 // '#', introduces a comment. 
+	chDigit0    = 0x30 // '0'
+	chDigit7    = 0x37 // '7'
+	chColon     = 0x3A // ':'
+	chE         = 0x45 // 'E'
+	chQ         = 0x51 // 'Q'
+	chN         = 0x4E // 'N'
+	chP         = 0x50 // 'P'
+	chBackSlash = 0x5c // '\'  introduces a char escape
+	chLBracket  = 0x5b // '['
+	chRBracket  = 0x5d // ']'
+	chUp        = 0x5e // '^'
+	chLowerP    = 0x70
+	chLBrace    = 0x7b // '{'
+	chRBrace    = 0x7d // '}'
+	chNEL       = 0x85 // NEL newline variant
+	chLS        = 0x2028 // Unicode Line Separator
+	chAmp       = 0x26 // '&'
+	chDash      = 0x2d // '-'
+)
+
+func (c *compiler) compile(pat []rune) error {
+	if c.err != nil {
+		return c.err
+	}
+	if c.out.pattern != "" {
+		panic("cannot reuse pattern")
+	}
+
+	c.out.pattern = string(pat)
+	c.p = pat
+
+	var state uint16 = 1
+	var table []regexTableEl
+
+	// UREGEX_LITERAL forces the entire pattern to be treated as a literal string.
+	if c.modeFlags&Literal != 0 {
+		c.quoteMode = true
+	}
+
+	c.nextChar(&c.c)
+
+	// Main loop for the regex pattern parsing state machine.
+	// Runs once per state transition.
+	// Each time through optionally performs, depending on the state table,
+	//   - an advance to the next pattern char
+	//   - an action to be performed.
+	//   - pushing or popping a state to/from the local state return stack.
+	// file regexcst.txt is the source for the state table.  The logic behind
+	// recognizing the pattern syntax is there, not here.
+	for {
+		if c.err != nil {
+			break
+		}
+
+		if state == 0 {
+			panic("bad state?")
+		}
+
+		table = parseStateTable[state:]
+		for len(table) > 0 {
+			if table[0].charClass < 127 && !c.c.quoted && rune(table[0].charClass) == c.c.char {
+				break
+			}
+			if table[0].charClass == 255 {
+				break
+			}
+			if table[0].charClass == 254 && c.c.quoted {
+				break
+			}
+			if table[0].charClass == 253 && c.c.char == -1 {
+				break
+			}
+			if table[0].charClass >= 128 && table[0].charClass < 240 && !c.c.quoted && c.c.char != -1 {
+				if staticRuleSet[table[0].charClass-128].ContainsRune(c.c.char) {
+					break
+				}
+			}
+
+			table = table[1:]
+		}
+
+		if !c.doParseActions(table[0].action) {
+			break
+		}
+
+		if table[0].pushState != 0 {
+			c.stackPtr++
+			if c.stackPtr >= stackSize {
+				c.error(InternalError)
+				c.stackPtr--
+			}
+			c.stack[c.stackPtr] = uint16(table[0].pushState)
+		}
+
+		if table[0].nextChar {
+			c.nextChar(&c.c)
+		}
+
+		if table[0].nextState != 255 {
+			state = uint16(table[0].nextState)
+		} else {
+			state = c.stack[c.stackPtr]
+			c.stackPtr--
+			if c.stackPtr < 0 {
+				c.stackPtr++
+				c.error(MismatchedParen)
+			}
+		}
+	}
+
+	if c.err != nil {
+		return c.err
+	}
+
+	c.allocateStackData(restackframeHdrCount)
+	c.stripNOPs()
+
+	c.out.minMatchLen = c.minMatchLength(3, len(c.out.compiledPat)-1)
+
+	c.matchStartType()
+	return c.err
+}
+
+func (c *compiler) doParseActions(action patternParseAction) bool {
+	switch action {
+	case doPatStart:
+		// Start of pattern compiles to:
+		//0   SAVE   2        Fall back to position of FAIL
+		//1   jmp    3
+		//2   FAIL            Stop if we ever reach here.
+		//3   NOP             Dummy, so start of pattern looks the same as
+		//                      the start of an ( grouping.
+		//4   NOP             Reserved, will be replaced by a save if there are
+		//                      OR | operators at the top level
+		c.appendOp(urxStateSave, 2)
+		c.appendOp(urxJmp, 3)
+		c.appendOp(urxFail, 0)
+
+		// Standard open nonCapture paren action emits the two NOPs and
+		// sets up the paren stack frame.
+		c.doParseActions(doOpenNonCaptureParen)
+
+	case doPatFinish:
+		// We've scanned to the end of the pattern
+		// The end of pattern compiles to:
+		//        URX_END
+		// which will stop the runtime match engine.
+ // Encountering end of pattern also behaves like a close paren, + // and forces fixups of the State Save at the beginning of the compiled pattern + // and of any OR operations at the top level. + // + c.handleCloseParen() + if len(c.parenStack) > 0 { + // Missing close paren in pattern. + c.error(MismatchedParen) + } + + // add the END operation to the compiled pattern. + c.appendOp(urxEnd, 0) + + // Terminate the pattern compilation state machine. + return false + + case doOrOperator: + // Scanning a '|', as in (A|B) + // Generate code for any pending literals preceding the '|' + c.fixLiterals(false) + + // Insert a SAVE operation at the start of the pattern section preceding + // this OR at this level. This SAVE will branch the match forward + // to the right hand side of the OR in the event that the left hand + // side fails to match and backtracks. Locate the position for the + // save from the location on the top of the parentheses stack. + var savePosition int + savePosition, c.parenStack = stackPop(c.parenStack) + op := c.out.compiledPat[savePosition] + + if op.typ() != urxNop { + panic("expected a NOP placeholder") + } + + op = c.buildOp(urxStateSave, len(c.out.compiledPat)+1) + c.out.compiledPat[savePosition] = op + + // Append an JMP operation into the compiled pattern. The operand for + // the JMP will eventually be the location following the ')' for the + // group. This will be patched in later, when the ')' is encountered. + c.appendOp(urxJmp, 0) + + // Push the position of the newly added JMP op onto the parentheses stack. + // This registers if for fixup when this block's close paren is encountered. + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + // Append a NOP to the compiled pattern. This is the slot reserved + // for a SAVE in the event that there is yet another '|' following + // this one. + c.appendOp(urxNop, 0) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + case doBeginNamedCapture: + // Scanning (? + // Compile to a + // - NOP, which later may be replaced if the parenthesized group + // has a quantifier, followed by + // - STO_SP save state stack position, so it can be restored at the ")" + // - NOP, which may later be replaced by a save-state if there + // is an '|' alternation within the parens. + c.fixLiterals(false) + c.appendOp(urxNop, 0) + varLoc := c.allocateData(1) // Reserve a data location for saving the state stack ptr. + c.appendOp(urxStoSp, varLoc) + c.appendOp(urxNop, 0) + + // On the Parentheses stack, start a new frame and add the postions + // of the two NOPs. Depending on what follows in the pattern, the + // NOPs may be changed to SAVE_STATE or JMP ops, with a target + // address of the end of the parenthesized group. + c.parenStack = append(c.parenStack, int(c.modeFlags)) + c.parenStack = append(c.parenStack, parenAtomic) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-3) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + case doOpenLookAhead: + // Positive Look-ahead (?= stuff ) + // + // Note: Addition of transparent input regions, with the need to + // restore the original regions when failing out of a lookahead + // block, complicated this sequence. Some combined opcodes + // might make sense - or might not, lookahead aren't that common. + // + // Caution: min match length optimization knows about this + // sequence; don't change without making updates there too. + // + // Compiles to + // 1 LA_START dataLoc Saves SP, Input Pos, Active input region. + // 2. 
STATE_SAVE 4 on failure of lookahead, goto 4 + // 3 JMP 6 continue ... + // + // 4. LA_END Look Ahead failed. Restore regions. + // 5. BACKTRACK and back track again. + // + // 6. NOP reserved for use by quantifiers on the block. + // Look-ahead can't have quantifiers, but paren stack + // compile time conventions require the slot anyhow. + // 7. NOP may be replaced if there is are '|' ops in the block. + // 8. code for parenthesized stuff. + // 9. LA_END + // + // Four data slots are reserved, for saving state on entry to the look-around + // 0: stack pointer on entry. + // 1: input position on entry. + // 2: fActiveStart, the active bounds start on entry. + // 3: fActiveLimit, the active bounds limit on entry. + c.fixLiterals(false) + dataLoc := c.allocateData(4) + c.appendOp(urxLaStart, dataLoc) + c.appendOp(urxStateSave, len(c.out.compiledPat)+2) + c.appendOp(urxJmp, len(c.out.compiledPat)+3) + c.appendOp(urxLaEnd, dataLoc) + c.appendOp(urxBacktrack, 0) + c.appendOp(urxNop, 0) + c.appendOp(urxNop, 0) + + // On the Parentheses stack, start a new frame and add the postions + // of the NOPs. + c.parenStack = append(c.parenStack, int(c.modeFlags)) + c.parenStack = append(c.parenStack, parenLookahead) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-2) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + case doOpenLookAheadNeg: + // Negated Lookahead. (?! stuff ) + // Compiles to + // 1. LA_START dataloc + // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state, + // // which continues with the match. + // 3. NOP // Std. Open Paren sequence, for possible '|' + // 4. code for parenthesized stuff. + // 5. LA_END // Cut back stack, remove saved state from step 2. + // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails. + // 7. END_LA // Restore match region, in case look-ahead was using + // an alternate (transparent) region. + // Four data slots are reserved, for saving state on entry to the look-around + // 0: stack pointer on entry. + // 1: input position on entry. + // 2: fActiveStart, the active bounds start on entry. + // 3: fActiveLimit, the active bounds limit on entry. + c.fixLiterals(false) + dataLoc := c.allocateData(4) + c.appendOp(urxLaStart, dataLoc) + c.appendOp(urxStateSave, 0) // dest address will be patched later. + c.appendOp(urxNop, 0) + + // On the Parentheses stack, start a new frame and add the postions + // of the StateSave and NOP. + c.parenStack = append(c.parenStack, int(c.modeFlags)) + c.parenStack = append(c.parenStack, parenNegLookahead) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-2) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + // Instructions #5 - #7 will be added when the ')' is encountered. + + case doOpenLookBehind: + // Compile a (?<= look-behind open paren. + // + // Compiles to + // 0 URX_LB_START dataLoc + // 1 URX_LB_CONT dataLoc + // 2 MinMatchLen + // 3 MaxMatchLen + // 4 URX_NOP Standard '(' boilerplate. + // 5 URX_NOP Reserved slot for use with '|' ops within (block). + // 6 + // 7 URX_LB_END dataLoc # Check match len, restore input len + // 8 URX_LA_END dataLoc # Restore stack, input pos + // + // Allocate a block of matcher data, to contain (when running a match) + // 0: Stack ptr on entry + // 1: Input Index on entry + // 2: fActiveStart, the active bounds start on entry. + // 3: fActiveLimit, the active bounds limit on entry. + // 4: Start index of match current match attempt. 
+ // The first four items must match the layout of data for LA_START / LA_END + + // Generate match code for any pending literals. + c.fixLiterals(false) + + // Allocate data space + dataLoc := c.allocateData(5) + + // Emit URX_LB_START + c.appendOp(urxLbStart, dataLoc) + + // Emit URX_LB_CONT + c.appendOp(urxLbCont, dataLoc) + c.appendOp(urxReservedOp, 0) // MinMatchLength. To be filled later. + c.appendOp(urxReservedOp, 0) // MaxMatchLength. To be filled later. + + // Emit the NOPs + c.appendOp(urxNop, 0) + c.appendOp(urxNop, 0) + + // On the Parentheses stack, start a new frame and add the postions + // of the URX_LB_CONT and the NOP. + c.parenStack = append(c.parenStack, int(c.modeFlags)) + c.parenStack = append(c.parenStack, parenLookBehind) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-2) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + // The final two instructions will be added when the ')' is encountered. + + case doOpenLookBehindNeg: + // Compile a (? + // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL + // 9 ... + // + // Allocate a block of matcher data, to contain (when running a match) + // 0: Stack ptr on entry + // 1: Input Index on entry + // 2: fActiveStart, the active bounds start on entry. + // 3: fActiveLimit, the active bounds limit on entry. + // 4: Start index of match current match attempt. + // The first four items must match the layout of data for LA_START / LA_END + + // Generate match code for any pending literals. + c.fixLiterals(false) + + // Allocate data space + dataLoc := c.allocateData(5) + + // Emit URX_LB_START + c.appendOp(urxLbStart, dataLoc) + + // Emit URX_LBN_CONT + c.appendOp(urxLbnCount, dataLoc) + c.appendOp(urxReservedOp, 0) // MinMatchLength. To be filled later. + c.appendOp(urxReservedOp, 0) // MaxMatchLength. To be filled later. + c.appendOp(urxReservedOp, 0) // Continue Loc. To be filled later. + + // Emit the NOPs + c.appendOp(urxNop, 0) + c.appendOp(urxNop, 0) + + // On the Parentheses stack, start a new frame and add the postions + // of the URX_LB_CONT and the NOP. + c.parenStack = append(c.parenStack, int(c.modeFlags)) + c.parenStack = append(c.parenStack, parenLookBehindN) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-2) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + // The final two instructions will be added when the ')' is encountered. + + case doConditionalExpr, doPerlInline: + // Conditionals such as (?(1)a:b) + // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them. + c.error(Unimplemented) + + case doCloseParen: + c.handleCloseParen() + if len(c.parenStack) == 0 { + // Extra close paren, or missing open paren. + c.error(MismatchedParen) + } + + case doNOP: + + case doBadOpenParenType, doRuleError: + c.error(RuleSyntax) + + case doMismatchedParenErr: + c.error(MismatchedParen) + + case doPlus: + // Normal '+' compiles to + // 1. stuff to be repeated (already built) + // 2. jmp-sav 1 + // 3. ... + // + // Or, if the item to be repeated can match a zero length string, + // 1. STO_INP_LOC data-loc + // 2. body of stuff to be repeated + // 3. JMP_SAV_X 2 + // 4. ... + + // + // Or, if the item to be repeated is simple + // 1. Item to be repeated. + // 2. LOOP_SR_I set number (assuming repeated item is a set ref) + // 3. LOOP_C stack location + topLoc := c.blockTopLoc(false) // location of item #1 + + // Check for simple constructs, which may get special optimized code. 
+ if topLoc == len(c.out.compiledPat)-1 { + repeatedOp := c.out.compiledPat[topLoc] + + if repeatedOp.typ() == urxSetref { + // Emit optimized code for [char set]+ + c.appendOp(urxLoopSrI, repeatedOp.value()) + frameLoc := c.allocateStackData(1) + c.appendOp(urxLoopC, frameLoc) + break + } + + if repeatedOp.typ() == urxDotany || repeatedOp.typ() == urxDotanyAll || repeatedOp.typ() == urxDotanyUnix { + // Emit Optimized code for .+ operations. + loopOpI := c.buildOp(urxLoopDotI, 0) + if repeatedOp.typ() == urxDotanyAll { + // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode. + loopOpI |= 1 + } + if c.modeFlags&UnixLines != 0 { + loopOpI |= 2 + } + c.appendIns(loopOpI) + frameLoc := c.allocateStackData(1) + c.appendOp(urxLoopC, frameLoc) + break + } + } + + // General case. + + // Check for minimum match length of zero, which requires + // extra loop-breaking code. + if c.minMatchLength(topLoc, len(c.out.compiledPat)-1) == 0 { + // Zero length match is possible. + // Emit the code sequence that can handle it. + c.insertOp(topLoc) + frameLoc := c.allocateStackData(1) + op := c.buildOp(urxStoInpLoc, frameLoc) + c.out.compiledPat[topLoc] = op + + c.appendOp(urxJmpSavX, topLoc+1) + } else { + // Simpler code when the repeated body must match something non-empty + c.appendOp(urxJmpSav, topLoc) + } + + case doNGPlus: + // Non-greedy '+?' compiles to + // 1. stuff to be repeated (already built) + // 2. state-save 1 + // 3. ... + topLoc := c.blockTopLoc(false) + c.appendOp(urxStateSave, topLoc) + + case doOpt: + // Normal (greedy) ? quantifier. + // Compiles to + // 1. state save 3 + // 2. body of optional block + // 3. ... + // Insert the state save into the compiled pattern, and we're done. + saveStateLoc := c.blockTopLoc(true) + saveStateOp := c.buildOp(urxStateSave, len(c.out.compiledPat)) + c.out.compiledPat[saveStateLoc] = saveStateOp + + case doNGOpt: + // Non-greedy ?? quantifier + // compiles to + // 1. jmp 4 + // 2. body of optional block + // 3 jmp 5 + // 4. state save 2 + // 5 ... + // This code is less than ideal, with two jmps instead of one, because we can only + // insert one instruction at the top of the block being iterated. + jmp1Loc := c.blockTopLoc(true) + jmp2Loc := len(c.out.compiledPat) + + jmp1Op := c.buildOp(urxJmp, jmp2Loc+1) + c.out.compiledPat[jmp1Loc] = jmp1Op + + c.appendOp(urxJmp, jmp2Loc+2) + c.appendOp(urxStateSave, jmp1Loc+1) + + case doStar: + // Normal (greedy) * quantifier. + // Compiles to + // 1. STATE_SAVE 4 + // 2. body of stuff being iterated over + // 3. JMP_SAV 2 + // 4. ... + // + // Or, if the body is a simple [Set], + // 1. LOOP_SR_I set number + // 2. LOOP_C stack location + // ... + // + // Or if this is a .* + // 1. LOOP_DOT_I (. matches all mode flag) + // 2. LOOP_C stack location + // + // Or, if the body can match a zero-length string, to inhibit infinite loops, + // 1. STATE_SAVE 5 + // 2. STO_INP_LOC data-loc + // 3. body of stuff + // 4. JMP_SAV_X 2 + // 5. ... + // location of item #1, the STATE_SAVE + topLoc := c.blockTopLoc(false) + + // Check for simple *, where the construct being repeated + // compiled to single opcode, and might be optimizable. 
+ if topLoc == len(c.out.compiledPat)-1 { + repeatedOp := c.out.compiledPat[topLoc] + + if repeatedOp.typ() == urxSetref { + // Emit optimized code for a [char set]* + loopOpI := c.buildOp(urxLoopSrI, repeatedOp.value()) + c.out.compiledPat[topLoc] = loopOpI + dataLoc := c.allocateStackData(1) + c.appendOp(urxLoopC, dataLoc) + break + } + + if repeatedOp.typ() == urxDotany || repeatedOp.typ() == urxDotanyAll || repeatedOp.typ() == urxDotanyUnix { + // Emit Optimized code for .* operations. + loopOpI := c.buildOp(urxLoopDotI, 0) + if repeatedOp.typ() == urxDotanyAll { + // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. + loopOpI |= 1 + } + if (c.modeFlags & UnixLines) != 0 { + loopOpI |= 2 + } + c.out.compiledPat[topLoc] = loopOpI + dataLoc := c.allocateStackData(1) + c.appendOp(urxLoopC, dataLoc) + break + } + } + + // Emit general case code for this * + // The optimizations did not apply. + + saveStateLoc := c.blockTopLoc(true) + jmpOp := c.buildOp(urxJmpSav, saveStateLoc+1) + + // Check for minimum match length of zero, which requires + // extra loop-breaking code. + if c.minMatchLength(saveStateLoc, len(c.out.compiledPat)-1) == 0 { + c.insertOp(saveStateLoc) + dataLoc := c.allocateStackData(1) + + op := c.buildOp(urxStoInpLoc, dataLoc) + c.out.compiledPat[saveStateLoc+1] = op + jmpOp = c.buildOp(urxJmpSavX, saveStateLoc+2) + } + + // Locate the position in the compiled pattern where the match will continue + // after completing the *. (4 or 5 in the comment above) + continueLoc := len(c.out.compiledPat) + 1 + + // Put together the save state op and store it into the compiled code. + saveStateOp := c.buildOp(urxStateSave, continueLoc) + c.out.compiledPat[saveStateLoc] = saveStateOp + + // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern. + c.appendIns(jmpOp) + + case doNGStar: + // Non-greedy *? quantifier + // compiles to + // 1. JMP 3 + // 2. body of stuff being iterated over + // 3. STATE_SAVE 2 + // 4 ... + jmpLoc := c.blockTopLoc(true) // loc 1. + saveLoc := len(c.out.compiledPat) // loc 3. + jmpOp := c.buildOp(urxJmp, saveLoc) + c.out.compiledPat[jmpLoc] = jmpOp + c.appendOp(urxStateSave, jmpLoc+1) + + case doIntervalInit: + // The '{' opening an interval quantifier was just scanned. + // Init the counter varaiables that will accumulate the values as the digits + // are scanned. + c.intervalLow = 0 + c.intervalUpper = -1 + + case doIntevalLowerDigit: + // Scanned a digit from the lower value of an {lower,upper} interval + digitValue := uCharDigitValue(c.c.char) + val := int64(c.intervalLow)*10 + digitValue + if val > math.MaxInt32 { + c.error(NumberTooBig) + } else { + c.intervalLow = int(val) + } + + case doIntervalUpperDigit: + // Scanned a digit from the upper value of an {lower,upper} interval + if c.intervalUpper < 0 { + c.intervalUpper = 0 + } + digitValue := uCharDigitValue(c.c.char) + val := int64(c.intervalUpper)*10 + digitValue + if val > math.MaxInt32 { + c.error(NumberTooBig) + } else { + c.intervalUpper = int(val) + } + + case doIntervalSame: + // Scanned a single value interval like {27}. Upper = Lower. + c.intervalUpper = c.intervalLow + + case doInterval: + // Finished scanning a normal {lower,upper} interval. Generate the code for it. + if !c.compileInlineInterval() { + c.compileInterval(urxCtrInit, utxCtrLoop) + } + + case doPossessiveInterval: + // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it. + + // Remember the loc for the top of the block being looped over. 
+ // (Can not reserve a slot in the compiled pattern at this time, because + // compileInterval needs to reserve also, and blockTopLoc can only reserve + // once per block.) + topLoc := c.blockTopLoc(false) + + // Produce normal looping code. + c.compileInterval(urxCtrInit, utxCtrLoop) + + // Surround the just-emitted normal looping code with a STO_SP ... LD_SP + // just as if the loop was inclosed in atomic parentheses. + + // First the STO_SP before the start of the loop + c.insertOp(topLoc) + + varLoc := c.allocateData(1) // Reserve a data location for saving the + op := c.buildOp(urxStoSp, varLoc) + c.out.compiledPat[topLoc] = op + + var loopOp instruction + loopOp, c.out.compiledPat = stackPop(c.out.compiledPat) + if loopOp.typ() != utxCtrLoop || loopOp.value() != topLoc { + panic("bad instruction at the end of compiled pattern") + } + + loopOp++ // point LoopOp after the just-inserted STO_SP + c.appendIns(loopOp) + + // Then the LD_SP after the end of the loop + c.appendOp(urxLdSp, varLoc) + + case doNGInterval: + // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it. + c.compileInterval(urxCtrInitNg, urxCtrLoopNg) + + case doIntervalError: + c.error(BadInterval) + + case doLiteralChar: + // We've just scanned a "normal" character from the pattern, + c.literalChar(c.c.char) + + case doEscapedLiteralChar: + // We've just scanned an backslashed escaped character with no + // special meaning. It represents itself. + if (c.modeFlags&ErrorOnUnknownEscapes) != 0 && ((c.c.char >= 0x41 && c.c.char <= 0x5A) || /* in [A-Z] */ (c.c.char >= 0x61 && c.c.char <= 0x7a)) { // in [a-z] + c.error(BadEscapeSequence) + } + c.literalChar(c.c.char) + + case doDotAny: + // scanned a ".", match any single character. + c.fixLiterals(false) + if (c.modeFlags & DotAll) != 0 { + c.appendOp(urxDotanyAll, 0) + } else if (c.modeFlags & UnixLines) != 0 { + c.appendOp(urxDotanyUnix, 0) + } else { + c.appendOp(urxDotany, 0) + } + + case doCaret: + c.fixLiterals(false) + if (c.modeFlags&Multiline) == 0 && (c.modeFlags&UnixLines) == 0 { + c.appendOp(urxCaret, 0) + } else if (c.modeFlags&Multiline) != 0 && (c.modeFlags&UnixLines) == 0 { + c.appendOp(urxCaretM, 0) + } else if (c.modeFlags&Multiline) == 0 && (c.modeFlags&UnixLines) != 0 { + c.appendOp(urxCaret, 0) // Only testing true start of input. 
+ } else if (c.modeFlags&Multiline) != 0 && (c.modeFlags&UnixLines) != 0 { + c.appendOp(urxCaretMUnix, 0) + } + + case doDollar: + c.fixLiterals(false) + if (c.modeFlags&Multiline) == 0 && (c.modeFlags&UnixLines) == 0 { + c.appendOp(urxDollar, 0) + } else if (c.modeFlags&Multiline) != 0 && (c.modeFlags&UnixLines) == 0 { + c.appendOp(urxDollarM, 0) + } else if (c.modeFlags&Multiline) == 0 && (c.modeFlags&UnixLines) != 0 { + c.appendOp(urxDollarD, 0) + } else if (c.modeFlags&Multiline) != 0 && (c.modeFlags&UnixLines) != 0 { + c.appendOp(urxDollarMd, 0) + } + + case doBackslashA: + c.fixLiterals(false) + c.appendOp(urxCaret, 0) + + case doBackslashB: + if !BreakIteration { + if (c.modeFlags & UWord) != 0 { + c.error(Unimplemented) + } + } + c.fixLiterals(false) + if c.modeFlags&UWord != 0 { + c.appendOp(urxBackslashBu, 1) + } else { + c.appendOp(urxBackslashB, 1) + } + + case doBackslashb: + if !BreakIteration { + if (c.modeFlags & UWord) != 0 { + c.error(Unimplemented) + } + } + c.fixLiterals(false) + if c.modeFlags&UWord != 0 { + c.appendOp(urxBackslashBu, 0) + } else { + c.appendOp(urxBackslashB, 0) + } + + case doBackslashD: + c.fixLiterals(false) + c.appendOp(urxBackslashD, 1) + + case doBackslashd: + c.fixLiterals(false) + c.appendOp(urxBackslashD, 0) + + case doBackslashG: + c.fixLiterals(false) + c.appendOp(urxBackslashG, 0) + + case doBackslashH: + c.fixLiterals(false) + c.appendOp(urxBackslashH, 1) + + case doBackslashh: + c.fixLiterals(false) + c.appendOp(urxBackslashH, 0) + + case doBackslashR: + c.fixLiterals(false) + c.appendOp(urxBackslashR, 0) + + case doBackslashS: + c.fixLiterals(false) + c.appendOp(urxStatSetrefN, urxIsspaceSet) + + case doBackslashs: + c.fixLiterals(false) + c.appendOp(urxStaticSetref, urxIsspaceSet) + + case doBackslashV: + c.fixLiterals(false) + c.appendOp(urxBackslashV, 1) + + case doBackslashv: + c.fixLiterals(false) + c.appendOp(urxBackslashV, 0) + + case doBackslashW: + c.fixLiterals(false) + c.appendOp(urxStatSetrefN, urxIswordSet) + + case doBackslashw: + c.fixLiterals(false) + c.appendOp(urxStaticSetref, urxIswordSet) + + case doBackslashX: + if !BreakIteration { + // Grapheme Cluster Boundary requires ICU break iteration. + c.error(Unimplemented) + } + c.fixLiterals(false) + c.appendOp(urxBackslashX, 0) + + case doBackslashZ: + c.fixLiterals(false) + c.appendOp(urxDollar, 0) + + case doBackslashz: + c.fixLiterals(false) + c.appendOp(urxBackslashZ, 0) + + case doEscapeError: + c.error(BadEscapeSequence) + + case doExit: + c.fixLiterals(false) + return false + + case doProperty: + c.fixLiterals(false) + theSet := c.scanProp() + c.compileSet(theSet) + + case doNamedChar: + ch := c.scanNamedChar() + c.literalChar(ch) + + case doBackRef: + // BackReference. Somewhat unusual in that the front-end can not completely parse + // the regular expression, because the number of digits to be consumed + // depends on the number of capture groups that have been defined. So + // we have to do it here instead. + numCaptureGroups := len(c.out.groupMap) + groupNum := int64(0) + ch := c.c.char + + for { + // Loop once per digit, for max allowed number of digits in a back reference. + digit := uCharDigitValue(ch) + groupNum = groupNum*10 + digit + if groupNum >= int64(numCaptureGroups) { + break + } + ch = c.peekCharLL() + if !staticRuleSet[ruleSetDigitChar-128].ContainsRune(ch) { + break + } + c.nextCharLL() + } + + // Scan of the back reference in the source regexp is complete. Now generate + // the compiled code for it. 
+ // Because capture groups can be forward-referenced by back-references, + // we fill the operand with the capture group number. At the end + // of compilation, it will be changed to the variable's location. + if groupNum == 0 { + panic("\\0 begins an octal escape sequence, and shouldn't enter this code path at all") + } + c.fixLiterals(false) + if (c.modeFlags & CaseInsensitive) != 0 { + c.appendOp(urxBackrefI, int(groupNum)) + } else { + c.appendOp(urxBackref, int(groupNum)) + } + + case doBeginNamedBackRef: + if c.captureName != nil { + panic("should not replace capture name") + } + c.captureName = &strings.Builder{} + + case doContinueNamedBackRef: + c.captureName.WriteRune(c.c.char) + + case doCompleteNamedBackRef: + { + groupNumber := c.out.namedCaptureMap[c.captureName.String()] + if groupNumber == 0 { + // Group name has not been defined. + // Could be a forward reference. If we choose to support them at some + // future time, extra mechanism will be required at this point. + c.error(InvalidCaptureGroupName) + } else { + // Given the number, handle identically to a \n numbered back reference. + // See comments above, under doBackRef + c.fixLiterals(false) + if (c.modeFlags & CaseInsensitive) != 0 { + c.appendOp(urxBackrefI, groupNumber) + } else { + c.appendOp(urxBackref, groupNumber) + } + } + c.captureName = nil + } + + case doPossessivePlus: + // Possessive ++ quantifier. + // Compiles to + // 1. STO_SP + // 2. body of stuff being iterated over + // 3. STATE_SAVE 5 + // 4. JMP 2 + // 5. LD_SP + // 6. ... + // + // Note: TODO: This is pretty inefficient. A mass of saved state is built up + // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056 + // + // Emit the STO_SP + topLoc := c.blockTopLoc(true) + stoLoc := c.allocateData(1) // Reserve the data location for storing save stack ptr. + op := c.buildOp(urxStoSp, stoLoc) + c.out.compiledPat[topLoc] = op + + // Emit the STATE_SAVE + c.appendOp(urxStateSave, len(c.out.compiledPat)+2) + + // Emit the JMP + c.appendOp(urxJmp, topLoc+1) + + // Emit the LD_SP + c.appendOp(urxLdSp, stoLoc) + + case doPossessiveStar: + // Possessive *+ quantifier. + // Compiles to + // 1. STO_SP loc + // 2. STATE_SAVE 5 + // 3. body of stuff being iterated over + // 4. JMP 2 + // 5. LD_SP loc + // 6 ... + // TODO: do something to cut back the state stack each time through the loop. + // Reserve two slots at the top of the block. + topLoc := c.blockTopLoc(true) + c.insertOp(topLoc) + + // emit STO_SP loc + stoLoc := c.allocateData(1) // Reserve the data location for storing save stack ptr. + op := c.buildOp(urxStoSp, stoLoc) + c.out.compiledPat[topLoc] = op + + // Emit the SAVE_STATE 5 + L7 := len(c.out.compiledPat) + 1 + op = c.buildOp(urxStateSave, L7) + c.out.compiledPat[topLoc+1] = op + + // Append the JMP operation. + c.appendOp(urxJmp, topLoc+1) + + // Emit the LD_SP loc + c.appendOp(urxLdSp, stoLoc) + + case doPossessiveOpt: + // Possessive ?+ quantifier. + // Compiles to + // 1. STO_SP loc + // 2. SAVE_STATE 5 + // 3. body of optional block + // 4. LD_SP loc + // 5. ... + // + // Reserve two slots at the top of the block. + topLoc := c.blockTopLoc(true) + c.insertOp(topLoc) + + // Emit the STO_SP + stoLoc := c.allocateData(1) // Reserve the data location for storing save stack ptr. 
+ op := c.buildOp(urxStoSp, stoLoc) + c.out.compiledPat[topLoc] = op + + // Emit the SAVE_STATE + continueLoc := len(c.out.compiledPat) + 1 + op = c.buildOp(urxStateSave, continueLoc) + c.out.compiledPat[topLoc+1] = op + + // Emit the LD_SP + c.appendOp(urxLdSp, stoLoc) + + case doBeginMatchMode: + c.newModeFlags = c.modeFlags + c.setModeFlag = true + case doMatchMode: // (?i) and similar + var bit RegexpFlag + switch c.c.char { + case 0x69: /* 'i' */ + bit = CaseInsensitive + case 0x64: /* 'd' */ + bit = UnixLines + case 0x6d: /* 'm' */ + bit = Multiline + case 0x73: /* 's' */ + bit = DotAll + case 0x75: /* 'u' */ + bit = 0 /* Unicode casing */ + case 0x77: /* 'w' */ + bit = UWord + case 0x78: /* 'x' */ + bit = Comments + case 0x2d: /* '-' */ + c.setModeFlag = false + default: + // Should never happen. Other chars are filtered out by the scanner. + panic("unreachable") + } + if c.setModeFlag { + c.newModeFlags |= bit + } else { + c.newModeFlags &= ^bit + } + + case doSetMatchMode: + // Emit code to match any pending literals, using the not-yet changed match mode. + c.fixLiterals(false) + + // We've got a (?i) or similar. The match mode is being changed, but + // the change is not scoped to a parenthesized block. + if c.newModeFlags >= 0 { + panic("cNewModeFlags not properly initialized") + } + c.modeFlags = c.newModeFlags + + case doMatchModeParen: + // We've got a (?i: or similar. Begin a parenthesized block, save old + // mode flags so they can be restored at the close of the block. + // + // Compile to a + // - NOP, which later may be replaced by a save-state if the + // parenthesized group gets a * quantifier, followed by + // - NOP, which may later be replaced by a save-state if there + // is an '|' alternation within the parens. + c.fixLiterals(false) + c.appendOp(urxNop, 0) + c.appendOp(urxNop, 0) + + // On the Parentheses stack, start a new frame and add the postions + // of the two NOPs (a normal non-capturing () frame, except for the + // saving of the orignal mode flags.) + c.parenStack = append(c.parenStack, int(c.modeFlags)) + c.parenStack = append(c.parenStack, parenFlags) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-2) + c.parenStack = append(c.parenStack, len(c.out.compiledPat)-1) + + // Set the current mode flags to the new values. + if c.newModeFlags >= 0 { + panic("cNewModeFlags not properly initialized") + } + c.modeFlags = c.newModeFlags + + case doBadModeFlag: + c.error(InvalidFlag) + + case doSuppressComments: + // We have just scanned a '(?'. We now need to prevent the character scanner from + // treating a '#' as a to-the-end-of-line comment. + // (This Perl compatibility just gets uglier and uglier to do...) 
+ c.eolComments = false + + case doSetAddAmp: + set := c.setStack[len(c.setStack)-1] + set.AddRune(chAmp) + + case doSetAddDash: + set := c.setStack[len(c.setStack)-1] + set.AddRune(chDash) + + case doSetBackslashs: + set := c.setStack[len(c.setStack)-1] + set.AddAll(staticPropertySets[urxIsspaceSet]) + + case doSetBackslashS: + sset := uset.New() + sset.AddAll(staticPropertySets[urxIsspaceSet]) // TODO: add latin1 spaces + sset.Complement() + + set := c.setStack[len(c.setStack)-1] + set.AddAll(sset) + + case doSetBackslashd: + set := c.setStack[len(c.setStack)-1] + c.err = uprops.AddCategory(set, uchar.GcNdMask) + + case doSetBackslashD: + digits := uset.New() + c.err = uprops.ApplyIntPropertyValue(digits, uprops.UCharGeneralCategoryMask, int32(uchar.GcNdMask)) + digits.Complement() + set := c.setStack[len(c.setStack)-1] + set.AddAll(digits) + + case doSetBackslashh: + h := uset.New() + c.err = uprops.ApplyIntPropertyValue(h, uprops.UCharGeneralCategoryMask, int32(uchar.GcZsMask)) + h.AddRune(9) // Tab + + set := c.setStack[len(c.setStack)-1] + set.AddAll(h) + + case doSetBackslashH: + h := uset.New() + c.err = uprops.ApplyIntPropertyValue(h, uprops.UCharGeneralCategoryMask, int32(uchar.GcZsMask)) + h.AddRune(9) // Tab + h.Complement() + + set := c.setStack[len(c.setStack)-1] + set.AddAll(h) + + case doSetBackslashv: + set := c.setStack[len(c.setStack)-1] + set.AddRuneRange(0x0a, 0x0d) // add range + set.AddRune(0x85) + set.AddRuneRange(0x2028, 0x2029) + + case doSetBackslashV: + v := uset.New() + v.AddRuneRange(0x0a, 0x0d) // add range + v.AddRune(0x85) + v.AddRuneRange(0x2028, 0x2029) + v.Complement() + + set := c.setStack[len(c.setStack)-1] + set.AddAll(v) + + case doSetBackslashw: + set := c.setStack[len(c.setStack)-1] + set.AddAll(staticPropertySets[urxIswordSet]) + + case doSetBackslashW: + sset := uset.New() + sset.AddAll(staticPropertySets[urxIswordSet]) + sset.Complement() + + set := c.setStack[len(c.setStack)-1] + set.AddAll(sset) + + case doSetBegin: + c.fixLiterals(false) + c.setStack = append(c.setStack, uset.New()) + c.setOpStack = append(c.setOpStack, setStart) + if (c.modeFlags & CaseInsensitive) != 0 { + c.setOpStack = append(c.setOpStack, setCaseClose) + } + + case doSetBeginDifference1: + // We have scanned something like [[abc]-[ + // Set up a new UnicodeSet for the set beginning with the just-scanned '[' + // Push a Difference operator, which will cause the new set to be subtracted from what + // went before once it is created. + c.setPushOp(setDifference1) + c.setOpStack = append(c.setOpStack, setStart) + if (c.modeFlags & CaseInsensitive) != 0 { + c.setOpStack = append(c.setOpStack, setCaseClose) + } + + case doSetBeginIntersection1: + // We have scanned something like [[abc]&[ + // Need both the '&' operator and the open '[' operator. + c.setPushOp(setIntersection1) + c.setOpStack = append(c.setOpStack, setStart) + if (c.modeFlags & CaseInsensitive) != 0 { + c.setOpStack = append(c.setOpStack, setCaseClose) + } + + case doSetBeginUnion: + // We have scanned something like [[abc][ + // Need to handle the union operation explicitly [[abc] | [ + c.setPushOp(setUnion) + c.setOpStack = append(c.setOpStack, setStart) + if (c.modeFlags & CaseInsensitive) != 0 { + c.setOpStack = append(c.setOpStack, setCaseClose) + } + + case doSetDifference2: + // We have scanned something like [abc-- + // Consider this to unambiguously be a set difference operator. + c.setPushOp(setDifference2) + + case doSetEnd: + // Have encountered the ']' that closes a set. 
+ // Force the evaluation of any pending operations within this set, + // leave the completed set on the top of the set stack. + c.setEval(setEnd) + var start setOperation + start, c.setOpStack = stackPop(c.setOpStack) + if start != setStart { + panic("bad set operation in stack") + } + + case doSetFinish: + // Finished a complete set expression, including all nested sets. + // The close bracket has already triggered clearing out pending set operators, + // the operator stack should be empty and the operand stack should have just + // one entry, the result set. + if len(c.setOpStack) > 0 { + panic("expected setOpStack to be empty") + } + var set *uset.UnicodeSet + set, c.setStack = stackPop(c.setStack) + c.compileSet(set) + + case doSetIntersection2: + // Have scanned something like [abc&& + c.setPushOp(setIntersection2) + + case doSetLiteral: + // Union the just-scanned literal character into the set being built. + // This operation is the highest precedence set operation, so we can always do + // it immediately, without waiting to see what follows. It is necessary to perform + // any pending '-' or '&' operation first, because these have the same precedence + // as union-ing in a literal' + c.setEval(setUnion) + set := c.setStack[len(c.setStack)-1] + set.AddRune(c.c.char) + c.lastSetLiteral = c.c.char + + case doSetLiteralEscaped: + // A back-slash escaped literal character was encountered. + // Processing is the same as with setLiteral, above, with the addition of + // the optional check for errors on escaped ASCII letters. + if (c.modeFlags&ErrorOnUnknownEscapes) != 0 && + ((c.c.char >= 0x41 && c.c.char <= 0x5A) || // in [A-Z] + (c.c.char >= 0x61 && c.c.char <= 0x7a)) { // in [a-z] + c.error(BadEscapeSequence) + } + c.setEval(setUnion) + set := c.setStack[len(c.setStack)-1] + set.AddRune(c.c.char) + c.lastSetLiteral = c.c.char + + case doSetNamedChar: + // Scanning a \N{UNICODE CHARACTER NAME} + // Aside from the source of the character, the processing is identical to doSetLiteral, + // above. + ch := c.scanNamedChar() + c.setEval(setUnion) + set := c.setStack[len(c.setStack)-1] + set.AddRune(ch) + c.lastSetLiteral = ch + + case doSetNamedRange: + // We have scanned literal-\N{CHAR NAME}. Add the range to the set. + // The left character is already in the set, and is saved in fLastSetLiteral. + // The right side needs to be picked up, the scan is at the 'N'. + // Lower Limit > Upper limit being an error matches both Java + // and ICU UnicodeSet behavior. + ch := c.scanNamedChar() + if c.err == nil && (c.lastSetLiteral == -1 || c.lastSetLiteral > ch) { + c.error(InvalidRange) + } + set := c.setStack[len(c.setStack)-1] + set.AddRuneRange(c.lastSetLiteral, ch) + c.lastSetLiteral = ch + + case doSetNegate: + // Scanned a '^' at the start of a set. + // Push the negation operator onto the set op stack. + // A twist for case-insensitive matching: + // the case closure operation must happen _before_ negation. + // But the case closure operation will already be on the stack if it's required. + // This requires checking for case closure, and swapping the stack order + // if it is present. 
+ tosOp := c.setOpStack[len(c.setOpStack)-1] + if tosOp == setCaseClose { + _, c.setOpStack = stackPop(c.setOpStack) + c.setOpStack = append(c.setOpStack, setNegation) + c.setOpStack = append(c.setOpStack, setCaseClose) + } else { + c.setOpStack = append(c.setOpStack, setNegation) + } + + case doSetNoCloseError: + c.error(MissingCloseBracket) + + case doSetOpError: + c.error(RuleSyntax) // -- or && at the end of a set. Illegal. + + case doSetPosixProp: + if set := c.scanPosixProp(); set != nil { + c.setStack[len(c.setStack)-1].AddAll(set) + } + + case doSetProp: + // Scanned a \p \P within [brackets]. + if set := c.scanProp(); set != nil { + c.setStack[len(c.setStack)-1].AddAll(set) + } + + case doSetRange: + // We have scanned literal-literal. Add the range to the set. + // The left character is already in the set, and is saved in fLastSetLiteral. + // The right side is the current character. + // Lower Limit > Upper limit being an error matches both Java + // and ICU UnicodeSet behavior. + + if c.lastSetLiteral == -1 || c.lastSetLiteral > c.c.char { + c.error(InvalidRange) + } + c.setStack[len(c.setStack)-1].AddRuneRange(c.lastSetLiteral, c.c.char) + + default: + panic("unexpected OP in parser") + } + + return c.err == nil +} + +func uCharDigitValue(char rune) int64 { + if char >= '0' && char <= '9' { + return int64(char - '0') + } + return -1 +} + +func stackPop[T any](stack []T) (T, []T) { + var out T + if len(stack) > 0 { + out = stack[len(stack)-1] + stack = stack[:len(stack)-1] + } + return out, stack +} + +func (c *compiler) error(e CompileErrorCode) { + c.err = &CompileError{ + Code: e, + Line: c.lineNum, + Offset: c.charNum, + Context: c.out.pattern, + } +} + +func (c *compiler) stripNOPs() { + if c.err != nil { + return + } + + end := len(c.out.compiledPat) + deltas := make([]int, 0, end) + + // Make a first pass over the code, computing the amount that things + // will be offset at each location in the original code. + var loc, d int + for loc = 0; loc < end; loc++ { + deltas = append(deltas, d) + op := c.out.compiledPat[loc] + if op.typ() == urxNop { + d++ + } + } + + // Make a second pass over the code, removing the NOPs by moving following + // code up, and patching operands that refer to code locations that + // are being moved. The array of offsets from the first step is used + // to compute the new operand values. + var src, dst int + for src = 0; src < end; src++ { + op := c.out.compiledPat[src] + opType := op.typ() + + switch opType { + case urxNop: + // skip + + case urxStateSave, + urxJmp, + utxCtrLoop, + urxCtrLoopNg, + urxRelocOprnd, + urxJmpx, + urxJmpSav, + urxJmpSavX: + // These are instructions with operands that refer to code locations. 
+ operandAddress := op.value() + fixedOperandAddress := operandAddress - deltas[operandAddress] + op = c.buildOp(opType, fixedOperandAddress) + c.out.compiledPat[dst] = op + dst++ + + case urxBackref, urxBackrefI: + where := op.value() + if where > len(c.out.groupMap) { + c.error(InvalidBackRef) + break + } + + where = int(c.out.groupMap[where-1]) + op = c.buildOp(opType, where) + c.out.compiledPat[dst] = op + dst++ + c.out.needsAltInput = true + + case urxReservedOp, + urxReservedOpN, + urxBacktrack, + urxEnd, + urxOnechar, + urxString, + urxStringLen, + urxStartCapture, + urxEndCapture, + urxStaticSetref, + urxStatSetrefN, + urxSetref, + urxDotany, + urxFail, + urxBackslashB, + urxBackslashBu, + urxBackslashG, + urxBackslashX, + urxBackslashZ, + urxDotanyAll, + urxBackslashD, + urxCaret, + urxDollar, + urxCtrInit, + urxCtrInitNg, + urxDotanyUnix, + urxStoSp, + urxLdSp, + urxStoInpLoc, + urxLaStart, + urxLaEnd, + urcOnecharI, + urxStringI, + urxDollarM, + urxCaretM, + urxCaretMUnix, + urxLbStart, + urxLbCont, + urxLbEnd, + urxLbnCount, + urxLbnEnd, + urxLoopSrI, + urxLoopDotI, + urxLoopC, + urxDollarD, + urxDollarMd, + urxBackslashH, + urxBackslashR, + urxBackslashV: + // These instructions are unaltered by the relocation. + c.out.compiledPat[dst] = op + dst++ + + default: + // Some op is unaccounted for. + panic("unreachable") + } + } + + c.out.compiledPat = c.out.compiledPat[:dst] +} + +func (c *compiler) matchStartType() { + var loc int // Location in the pattern of the current op being processed. + var currentLen int32 // Minimum length of a match to this point (loc) in the pattern + var numInitialStrings int // Number of strings encountered that could match at start. + var atStart = true // True if no part of the pattern yet encountered + // could have advanced the position in a match. + // (Maximum match length so far == 0) + + // forwardedLength is a vector holding minimum-match-length values that + // are propagated forward in the pattern by JMP or STATE_SAVE operations. + // It must be one longer than the pattern being checked because some ops + // will jmp to a end-of-block+1 location from within a block, and we must + // count those when checking the block. + end := len(c.out.compiledPat) + forwardedLength := make([]int32, end+1) + + for loc = 3; loc < end; loc++ { + forwardedLength[loc] = math.MaxInt32 + } + + for loc = 3; loc < end; loc++ { + op := c.out.compiledPat[loc] + opType := op.typ() + + // The loop is advancing linearly through the pattern. + // If the op we are now at was the destination of a branch in the pattern, + // and that path has a shorter minimum length than the current accumulated value, + // replace the current accumulated value. + if forwardedLength[loc] < currentLen { + currentLen = forwardedLength[loc] + } + + switch opType { + // Ops that don't change the total length matched + case urxReservedOp, + urxEnd, + urxFail, + urxStringLen, + urxNop, + urxStartCapture, + urxEndCapture, + urxBackslashB, + urxBackslashBu, + urxBackslashG, + urxBackslashZ, + urxDollar, + urxDollarM, + urxDollarD, + urxDollarMd, + urxRelocOprnd, + urxStoInpLoc, + urxBackref, // BackRef. Must assume that it might be a zero length match + urxBackrefI, + urxStoSp, // Setup for atomic or possessive blocks. Doesn't change what can match. 
+ urxLdSp: + // skip + + case urxCaret: + if atStart { + c.out.startType = startStart + } + + case urxCaretM, urxCaretMUnix: + if atStart { + c.out.startType = startLine + } + + case urxOnechar: + if currentLen == 0 { + // This character could appear at the start of a match. + // Add it to the set of possible starting characters. + c.out.initialChars.AddRune(op.value32()) + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case urxSetref: + if currentLen == 0 { + sn := op.value() + set := c.out.sets[sn] + c.out.initialChars.AddAll(set) + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case urxLoopSrI: + // [Set]*, like a SETREF, above, in what it can match, + // but may not match at all, so currentLen is not incremented. + if currentLen == 0 { + sn := op.value() + set := c.out.sets[sn] + c.out.initialChars.AddAll(set) + numInitialStrings += 2 + } + atStart = false + + case urxLoopDotI: + if currentLen == 0 { + // .* at the start of a pattern. + // Any character can begin the match. + c.out.initialChars.Clear() + c.out.initialChars.Complement() + numInitialStrings += 2 + } + atStart = false + + case urxStaticSetref: + if currentLen == 0 { + sn := op.value() + c.out.initialChars.AddAll(staticPropertySets[sn]) + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case urxStatSetrefN: + if currentLen == 0 { + sn := op.value() + sc := uset.New() + sc.AddAll(staticPropertySets[sn]) + sc.Complement() + + c.out.initialChars.AddAll(sc) + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case urxBackslashD: + // Digit Char + if currentLen == 0 { + s := uset.New() + c.err = uprops.ApplyIntPropertyValue(s, uprops.UCharGeneralCategoryMask, int32(uchar.GcNdMask)) + if op.value() != 0 { + s.Complement() + } + c.out.initialChars.AddAll(s) + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case urxBackslashH: + // Horiz white space + if currentLen == 0 { + s := uset.New() + c.err = uprops.ApplyIntPropertyValue(s, uprops.UCharGeneralCategoryMask, int32(uchar.GcZsMask)) + s.AddRune(9) // Tab + if op.value() != 0 { + s.Complement() + } + c.out.initialChars.AddAll(s) + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case urxBackslashR, // Any line ending sequence + urxBackslashV: // Any line ending code point, with optional negation + if currentLen == 0 { + s := uset.New() + s.AddRuneRange(0x0a, 0x0d) // add range + s.AddRune(0x85) + s.AddRuneRange(0x2028, 0x2029) + if op.value() != 0 { + // Complement option applies to URX_BACKSLASH_V only. + s.Complement() + } + c.out.initialChars.AddAll(s) + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case urcOnecharI: + // Case Insensitive Single Character. + if currentLen == 0 { + ch := op.value32() + if uprops.HasBinaryProperty(ch, uprops.UCharCaseSensitive) { + starters := uset.New() + starters.AddRuneRange(ch, ch) + starters.CloseOver(uset.CaseInsensitive) + // findCaseInsensitiveStarters(c, &starters); + // For ONECHAR_I, no need to worry about text chars that expand on folding into + // strings. The expanded folding can't match the pattern. + c.out.initialChars.AddAll(starters) + } else { + // Char has no case variants. Just add it as-is to the + // set of possible starting chars. 
+ c.out.initialChars.AddRune(ch) + } + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case urxBackslashX, // Grahpeme Cluster. Minimum is 1, max unbounded. + urxDotanyAll, // . matches one or two. + urxDotany, + urxDotanyUnix: + if currentLen == 0 { + // These constructs are all bad news when they appear at the start + // of a match. Any character can begin the match. + c.out.initialChars.Clear() + c.out.initialChars.Complement() + numInitialStrings += 2 + } + currentLen = safeIncrement(currentLen, 1) + atStart = false + + case urxJmpx: + loc++ // Except for extra operand on URX_JMPX, same as URX_JMP. + fallthrough + + case urxJmp: + jmpDest := op.value() + if jmpDest < loc { + // Loop of some kind. Can safely ignore, the worst that will happen + // is that we understate the true minimum length + currentLen = forwardedLength[loc+1] + } else { + // Forward jump. Propagate the current min length to the target loc of the jump. + if forwardedLength[jmpDest] > currentLen { + forwardedLength[jmpDest] = currentLen + } + } + atStart = false + + case urxJmpSav, + urxJmpSavX: + // Combo of state save to the next loc, + jmp backwards. + // Net effect on min. length computation is nothing. + atStart = false + + case urxBacktrack: + // Fails are kind of like a branch, except that the min length was + // propagated already, by the state save. + currentLen = forwardedLength[loc+1] + atStart = false + + case urxStateSave: + // State Save, for forward jumps, propagate the current minimum. + // of the state save. + jmpDest := op.value() + if jmpDest > loc { + if currentLen < forwardedLength[jmpDest] { + forwardedLength[jmpDest] = (currentLen) + } + } + atStart = false + + case urxString: + loc++ + stringLenOp := c.out.compiledPat[loc] + stringLen := stringLenOp.value() + if currentLen == 0 { + // Add the starting character of this string to the set of possible starting + // characters for this pattern. + stringStartIdx := op.value() + ch := c.out.literalText[stringStartIdx] + c.out.initialChars.AddRune(ch) + + // Remember this string. After the entire pattern has been checked, + // if nothing else is identified that can start a match, we'll use it. + numInitialStrings++ + c.out.initialStringIdx = stringStartIdx + c.out.initialStringLen = stringLen + } + + currentLen = safeIncrement(currentLen, stringLen) + atStart = false + + case urxStringI: + // Case-insensitive string. Unlike exact-match strings, we won't + // attempt a string search for possible match positions. But we + // do update the set of possible starting characters. + loc++ + stringLenOp := c.out.compiledPat[loc] + stringLen := stringLenOp.value() + if currentLen == 0 { + // Add the starting character of this string to the set of possible starting + // characters for this pattern. + stringStartIdx := op.value() + ch := c.out.literalText[stringStartIdx] + s := uset.New() + c.findCaseInsensitiveStarters(ch, s) + c.out.initialChars.AddAll(s) + numInitialStrings += 2 // Matching on an initial string not possible. + } + currentLen = safeIncrement(currentLen, stringLen) + atStart = false + + case urxCtrInit, + urxCtrInitNg: + // Loop Init Ops. These don't change the min length, but they are 4 word ops + // so location must be updated accordingly. + // Loop Init Ops. + // If the min loop count == 0 + // move loc forwards to the end of the loop, skipping over the body. + // If the min count is > 0, + // continue normal processing of the body of the loop. 
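+			// Word layout written by compileInterval: loc holds CTR_INIT / CTR_INIT_NG,
+			// loc+1 a RELOC_OPRND with the loop-end location, loc+2 the minimum count,
+			// and loc+3 the maximum count (-1 when unbounded).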
+ loopEndLoc := c.out.compiledPat[loc+1].value() + minLoopCount := int(c.out.compiledPat[loc+2]) + if minLoopCount == 0 { + // Min Loop Count of 0, treat like a forward branch and + // move the current minimum length up to the target + // (end of loop) location. + if forwardedLength[loopEndLoc] > currentLen { + forwardedLength[loopEndLoc] = currentLen + } + } + loc += 3 // Skips over operands of CTR_INIT + atStart = false + + case utxCtrLoop, + urxCtrLoopNg: + // Loop ops. + // The jump is conditional, backwards only. + atStart = false + + case urxLoopC: + // More loop ops. These state-save to themselves. + // don't change the minimum match + atStart = false + + case urxLaStart, + urxLbStart: + // Look-around. Scan forward until the matching look-ahead end, + // without processing the look-around block. This is overly pessimistic. + + // Keep track of the nesting depth of look-around blocks. Boilerplate code for + // lookahead contains two LA_END instructions, so count goes up by two + // for each LA_START. + var depth int + if opType == urxLaStart { + depth = 2 + } else { + depth = 1 + } + for { + loc++ + op = c.out.compiledPat[loc] + if op.typ() == urxLaStart { + depth += 2 + } + if op.typ() == urxLbStart { + depth++ + } + if op.typ() == urxLaEnd || op.typ() == urxLbnEnd { + depth-- + if depth == 0 { + break + } + } + if op.typ() == urxStateSave { + // Need this because neg lookahead blocks will FAIL to outside + // of the block. + jmpDest := op.value() + if jmpDest > loc { + if currentLen < forwardedLength[jmpDest] { + forwardedLength[jmpDest] = (currentLen) + } + } + } + } + + case urxLaEnd, + urxLbCont, + urxLbEnd, + urxLbnCount, + urxLbnEnd: + panic("should be consumed in URX_LA_START") + + default: + panic("unreachable") + } + } + + // Sort out what we should check for when looking for candidate match start positions. + // In order of preference, + // 1. Start of input text buffer. + // 2. A literal string. + // 3. Start of line in multi-line mode. + // 4. A single literal character. + // 5. A character from a set of characters. + // + if c.out.startType == startStart { + // Match only at the start of an input text string. + // start type is already set. We're done. + } else if numInitialStrings == 1 && c.out.minMatchLen > 0 { + // Match beginning only with a literal string. + ch := c.out.literalText[c.out.initialStringIdx] + c.out.startType = startString + c.out.initialChar = ch + } else if c.out.startType == startLine { + // Match at start of line in Multi-Line mode. + // Nothing to do here; everything is already set. + } else if c.out.minMatchLen == 0 { + // Zero length match possible. We could start anywhere. + c.out.startType = startNoInfo + } else if c.out.initialChars.Len() == 1 { + // All matches begin with the same char. + c.out.startType = startChar + c.out.initialChar = c.out.initialChars.RuneAt(0) + } else if !c.out.initialChars.ContainsRuneRange(0, 0x10ffff) && c.out.minMatchLen > 0 { + // Matches start with a set of character smaller than the set of all chars. 
+ c.out.startType = startSet + } else { + // Matches can start with anything + c.out.startType = startNoInfo + } +} + +func (c *compiler) appendOp(typ opcode, arg int) { + c.appendIns(c.buildOp(typ, arg)) +} + +func (c *compiler) appendIns(ins instruction) { + if c.err != nil { + return + } + c.out.compiledPat = append(c.out.compiledPat, ins) +} + +func (c *compiler) buildOp(typ opcode, val int) instruction { + if c.err != nil { + return 0 + } + if val > 0x00ffffff { + panic("bad argument to buildOp") + } + if val < 0 { + if !(typ == urxReservedOpN || typ == urxReservedOp) { + panic("bad value to buildOp") + } + typ = urxReservedOpN + } + return instruction(int32(typ)<<24 | int32(val)) +} + +func (c *compiler) handleCloseParen() { + if len(c.parenStack) == 0 { + c.error(MismatchedParen) + return + } + + c.fixLiterals(false) + + var patIdx int + var patOp instruction + + for { + patIdx, c.parenStack = stackPop(c.parenStack) + if patIdx < 0 { + break + } + + patOp = c.out.compiledPat[patIdx] + if patOp.value() != 0 { + panic("branch target for JMP should not be set") + } + patOp |= instruction(len(c.out.compiledPat)) + c.out.compiledPat[patIdx] = patOp + c.matchOpenParen = patIdx + } + + var modeFlags int + modeFlags, c.parenStack = stackPop(c.parenStack) + if modeFlags >= 0 { + panic("modeFlags in paren stack was not negated") + } + + c.modeFlags = RegexpFlag(modeFlags) + + switch patIdx { + case parenPlain, parenFlags: + // No additional fixups required. + // (Grouping-only parentheses) + case parenCapturing: + // Capturing Parentheses. + // Insert a End Capture op into the pattern. + // The frame offset of the variables for this cg is obtained from the + // start capture op and put it into the end-capture op. + + captureOp := c.out.compiledPat[c.matchOpenParen+1] + if captureOp.typ() != urxStartCapture { + panic("bad type in capture op (expected URX_START_CAPTURE)") + } + frameVarLocation := captureOp.value() + c.appendOp(urxEndCapture, frameVarLocation) + + case parenAtomic: + // Atomic Parenthesis. + // Insert a LD_SP operation to restore the state stack to the position + // it was when the atomic parens were entered. + stoOp := c.out.compiledPat[c.matchOpenParen+1] + if stoOp.typ() != urxStoSp { + panic("bad type in capture op (expected URX_STO_SP)") + } + stoLoc := stoOp.value() + c.appendOp(urxLdSp, stoLoc) + + case parenLookahead: + startOp := c.out.compiledPat[c.matchOpenParen-5] + if startOp.typ() != urxLaStart { + panic("bad type in capture op (expected URX_LA_START)") + } + dataLoc := startOp.value() + c.appendOp(urxLaEnd, dataLoc) + + case parenNegLookahead: + startOp := c.out.compiledPat[c.matchOpenParen-1] + if startOp.typ() != urxLaStart { + panic("bad type in capture op (expected URX_LA_START)") + } + dataLoc := startOp.value() + c.appendOp(urxLaEnd, dataLoc) + c.appendOp(urxBacktrack, 0) + c.appendOp(urxLaEnd, dataLoc) + + // Patch the URX_SAVE near the top of the block. + // The destination of the SAVE is the final LA_END that was just added. 
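+		// Sketch of the block produced for (?!X):
+		//   LA_START
+		//   [matchOpenParen] STATE_SAVE -> final LA_END (patched just below)
+		//   ...              compiled code for X ...
+		//   LA_END
+		//   BACKTRACK        reached only when X matched, so the lookahead fails
+		//   LA_END           the SAVE destination; matching continues here when X fails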
+ saveOp := c.out.compiledPat[c.matchOpenParen] + if saveOp.typ() != urxStateSave { + panic("bad type in capture op (expected URX_STATE_SAVE)") + } + saveOp = c.buildOp(urxStateSave, len(c.out.compiledPat)-1) + c.out.compiledPat[c.matchOpenParen] = saveOp + + case parenLookBehind: + startOp := c.out.compiledPat[c.matchOpenParen-4] + if startOp.typ() != urxLbStart { + panic("bad type in capture op (expected URX_LB_START)") + } + dataLoc := startOp.value() + c.appendOp(urxLbEnd, dataLoc) + c.appendOp(urxLaEnd, dataLoc) + + // Determine the min and max bounds for the length of the + // string that the pattern can match. + // An unbounded upper limit is an error. + patEnd := len(c.out.compiledPat) - 1 + minML := c.minMatchLength(c.matchOpenParen, patEnd) + maxML := c.maxMatchLength(c.matchOpenParen, patEnd) + + if maxML == math.MaxInt32 { + c.error(LookBehindLimit) + break + } + if minML == math.MaxInt32 { + // This condition happens when no match is possible, such as with a + // [set] expression containing no elements. + // In principle, the generated code to evaluate the expression could be deleted, + // but it's probably not worth the complication. + minML = 0 + } + + c.out.compiledPat[c.matchOpenParen-2] = instruction(minML) + c.out.compiledPat[c.matchOpenParen-1] = instruction(maxML) + + case parenLookBehindN: + startOp := c.out.compiledPat[c.matchOpenParen-5] + if startOp.typ() != urxLbStart { + panic("bad type in capture op (expected URX_LB_START)") + } + dataLoc := startOp.value() + c.appendOp(urxLbnEnd, dataLoc) + + // Determine the min and max bounds for the length of the + // string that the pattern can match. + // An unbounded upper limit is an error. + patEnd := len(c.out.compiledPat) - 1 + minML := c.minMatchLength(c.matchOpenParen, patEnd) + maxML := c.maxMatchLength(c.matchOpenParen, patEnd) + + if instruction(maxML).typ() != 0 { + c.error(LookBehindLimit) + break + } + if maxML == math.MaxInt32 { + c.error(LookBehindLimit) + break + } + if minML == math.MaxInt32 { + // This condition happens when no match is possible, such as with a + // [set] expression containing no elements. + // In principle, the generated code to evaluate the expression could be deleted, + // but it's probably not worth the complication. + minML = 0 + } + + c.out.compiledPat[c.matchOpenParen-3] = instruction(minML) + c.out.compiledPat[c.matchOpenParen-2] = instruction(maxML) + + op := c.buildOp(urxRelocOprnd, len(c.out.compiledPat)) + c.out.compiledPat[c.matchOpenParen-1] = op + + default: + panic("unexpected opcode in parenStack") + } + + c.matchCloseParen = len(c.out.compiledPat) +} + +func (c *compiler) fixLiterals(split bool) { + if len(c.literalChars) == 0 { + return + } + + lastCodePoint := c.literalChars[len(c.literalChars)-1] + + // Split: We need to ensure that the last item in the compiled pattern + // refers only to the last literal scanned in the pattern, so that + // quantifiers (*, +, etc.) affect only it, and not a longer string. + // Split before case folding for case insensitive matches. 
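+	// For example, while compiling "abc+" the pending literals are "abc" when the
+	// '+' is seen; the split path emits "ab" as a string op and 'c' as its own op,
+	// so the quantifier applies to 'c' only.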
+ if split { + c.literalChars = c.literalChars[:len(c.literalChars)-1] + c.fixLiterals(false) + + c.literalChar(lastCodePoint) + c.fixLiterals(false) + return + } + + if c.modeFlags&CaseInsensitive != 0 { + c.literalChars = ucase.FoldRunes(c.literalChars) + lastCodePoint = c.literalChars[len(c.literalChars)-1] + } + + if len(c.literalChars) == 1 { + if c.modeFlags&CaseInsensitive != 0 && uprops.HasBinaryProperty(lastCodePoint, uprops.UCharCaseSensitive) { + c.appendOp(urcOnecharI, int(lastCodePoint)) + } else { + c.appendOp(urxOnechar, int(lastCodePoint)) + } + } else { + if len(c.literalChars) > 0x00ffffff || len(c.out.literalText) > 0x00ffffff { + c.error(PatternTooBig) + } + if c.modeFlags&CaseInsensitive != 0 { + c.appendOp(urxStringI, len(c.out.literalText)) + } else { + c.appendOp(urxString, len(c.out.literalText)) + } + c.appendOp(urxStringLen, len(c.literalChars)) + c.out.literalText = append(c.out.literalText, c.literalChars...) + } + + c.literalChars = c.literalChars[:0] +} + +func (c *compiler) literalChar(point rune) { + c.literalChars = append(c.literalChars, point) +} + +func (c *compiler) allocateData(size int) int { + if c.err != nil { + return 0 + } + if size <= 0 || size > 0x100 || c.out.dataSize < 0 { + c.error(InternalError) + return 0 + } + + dataIndex := c.out.dataSize + c.out.dataSize += size + if c.out.dataSize >= 0x00fffff0 { + c.error(InternalError) + } + return dataIndex +} + +func (c *compiler) allocateStackData(size int) int { + if c.err != nil { + return 0 + } + if size <= 0 || size > 0x100 || c.out.frameSize < 0 { + c.error(InternalError) + return 0 + } + dataIndex := c.out.frameSize + c.out.frameSize += size + if c.out.frameSize >= 0x00fffff0 { + c.error(InternalError) + } + return dataIndex +} + +func (c *compiler) insertOp(where int) { + if where < 0 || where >= len(c.out.compiledPat) { + panic("insertOp: out of bounds") + } + + nop := c.buildOp(urxNop, 0) + c.out.compiledPat = slices.Insert(c.out.compiledPat, where, nop) + + // Walk through the pattern, looking for any ops with targets that + // were moved down by the insert. Fix them. + for loc, op := range c.out.compiledPat { + switch op.typ() { + case urxJmp, urxJmpx, urxStateSave, utxCtrLoop, urxCtrLoopNg, urxJmpSav, urxJmpSavX, urxRelocOprnd: + if op.value() > where { + op = c.buildOp(op.typ(), op.value()+1) + c.out.compiledPat[loc] = op + } + } + } + + // Now fix up the parentheses stack. All positive values in it are locations in + // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.) + for loc, x := range c.parenStack { + if x > where { + c.parenStack[loc] = x + 1 + } + } + + if c.matchCloseParen > where { + c.matchCloseParen++ + } + if c.matchOpenParen > where { + c.matchOpenParen++ + } +} + +func (c *compiler) blockTopLoc(reserve bool) int { + var loc int + c.fixLiterals(true) + + if len(c.out.compiledPat) == c.matchCloseParen { + // The item just processed is a parenthesized block. + loc = c.matchOpenParen + } else { + // Item just compiled is a single thing, a ".", or a single char, a string or a set reference. + // No slot for STATE_SAVE was pre-reserved in the compiled code. + // We need to make space now. + loc = len(c.out.compiledPat) - 1 + op := c.out.compiledPat[loc] + if op.typ() == urxStringLen { + // Strings take two opcode, we want the position of the first one. + // We can have a string at this point if a single character case-folded to two. 
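+			// Step back from the URX_STRING_LEN operand to its preceding URX_STRING / URX_STRING_I op.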
+ loc-- + } + if reserve { + nop := c.buildOp(urxNop, 0) + c.out.compiledPat = slices.Insert(c.out.compiledPat, loc, nop) + } + } + return loc +} + +func (c *compiler) compileInlineInterval() bool { + if c.intervalUpper > 10 || c.intervalUpper < c.intervalLow { + return false + } + + topOfBlock := c.blockTopLoc(false) + if c.intervalUpper == 0 { + // Pathological case. Attempt no matches, as if the block doesn't exist. + // Discard the generated code for the block. + // If the block included parens, discard the info pertaining to them as well. + c.out.compiledPat = c.out.compiledPat[:topOfBlock] + if c.matchOpenParen >= topOfBlock { + c.matchOpenParen = -1 + } + if c.matchCloseParen >= topOfBlock { + c.matchCloseParen = -1 + } + return true + } + + if topOfBlock != len(c.out.compiledPat)-1 && c.intervalUpper != 1 { + // The thing being repeated is not a single op, but some + // more complex block. Do it as a loop, not inlines. + // Note that things "repeated" a max of once are handled as inline, because + // the one copy of the code already generated is just fine. + return false + } + + // Pick up the opcode that is to be repeated + // + op := c.out.compiledPat[topOfBlock] + + // Compute the pattern location where the inline sequence + // will end, and set up the state save op that will be needed. + // + endOfSequenceLoc := len(c.out.compiledPat) - 1 + c.intervalUpper + (c.intervalUpper - c.intervalLow) + + saveOp := c.buildOp(urxStateSave, endOfSequenceLoc) + if c.intervalLow == 0 { + c.insertOp(topOfBlock) + c.out.compiledPat[topOfBlock] = saveOp + } + + // Loop, emitting the op for the thing being repeated each time. + // Loop starts at 1 because one instance of the op already exists in the pattern, + // it was put there when it was originally encountered. + for i := 1; i < c.intervalUpper; i++ { + if i >= c.intervalLow { + c.appendIns(saveOp) + } + c.appendIns(op) + } + return true +} + +func (c *compiler) compileInterval(init opcode, loop opcode) { + // The CTR_INIT op at the top of the block with the {n,m} quantifier takes + // four slots in the compiled code. Reserve them. + topOfBlock := c.blockTopLoc(true) + c.insertOp(topOfBlock) + c.insertOp(topOfBlock) + c.insertOp(topOfBlock) + + // The operands for the CTR_INIT opcode include the index in the matcher data + // of the counter. Allocate it now. There are two data items + // counterLoc --> Loop counter + // +1 --> Input index (for breaking non-progressing loops) + // (Only present if unbounded upper limit on loop) + var dataSize int + if c.intervalUpper < 0 { + dataSize = 2 + } else { + dataSize = 1 + } + counterLoc := c.allocateStackData(dataSize) + + op := c.buildOp(init, counterLoc) + c.out.compiledPat[topOfBlock] = op + + // The second operand of CTR_INIT is the location following the end of the loop. + // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the + // compilation of something later on causes the code to grow and the target + // position to move. + loopEnd := len(c.out.compiledPat) + op = c.buildOp(urxRelocOprnd, loopEnd) + c.out.compiledPat[topOfBlock+1] = op + + // Followed by the min and max counts. + c.out.compiledPat[topOfBlock+2] = instruction(c.intervalLow) + c.out.compiledPat[topOfBlock+3] = instruction(c.intervalUpper) + + // Append the CTR_LOOP op. The operand is the location of the CTR_INIT op. + // Goes at end of the block being looped over, so just append to the code so far. 
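+	// Rough shape of the compiled code for a quantifier such as x{2,5}:
+	//   [topOfBlock]   CTR_INIT (or CTR_INIT_NG)  operand: counterLoc
+	//   [topOfBlock+1] RELOC_OPRND -> loop end
+	//   [topOfBlock+2] 2   (minimum count)
+	//   [topOfBlock+3] 5   (maximum count, -1 for unbounded)
+	//   ...            already-compiled code for x ...
+	//   CTR_LOOP       operand: topOfBlock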
+ c.appendOp(loop, topOfBlock) + + if (c.intervalLow&0xff000000) != 0 || (c.intervalUpper > 0 && (c.intervalUpper&0xff000000) != 0) { + c.error(NumberTooBig) + } + + if c.intervalLow > c.intervalUpper && c.intervalUpper != -1 { + c.error(MaxLtMin) + } +} + +func (c *compiler) scanNamedChar() rune { + c.nextChar(&c.c) + if c.c.char != chLBrace { + c.error(PropertySyntax) + return 0 + } + + var charName []rune + for { + c.nextChar(&c.c) + if c.c.char == chRBrace { + break + } + if c.c.char == -1 { + c.error(PropertySyntax) + return 0 + } + charName = append(charName, c.c.char) + } + + if !isInvariantUString(charName) { + // All Unicode character names have only invariant characters. + // The API to get a character, given a name, accepts only char *, forcing us to convert, + // which requires this error check + c.error(PropertySyntax) + return 0 + } + + theChar := unames.CharForName(unames.UnicodeCharName, string(charName)) + if c.err != nil { + c.error(PropertySyntax) + } + + c.nextChar(&c.c) // Continue overall regex pattern processing with char after the '}' + return theChar +} + +func isInvariantUString(name []rune) bool { + for _, c := range name { + /* + * no assertions here because these functions are legitimately called + * for strings with variant characters + */ + if !ucharIsInvariant(c) { + return false /* found a variant char */ + } + } + return true +} + +var invariantChars = [...]uint32{ + 0xfffffbff, /* 00..1f but not 0a */ + 0xffffffe5, /* 20..3f but not 21 23 24 */ + 0x87fffffe, /* 40..5f but not 40 5b..5e */ + 0x87fffffe, /* 60..7f but not 60 7b..7e */ +} + +func ucharIsInvariant(c rune) bool { + return c <= 0x7f && (invariantChars[(c)>>5]&(uint32(1)<<(c&0x1f))) != 0 +} + +func (c *compiler) setPushOp(op setOperation) { + c.setEval(op) + c.setOpStack = append(c.setOpStack, op) + c.setStack = append(c.setStack, uset.New()) +} + +func (c *compiler) setEval(nextOp setOperation) { + var rightOperand *uset.UnicodeSet + var leftOperand *uset.UnicodeSet + + for { + pendingSetOp := c.setOpStack[len(c.setOpStack)-1] + if (pendingSetOp & 0xffff0000) < (nextOp & 0xffff0000) { + break + } + + c.setOpStack = c.setOpStack[:len(c.setOpStack)-1] + rightOperand = c.setStack[len(c.setStack)-1] + + switch pendingSetOp { + case setNegation: + rightOperand.Complement() + + case setCaseClose: + rightOperand.CloseOver(uset.CaseInsensitive) + + case setDifference1, setDifference2: + c.setStack = c.setStack[:len(c.setStack)-1] + leftOperand = c.setStack[len(c.setStack)-1] + leftOperand.RemoveAll(rightOperand) + + case setIntersection1, setIntersection2: + c.setStack = c.setStack[:len(c.setStack)-1] + leftOperand = c.setStack[len(c.setStack)-1] + leftOperand.RetainAll(rightOperand) + + case setUnion: + c.setStack = c.setStack[:len(c.setStack)-1] + leftOperand = c.setStack[len(c.setStack)-1] + leftOperand.AddAll(rightOperand) + + default: + panic("unreachable") + } + } +} + +func safeIncrement(val int32, delta int) int32 { + if delta <= math.MaxInt32 && math.MaxInt32-val > int32(delta) { + return val + int32(delta) + } + return math.MaxInt32 +} + +func (c *compiler) minMatchLength(start, end int) int32 { + if c.err != nil { + return 0 + } + + var loc int + var currentLen int32 + + // forwardedLength is a vector holding minimum-match-length values that + // are propagated forward in the pattern by JMP or STATE_SAVE operations. 
+ // It must be one longer than the pattern being checked because some ops + // will jmp to a end-of-block+1 location from within a block, and we must + // count those when checking the block. + forwardedLength := make([]int32, end+2) + for i := range forwardedLength { + forwardedLength[i] = math.MaxInt32 + } + + for loc = start; loc <= end; loc++ { + op := c.out.compiledPat[loc] + opType := op.typ() + + // The loop is advancing linearly through the pattern. + // If the op we are now at was the destination of a branch in the pattern, + // and that path has a shorter minimum length than the current accumulated value, + // replace the current accumulated value. + // no-match-possible cases. + if forwardedLength[loc] < currentLen { + currentLen = forwardedLength[loc] + } + + switch opType { + // Ops that don't change the total length matched + case urxReservedOp, + urxEnd, + urxStringLen, + urxNop, + urxStartCapture, + urxEndCapture, + urxBackslashB, + urxBackslashBu, + urxBackslashG, + urxBackslashZ, + urxCaret, + urxDollar, + urxDollarM, + urxDollarD, + urxDollarMd, + urxRelocOprnd, + urxStoInpLoc, + urxCaretM, + urxCaretMUnix, + urxBackref, // BackRef. Must assume that it might be a zero length match + urxBackrefI, + urxStoSp, // Setup for atomic or possessive blocks. Doesn't change what can match. + urxLdSp, + urxJmpSav, + urxJmpSavX: + // no-op + + // Ops that match a minimum of one character (one or two 16 bit code units.) + // + case urxOnechar, + urxStaticSetref, + urxStatSetrefN, + urxSetref, + urxBackslashD, + urxBackslashH, + urxBackslashR, + urxBackslashV, + urcOnecharI, + urxBackslashX, // Grahpeme Cluster. Minimum is 1, max unbounded. + urxDotanyAll, // . matches one or two. + urxDotany, + urxDotanyUnix: + currentLen = safeIncrement(currentLen, 1) + + case urxJmpx: + loc++ // URX_JMPX has an extra operand, ignored here, otherwise processed identically to URX_JMP. + fallthrough + + case urxJmp: + jmpDest := op.value() + if jmpDest < loc { + // Loop of some kind. Can safely ignore, the worst that will happen + // is that we understate the true minimum length + currentLen = forwardedLength[loc+1] + } else { + // Forward jump. Propagate the current min length to the target loc of the jump. + if forwardedLength[jmpDest] > currentLen { + forwardedLength[jmpDest] = currentLen + } + } + + case urxBacktrack: + // Back-tracks are kind of like a branch, except that the min length was + // propagated already, by the state save. + currentLen = forwardedLength[loc+1] + + case urxStateSave: + // State Save, for forward jumps, propagate the current minimum. + // of the state save. + jmpDest := op.value() + if jmpDest > loc { + if currentLen < forwardedLength[jmpDest] { + forwardedLength[jmpDest] = currentLen + } + } + + case urxString: + loc++ + stringLenOp := c.out.compiledPat[loc] + currentLen = safeIncrement(currentLen, stringLenOp.value()) + + case urxStringI: + loc++ + // TODO: with full case folding, matching input text may be shorter than + // the string we have here. More smarts could put some bounds on it. + // Assume a min length of one for now. A min length of zero causes + // optimization failures for a pattern like "string"+ + // currentLen += URX_VAL(stringLenOp); + currentLen = safeIncrement(currentLen, 1) + + case urxCtrInit, urxCtrInitNg: + // Loop Init Ops. + // If the min loop count == 0 + // move loc forwards to the end of the loop, skipping over the body. + // If the min count is > 0, + // continue normal processing of the body of the loop. 
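+			// (loc+1 holds the loop-end location, loc+2 the minimum count; see compileInterval.)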
+ loopEndOp := c.out.compiledPat[loc+1] + loopEndLoc := loopEndOp.value() + minLoopCount := c.out.compiledPat[loc+2] + if minLoopCount == 0 { + loc = loopEndLoc + } else { + loc += 3 // Skips over operands of CTR_INIT + } + + case utxCtrLoop, urxCtrLoopNg: + // Loop ops. The jump is conditional, backwards only. + + case urxLoopSrI, urxLoopDotI, urxLoopC: + // More loop ops. These state-save to themselves. don't change the minimum match - could match nothing at all. + + case urxLaStart, urxLbStart: + // Look-around. Scan forward until the matching look-ahead end, + // without processing the look-around block. This is overly pessimistic for look-ahead, + // it assumes that the look-ahead match might be zero-length. + // TODO: Positive lookahead could recursively do the block, then continue + // with the longer of the block or the value coming in. Ticket 6060 + var depth int32 + if opType == urxLaStart { + depth = 2 + } else { + depth = 1 + } + + for { + loc++ + op = c.out.compiledPat[loc] + if op.typ() == urxLaStart { + // The boilerplate for look-ahead includes two LA_END insturctions, + // Depth will be decremented by each one when it is seen. + depth += 2 + } + if op.typ() == urxLbStart { + depth++ + } + if op.typ() == urxLaEnd { + depth-- + if depth == 0 { + break + } + } + if op.typ() == urxLbnEnd { + depth-- + if depth == 0 { + break + } + } + if op.typ() == urxStateSave { + // Need this because neg lookahead blocks will FAIL to outside of the block. + jmpDest := op.value() + if jmpDest > loc { + if currentLen < forwardedLength[jmpDest] { + forwardedLength[jmpDest] = currentLen + } + } + } + } + + case urxLaEnd, urxLbCont, urxLbEnd, urxLbnCount, urxLbnEnd: + // Only come here if the matching URX_LA_START or URX_LB_START was not in the + // range being sized, which happens when measuring size of look-behind blocks. + + default: + panic("unreachable") + } + } + + // We have finished walking through the ops. Check whether some forward jump + // propagated a shorter length to location end+1. + if forwardedLength[end+1] < currentLen { + currentLen = forwardedLength[end+1] + } + + return currentLen +} + +func (c *compiler) maxMatchLength(start, end int) int32 { + if c.err != nil { + return 0 + } + var loc int + var currentLen int32 + + forwardedLength := make([]int32, end+1) + + for loc = start; loc <= end; loc++ { + op := c.out.compiledPat[loc] + opType := op.typ() + + // The loop is advancing linearly through the pattern. + // If the op we are now at was the destination of a branch in the pattern, + // and that path has a longer maximum length than the current accumulated value, + // replace the current accumulated value. + if forwardedLength[loc] > currentLen { + currentLen = forwardedLength[loc] + } + + switch opType { + // Ops that don't change the total length matched + case urxReservedOp, + urxEnd, + urxStringLen, + urxNop, + urxStartCapture, + urxEndCapture, + urxBackslashB, + urxBackslashBu, + urxBackslashG, + urxBackslashZ, + urxCaret, + urxDollar, + urxDollarM, + urxDollarD, + urxDollarMd, + urxRelocOprnd, + urxStoInpLoc, + urxCaretM, + urxCaretMUnix, + urxStoSp, // Setup for atomic or possessive blocks. Doesn't change what can match. + urxLdSp, + urxLbEnd, + urxLbCont, + urxLbnCount, + urxLbnEnd: + // no-op + + // Ops that increase that cause an unbounded increase in the length + // of a matched string, or that increase it a hard to characterize way. + // Call the max length unbounded, and stop further checking. + case urxBackref, // BackRef. 
Must assume that it might be a zero length match + urxBackrefI, + urxBackslashX: // Grahpeme Cluster. Minimum is 1, max unbounded. + currentLen = math.MaxInt32 + + // Ops that match a max of one character (possibly two 16 bit code units.) + // + case urxStaticSetref, + urxStatSetrefN, + urxSetref, + urxBackslashD, + urxBackslashH, + urxBackslashR, + urxBackslashV, + urcOnecharI, + urxDotanyAll, + urxDotany, + urxDotanyUnix: + currentLen = safeIncrement(currentLen, 2) + + // Single literal character. Increase current max length by one or two, + // depending on whether the char is in the supplementary range. + case urxOnechar: + currentLen = safeIncrement(currentLen, 1) + if op.value() > 0x10000 { + currentLen = safeIncrement(currentLen, 1) + } + + // Jumps. + // + case urxJmp, urxJmpx, urxJmpSav, urxJmpSavX: + jmpDest := op.value() + if jmpDest < loc { + // Loop of some kind. Max match length is unbounded. + currentLen = math.MaxInt32 + } else { + // Forward jump. Propagate the current min length to the target loc of the jump. + if forwardedLength[jmpDest] < currentLen { + forwardedLength[jmpDest] = currentLen + } + currentLen = 0 + } + + case urxBacktrack: + // back-tracks are kind of like a branch, except that the max length was + // propagated already, by the state save. + currentLen = forwardedLength[loc+1] + + case urxStateSave: + // State Save, for forward jumps, propagate the current minimum. + // of the state save. + // For backwards jumps, they create a loop, maximum + // match length is unbounded. + jmpDest := op.value() + if jmpDest > loc { + if currentLen > forwardedLength[jmpDest] { + forwardedLength[jmpDest] = currentLen + } + } else { + currentLen = math.MaxInt32 + } + + case urxString: + loc++ + stringLenOp := c.out.compiledPat[loc] + currentLen = safeIncrement(currentLen, stringLenOp.value()) + + case urxStringI: + // TODO: This code assumes that any user string that matches will be no longer + // than our compiled string, with case insensitive matching. + // Our compiled string has been case-folded already. + // + // Any matching user string will have no more code points than our + // compiled (folded) string. Folding may add code points, but + // not remove them. + // + // There is a potential problem if a supplemental code point + // case-folds to a BMP code point. In this case our compiled string + // could be shorter (in code units) than a matching user string. + // + // At this time (Unicode 6.1) there are no such characters, and this case + // is not being handled. A test, intltest regex/Bug9283, will fail if + // any problematic characters are added to Unicode. + // + // If this happens, we can make a set of the BMP chars that the + // troublesome supplementals fold to, scan our string, and bump the + // currentLen one extra for each that is found. + // + loc++ + stringLenOp := c.out.compiledPat[loc] + currentLen = safeIncrement(currentLen, stringLenOp.value()) + + case urxCtrInit, urxCtrInitNg: + // For Loops, recursively call this function on the pattern for the loop body, + // then multiply the result by the maximum loop count. + loopEndLoc := c.out.compiledPat[loc+1].value() + if loopEndLoc == loc+4 { + // Loop has an empty body. No affect on max match length. + // Continue processing with code after the loop end. + loc = loopEndLoc + break + } + + maxLoopCount := int(c.out.compiledPat[loc+3]) + if maxLoopCount == -1 { + // Unbounded Loop. No upper bound on match length. 
+ currentLen = math.MaxInt32 + break + } + + blockLen := c.maxMatchLength(loc+4, loopEndLoc-1) // Recursive call. + updatedLen := int(currentLen) + int(blockLen)*maxLoopCount + if updatedLen >= math.MaxInt32 { + currentLen = math.MaxInt32 + break + } + currentLen = int32(updatedLen) + loc = loopEndLoc + + case utxCtrLoop, urxCtrLoopNg: + panic("should not encounter this opcode") + + case urxLoopSrI, urxLoopDotI, urxLoopC: + // For anything to do with loops, make the match length unbounded. + currentLen = math.MaxInt32 + + case urxLaStart, urxLaEnd: + // Look-ahead. Just ignore, treat the look-ahead block as if + // it were normal pattern. Gives a too-long match length, + // but good enough for now. + + case urxLbStart: + // Look-behind. Scan forward until the matching look-around end, + // without processing the look-behind block. + dataLoc := op.value() + for loc = loc + 1; loc <= end; loc++ { + op = c.out.compiledPat[loc] + if (op.typ() == urxLaEnd || op.typ() == urxLbnEnd) && (op.value() == dataLoc) { + break + } + } + + default: + panic("unreachable") + } + + if currentLen == math.MaxInt32 { + // The maximum length is unbounded. + // Stop further processing of the pattern. + break + } + } + + return currentLen +} + +// Machine Generated below. +// It may need updating with new versions of Unicode. +// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed. +// The update tool is here: +// svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing + +// Machine Generated Data. Do not hand edit. +var reCaseFixCodePoints = [...]rune{ + 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, + 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, + 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07, + 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61, + 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000} + +var reCaseFixStringOffsets = [...]int16{ + 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, + 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, + 0x3d, 0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, + 0x57, 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0} + +var reCaseFixCounts = [...]int16{ + 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, + 0x4, 0x1, 0x1, 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, + 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0} + +var reCaseFixData = [...]uint16{ + 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf, 0x1e9e, 0xfb05, + 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3, 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, + 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3, 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, + 0x1f54, 0x1f56, 0x1fe2, 0x1fe3, 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, + 0xfb13, 0xfb14, 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83, + 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90, 0x1f98, 0x1f91, + 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95, 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, + 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2, 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, + 0x1fad, 0x1fa6, 0x1fae, 0x1fa7, 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0} + +func (c *compiler) 
findCaseInsensitiveStarters(ch rune, starterChars *uset.UnicodeSet) { + if uprops.HasBinaryProperty(ch, uprops.UCharCaseSensitive) { + caseFoldedC := ucase.Fold(ch) + starterChars.Clear() + starterChars.AddRune(caseFoldedC) + + var i int + for i = 0; reCaseFixCodePoints[i] < ch; i++ { + // Simple linear search through the sorted list of interesting code points. + } + + if reCaseFixCodePoints[i] == ch { + data := reCaseFixData[reCaseFixStringOffsets[i]:] + numCharsToAdd := reCaseFixCounts[i] + for j := int16(0); j < numCharsToAdd; j++ { + var cpToAdd rune + cpToAdd, data = utf16.NextUnsafe(data) + starterChars.AddRune(cpToAdd) + } + } + + starterChars.CloseOver(uset.CaseInsensitive) + } else { + // Not a cased character. Just return it alone. + starterChars.Clear() + starterChars.AddRune(ch) + } +} + +func (c *compiler) scanProp() *uset.UnicodeSet { + if c.err != nil { + return nil + } + negated := c.c.char == chP + + c.nextChar(&c.c) + if c.c.char != chLBrace { + c.error(PropertySyntax) + return nil + } + + var propertyName strings.Builder + for { + c.nextChar(&c.c) + if c.c.char == chRBrace { + break + } + if c.c.char == -1 { + c.error(PropertySyntax) + return nil + } + propertyName.WriteRune(c.c.char) + } + + ss := c.createSetForProperty(propertyName.String(), negated) + c.nextChar(&c.c) + return ss +} + +func (c *compiler) createSetForProperty(propName string, negated bool) *uset.UnicodeSet { + if c.err != nil { + return nil + } + + var set *uset.UnicodeSet + + var usetFlags uset.USet + if c.modeFlags&CaseInsensitive != 0 { + usetFlags |= uset.CaseInsensitive + } + + var err error + set, err = uprops.NewUnicodeSetFomPattern("\\p{"+propName+"}", usetFlags) + if err == nil { + goto done + } + + // + // The incoming property wasn't directly recognized by ICU. + + // Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet. + // Java accepts 'word' with mixed case. + // Java accepts 'all' only in all lower case. + if strings.EqualFold(propName, "word") { + set = staticPropertySets[urxIswordSet].Clone() + goto done + } + if propName == "all" { + set = uset.New() + set.AddRuneRange(0, 0x10ffff) + goto done + } + + // Do Java InBlock expressions + // + if strings.HasPrefix(propName, "In") && len(propName) >= 3 { + set = uset.New() + if uprops.ApplyPropertyAlias(set, "Block", propName[2:]) != nil { + c.error(PropertySyntax) + } + goto done + } + + // Check for the Java form "IsBooleanPropertyValue", which we will recast + // as "BooleanPropertyValue". The property value can be either a + // a General Category or a Script Name. + if strings.HasPrefix(propName, "Is") && len(propName) >= 3 { + mPropName := propName[2:] + if strings.IndexByte(mPropName, '=') >= 0 { + c.error(PropertySyntax) + goto done + } + + if strings.EqualFold(mPropName, "assigned") { + mPropName = "unassigned" + negated = !negated + } else if strings.EqualFold(mPropName, "TitleCase") { + mPropName = "Titlecase_Letter" + } + + set, err = uprops.NewUnicodeSetFomPattern("\\p{"+mPropName+"}", 0) + if err != nil { + c.error(PropertySyntax) + } else if !set.IsEmpty() && (usetFlags&uset.CaseInsensitive) != 0 { + set.CloseOver(uset.CaseInsensitive) + } + goto done + } + + if strings.HasPrefix(propName, "java") { + set = uset.New() + + // + // Try the various Java specific properties. 
+ // These all begin with "java" + // + if propName == "javaDefined" { + c.err = uprops.AddCategory(set, uchar.GcCnMask) + set.Complement() + } else if propName == "javaDigit" { + c.err = uprops.AddCategory(set, uchar.GcNdMask) + } else if propName == "javaIdentifierIgnorable" { + c.err = addIdentifierIgnorable(set) + } else if propName == "javaISOControl" { + set.AddRuneRange(0, 0x1F) + set.AddRuneRange(0x7F, 0x9F) + } else if propName == "javaJavaIdentifierPart" { + c.err = uprops.AddCategory(set, uchar.GcLMask) + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcScMask) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcPcMask) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcNdMask) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcNlMask) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcMcMask) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcMnMask) + } + if c.err == nil { + c.err = addIdentifierIgnorable(set) + } + } else if propName == "javaJavaIdentifierStart" { + c.err = uprops.AddCategory(set, uchar.GcLMask) + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcNlMask) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcScMask) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcPcMask) + } + } else if propName == "javaLetter" { + c.err = uprops.AddCategory(set, uchar.GcLMask) + } else if propName == "javaLetterOrDigit" { + c.err = uprops.AddCategory(set, uchar.GcLMask) + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcNdMask) + } + } else if propName == "javaLowerCase" { + c.err = uprops.AddCategory(set, uchar.GcLlMask) + } else if propName == "javaMirrored" { + c.err = uprops.ApplyIntPropertyValue(set, uprops.UCharBidiMirrored, 1) + } else if propName == "javaSpaceChar" { + c.err = uprops.AddCategory(set, uchar.GcZMask) + } else if propName == "javaSupplementaryCodePoint" { + set.AddRuneRange(0x10000, uset.MaxValue) + } else if propName == "javaTitleCase" { + c.err = uprops.AddCategory(set, uchar.GcLtMask) + } else if propName == "javaUnicodeIdentifierStart" { + c.err = uprops.AddCategory(set, uchar.GcLMask) + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcNlMask) + } + } else if propName == "javaUnicodeIdentifierPart" { + c.err = uprops.AddCategory(set, uchar.GcLMask) + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcPcMask) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcNdMask) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcNlMask) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcMcMask) + } + if c.err == nil { + c.err = uprops.AddCategory(set, uchar.GcMnMask) + } + if c.err == nil { + c.err = addIdentifierIgnorable(set) + } + } else if propName == "javaUpperCase" { + c.err = uprops.AddCategory(set, uchar.GcLuMask) + } else if propName == "javaValidCodePoint" { + set.AddRuneRange(0, uset.MaxValue) + } else if propName == "javaWhitespace" { + c.err = uprops.AddCategory(set, uchar.GcZMask) + excl := uset.New() + excl.AddRune(0x0a) + excl.AddRune(0x2007) + excl.AddRune(0x202f) + set.RemoveAll(excl) + set.AddRuneRange(9, 0x0d) + set.AddRuneRange(0x1c, 0x1f) + } else { + c.error(PropertySyntax) + } + + if c.err == nil && !set.IsEmpty() && (usetFlags&uset.CaseInsensitive) != 0 { + set.CloseOver(uset.CaseInsensitive) + } + goto done + } + + // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility + // extensions matched it. 
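+	// (e.g. \p{NoSuchProperty} or [:bogus:] ends up here.)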
+ c.error(PropertySyntax) + +done: + if c.err != nil { + return nil + } + if negated { + set.Complement() + } + return set +} + +func addIdentifierIgnorable(set *uset.UnicodeSet) error { + set.AddRuneRange(0, 8) + set.AddRuneRange(0x0e, 0x1b) + set.AddRuneRange(0x7f, 0x9f) + + return uprops.AddCategory(set, uchar.GcCfMask) +} + +func (c *compiler) scanPosixProp() *uset.UnicodeSet { + var set *uset.UnicodeSet + + if !(c.c.char == chColon) { + panic("assertion failed: c.lastChar == ':'") + } + + savedScanIndex := c.scanIndex + savedScanPattern := c.p + savedQuoteMode := c.quoteMode + savedInBackslashQuote := c.inBackslashQuote + savedEOLComments := c.eolComments + savedLineNum := c.lineNum + savedCharNum := c.charNum + savedLastChar := c.lastChar + savedPeekChar := c.peekChar + savedC := c.c + + // Scan for a closing ]. A little tricky because there are some perverse + // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression, + // ending on the second closing ]. + var propName []rune + negated := false + + // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:] + c.nextChar(&c.c) + if c.c.char == chUp { + negated = true + c.nextChar(&c.c) + } + + // Scan for the closing ":]", collecting the property name along the way. + sawPropSetTerminator := false + for { + propName = append(propName, c.c.char) + c.nextChar(&c.c) + if c.c.quoted || c.c.char == -1 { + // Escaped characters or end of input - either says this isn't a [:Property:] + break + } + if c.c.char == chColon { + c.nextChar(&c.c) + if c.c.char == chRBracket { + sawPropSetTerminator = true + break + } + } + } + + if sawPropSetTerminator { + set = c.createSetForProperty(string(propName), negated) + } else { + // No closing ']' - not a [:Property:] + // Restore the original scan position. + // The main scanner will retry the input as a normal set expression, + // not a [:Property:] expression. + c.scanIndex = savedScanIndex + c.p = savedScanPattern + c.quoteMode = savedQuoteMode + c.inBackslashQuote = savedInBackslashQuote + c.eolComments = savedEOLComments + c.lineNum = savedLineNum + c.charNum = savedCharNum + c.lastChar = savedLastChar + c.peekChar = savedPeekChar + c.c = savedC + } + + return set +} + +func (c *compiler) compileSet(set *uset.UnicodeSet) { + if set == nil { + return + } + // Remove any strings from the set. + // There shoudn't be any, but just in case. + // (Case Closure can add them; if we had a simple case closure available that + // ignored strings, that would be better.) + setSize := set.Len() + + switch setSize { + case 0: + // Set of no elements. Always fails to match. + c.appendOp(urxBacktrack, 0) + + case 1: + // The set contains only a single code point. Put it into + // the compiled pattern as a single char operation rather + // than a set, and discard the set itself. + c.literalChar(set.RuneAt(0)) + + default: + // The set contains two or more chars. (the normal case) + // Put it into the compiled pattern as a set. + // theSet->freeze(); + setNumber := len(c.out.sets) + c.out.sets = append(c.out.sets, set) + c.appendOp(urxSetref, setNumber) + } +} diff --git a/go/mysql/icuregex/compiler_table.go b/go/mysql/icuregex/compiler_table.go new file mode 100644 index 00000000000..e8cfe0d5e55 --- /dev/null +++ b/go/mysql/icuregex/compiler_table.go @@ -0,0 +1,357 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. 
+ +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex + +type patternParseAction uint8 + +const ( + doSetBackslashD patternParseAction = iota + doBackslashh + doBackslashH + doSetLiteralEscaped + doOpenLookAheadNeg + doCompleteNamedBackRef + doPatStart + doBackslashS + doBackslashD + doNGStar + doNOP + doBackslashX + doSetLiteral + doContinueNamedCapture + doBackslashG + doBackslashR + doSetBegin + doSetBackslashv + doPossessivePlus + doPerlInline + doBackslashZ + doSetAddAmp + doSetBeginDifference1 + doIntervalError + doSetNegate + doIntervalInit + doSetIntersection2 + doPossessiveInterval + doRuleError + doBackslashW + doContinueNamedBackRef + doOpenNonCaptureParen + doExit + doSetNamedChar + doSetBackslashV + doConditionalExpr + doEscapeError + doBadOpenParenType + doPossessiveStar + doSetAddDash + doEscapedLiteralChar + doSetBackslashw + doIntervalUpperDigit + doBackslashv + doSetBackslashS + doSetNoCloseError + doSetProp + doBackslashB + doSetEnd + doSetRange + doMatchModeParen + doPlus + doBackslashV + doSetMatchMode + doBackslashz + doSetNamedRange + doOpenLookBehindNeg + doInterval + doBadNamedCapture + doBeginMatchMode + doBackslashd + doPatFinish + doNamedChar + doNGPlus + doSetDifference2 + doSetBackslashH + doCloseParen + doDotAny + doOpenCaptureParen + doEnterQuoteMode + doOpenAtomicParen + doBadModeFlag + doSetBackslashd + doSetFinish + doProperty + doBeginNamedBackRef + doBackRef + doOpt + doDollar + doBeginNamedCapture + doNGInterval + doSetOpError + doSetPosixProp + doSetBeginIntersection1 + doBackslashb + doSetBeginUnion + doIntevalLowerDigit + doSetBackslashh + doStar + doMatchMode + doBackslashA + doOpenLookBehind + doPossessiveOpt + doOrOperator + doBackslashw + doBackslashs + doLiteralChar + doSuppressComments + doCaret + doIntervalSame + doNGOpt + doOpenLookAhead + doSetBackslashW + doMismatchedParenErr + doSetBackslashs + rbbiLastAction +) + +// ------------------------------------------------------------------------------- +// +// RegexTableEl represents the structure of a row in the transition table +// for the pattern parser state machine. +// +// ------------------------------------------------------------------------------- +type regexTableEl struct { + action patternParseAction + charClass uint8 + nextState uint8 + pushState uint8 + nextChar bool +} + +var parseStateTable = []regexTableEl{ + {doNOP, 0, 0, 0, true}, + {doPatStart, 255, 2, 0, false}, // 1 start + {doLiteralChar, 254, 14, 0, true}, // 2 term + {doLiteralChar, 130, 14, 0, true}, // 3 + {doSetBegin, 91 /* [ */, 123, 205, true}, // 4 + {doNOP, 40 /* ( */, 27, 0, true}, // 5 + {doDotAny, 46 /* . 
*/, 14, 0, true}, // 6 + {doCaret, 94 /* ^ */, 14, 0, true}, // 7 + {doDollar, 36 /* $ */, 14, 0, true}, // 8 + {doNOP, 92 /* \ */, 89, 0, true}, // 9 + {doOrOperator, 124 /* | */, 2, 0, true}, // 10 + {doCloseParen, 41 /* ) */, 255, 0, true}, // 11 + {doPatFinish, 253, 2, 0, false}, // 12 + {doRuleError, 255, 206, 0, false}, // 13 + {doNOP, 42 /* * */, 68, 0, true}, // 14 expr-quant + {doNOP, 43 /* + */, 71, 0, true}, // 15 + {doNOP, 63 /* ? */, 74, 0, true}, // 16 + {doIntervalInit, 123 /* { */, 77, 0, true}, // 17 + {doNOP, 40 /* ( */, 23, 0, true}, // 18 + {doNOP, 255, 20, 0, false}, // 19 + {doOrOperator, 124 /* | */, 2, 0, true}, // 20 expr-cont + {doCloseParen, 41 /* ) */, 255, 0, true}, // 21 + {doNOP, 255, 2, 0, false}, // 22 + {doSuppressComments, 63 /* ? */, 25, 0, true}, // 23 open-paren-quant + {doNOP, 255, 27, 0, false}, // 24 + {doNOP, 35 /* # */, 50, 14, true}, // 25 open-paren-quant2 + {doNOP, 255, 29, 0, false}, // 26 + {doSuppressComments, 63 /* ? */, 29, 0, true}, // 27 open-paren + {doOpenCaptureParen, 255, 2, 14, false}, // 28 + {doOpenNonCaptureParen, 58 /* : */, 2, 14, true}, // 29 open-paren-extended + {doOpenAtomicParen, 62 /* > */, 2, 14, true}, // 30 + {doOpenLookAhead, 61 /* = */, 2, 20, true}, // 31 + {doOpenLookAheadNeg, 33 /* ! */, 2, 20, true}, // 32 + {doNOP, 60 /* < */, 46, 0, true}, // 33 + {doNOP, 35 /* # */, 50, 2, true}, // 34 + {doBeginMatchMode, 105 /* i */, 53, 0, false}, // 35 + {doBeginMatchMode, 100 /* d */, 53, 0, false}, // 36 + {doBeginMatchMode, 109 /* m */, 53, 0, false}, // 37 + {doBeginMatchMode, 115 /* s */, 53, 0, false}, // 38 + {doBeginMatchMode, 117 /* u */, 53, 0, false}, // 39 + {doBeginMatchMode, 119 /* w */, 53, 0, false}, // 40 + {doBeginMatchMode, 120 /* x */, 53, 0, false}, // 41 + {doBeginMatchMode, 45 /* - */, 53, 0, false}, // 42 + {doConditionalExpr, 40 /* ( */, 206, 0, true}, // 43 + {doPerlInline, 123 /* { */, 206, 0, true}, // 44 + {doBadOpenParenType, 255, 206, 0, false}, // 45 + {doOpenLookBehind, 61 /* = */, 2, 20, true}, // 46 open-paren-lookbehind + {doOpenLookBehindNeg, 33 /* ! */, 2, 20, true}, // 47 + {doBeginNamedCapture, 129, 64, 0, false}, // 48 + {doBadOpenParenType, 255, 206, 0, false}, // 49 + {doNOP, 41 /* ) */, 255, 0, true}, // 50 paren-comment + {doMismatchedParenErr, 253, 206, 0, false}, // 51 + {doNOP, 255, 50, 0, true}, // 52 + {doMatchMode, 105 /* i */, 53, 0, true}, // 53 paren-flag + {doMatchMode, 100 /* d */, 53, 0, true}, // 54 + {doMatchMode, 109 /* m */, 53, 0, true}, // 55 + {doMatchMode, 115 /* s */, 53, 0, true}, // 56 + {doMatchMode, 117 /* u */, 53, 0, true}, // 57 + {doMatchMode, 119 /* w */, 53, 0, true}, // 58 + {doMatchMode, 120 /* x */, 53, 0, true}, // 59 + {doMatchMode, 45 /* - */, 53, 0, true}, // 60 + {doSetMatchMode, 41 /* ) */, 2, 0, true}, // 61 + {doMatchModeParen, 58 /* : */, 2, 14, true}, // 62 + {doBadModeFlag, 255, 206, 0, false}, // 63 + {doContinueNamedCapture, 129, 64, 0, true}, // 64 named-capture + {doContinueNamedCapture, 128, 64, 0, true}, // 65 + {doOpenCaptureParen, 62 /* > */, 2, 14, true}, // 66 + {doBadNamedCapture, 255, 206, 0, false}, // 67 + {doNGStar, 63 /* ? */, 20, 0, true}, // 68 quant-star + {doPossessiveStar, 43 /* + */, 20, 0, true}, // 69 + {doStar, 255, 20, 0, false}, // 70 + {doNGPlus, 63 /* ? */, 20, 0, true}, // 71 quant-plus + {doPossessivePlus, 43 /* + */, 20, 0, true}, // 72 + {doPlus, 255, 20, 0, false}, // 73 + {doNGOpt, 63 /* ? 
*/, 20, 0, true}, // 74 quant-opt + {doPossessiveOpt, 43 /* + */, 20, 0, true}, // 75 + {doOpt, 255, 20, 0, false}, // 76 + {doNOP, 128, 79, 0, false}, // 77 interval-open + {doIntervalError, 255, 206, 0, false}, // 78 + {doIntevalLowerDigit, 128, 79, 0, true}, // 79 interval-lower + {doNOP, 44 /* , */, 83, 0, true}, // 80 + {doIntervalSame, 125 /* } */, 86, 0, true}, // 81 + {doIntervalError, 255, 206, 0, false}, // 82 + {doIntervalUpperDigit, 128, 83, 0, true}, // 83 interval-upper + {doNOP, 125 /* } */, 86, 0, true}, // 84 + {doIntervalError, 255, 206, 0, false}, // 85 + {doNGInterval, 63 /* ? */, 20, 0, true}, // 86 interval-type + {doPossessiveInterval, 43 /* + */, 20, 0, true}, // 87 + {doInterval, 255, 20, 0, false}, // 88 + {doBackslashA, 65 /* A */, 2, 0, true}, // 89 backslash + {doBackslashB, 66 /* B */, 2, 0, true}, // 90 + {doBackslashb, 98 /* b */, 2, 0, true}, // 91 + {doBackslashd, 100 /* d */, 14, 0, true}, // 92 + {doBackslashD, 68 /* D */, 14, 0, true}, // 93 + {doBackslashG, 71 /* G */, 2, 0, true}, // 94 + {doBackslashh, 104 /* h */, 14, 0, true}, // 95 + {doBackslashH, 72 /* H */, 14, 0, true}, // 96 + {doNOP, 107 /* k */, 115, 0, true}, // 97 + {doNamedChar, 78 /* N */, 14, 0, false}, // 98 + {doProperty, 112 /* p */, 14, 0, false}, // 99 + {doProperty, 80 /* P */, 14, 0, false}, // 100 + {doBackslashR, 82 /* R */, 14, 0, true}, // 101 + {doEnterQuoteMode, 81 /* Q */, 2, 0, true}, // 102 + {doBackslashS, 83 /* S */, 14, 0, true}, // 103 + {doBackslashs, 115 /* s */, 14, 0, true}, // 104 + {doBackslashv, 118 /* v */, 14, 0, true}, // 105 + {doBackslashV, 86 /* V */, 14, 0, true}, // 106 + {doBackslashW, 87 /* W */, 14, 0, true}, // 107 + {doBackslashw, 119 /* w */, 14, 0, true}, // 108 + {doBackslashX, 88 /* X */, 14, 0, true}, // 109 + {doBackslashZ, 90 /* Z */, 2, 0, true}, // 110 + {doBackslashz, 122 /* z */, 2, 0, true}, // 111 + {doBackRef, 128, 14, 0, true}, // 112 + {doEscapeError, 253, 206, 0, false}, // 113 + {doEscapedLiteralChar, 255, 14, 0, true}, // 114 + {doBeginNamedBackRef, 60 /* < */, 117, 0, true}, // 115 named-backref + {doBadNamedCapture, 255, 206, 0, false}, // 116 + {doContinueNamedBackRef, 129, 119, 0, true}, // 117 named-backref-2 + {doBadNamedCapture, 255, 206, 0, false}, // 118 + {doContinueNamedBackRef, 129, 119, 0, true}, // 119 named-backref-3 + {doContinueNamedBackRef, 128, 119, 0, true}, // 120 + {doCompleteNamedBackRef, 62 /* > */, 14, 0, true}, // 121 + {doBadNamedCapture, 255, 206, 0, false}, // 122 + {doSetNegate, 94 /* ^ */, 126, 0, true}, // 123 set-open + {doSetPosixProp, 58 /* : */, 128, 0, false}, // 124 + {doNOP, 255, 126, 0, false}, // 125 + {doSetLiteral, 93 /* ] */, 141, 0, true}, // 126 set-open2 + {doNOP, 255, 131, 0, false}, // 127 + {doSetEnd, 93 /* ] */, 255, 0, true}, // 128 set-posix + {doNOP, 58 /* : */, 131, 0, false}, // 129 + {doRuleError, 255, 206, 0, false}, // 130 + {doSetEnd, 93 /* ] */, 255, 0, true}, // 131 set-start + {doSetBeginUnion, 91 /* [ */, 123, 148, true}, // 132 + {doNOP, 92 /* \ */, 191, 0, true}, // 133 + {doNOP, 45 /* - */, 137, 0, true}, // 134 + {doNOP, 38 /* & */, 139, 0, true}, // 135 + {doSetLiteral, 255, 141, 0, true}, // 136 + {doRuleError, 45 /* - */, 206, 0, false}, // 137 set-start-dash + {doSetAddDash, 255, 141, 0, false}, // 138 + {doRuleError, 38 /* & */, 206, 0, false}, // 139 set-start-amp + {doSetAddAmp, 255, 141, 0, false}, // 140 + {doSetEnd, 93 /* ] */, 255, 0, true}, // 141 set-after-lit + {doSetBeginUnion, 91 /* [ */, 123, 148, true}, // 142 + {doNOP, 45 /* - */, 178, 0, 
true}, // 143 + {doNOP, 38 /* & */, 169, 0, true}, // 144 + {doNOP, 92 /* \ */, 191, 0, true}, // 145 + {doSetNoCloseError, 253, 206, 0, false}, // 146 + {doSetLiteral, 255, 141, 0, true}, // 147 + {doSetEnd, 93 /* ] */, 255, 0, true}, // 148 set-after-set + {doSetBeginUnion, 91 /* [ */, 123, 148, true}, // 149 + {doNOP, 45 /* - */, 171, 0, true}, // 150 + {doNOP, 38 /* & */, 166, 0, true}, // 151 + {doNOP, 92 /* \ */, 191, 0, true}, // 152 + {doSetNoCloseError, 253, 206, 0, false}, // 153 + {doSetLiteral, 255, 141, 0, true}, // 154 + {doSetEnd, 93 /* ] */, 255, 0, true}, // 155 set-after-range + {doSetBeginUnion, 91 /* [ */, 123, 148, true}, // 156 + {doNOP, 45 /* - */, 174, 0, true}, // 157 + {doNOP, 38 /* & */, 176, 0, true}, // 158 + {doNOP, 92 /* \ */, 191, 0, true}, // 159 + {doSetNoCloseError, 253, 206, 0, false}, // 160 + {doSetLiteral, 255, 141, 0, true}, // 161 + {doSetBeginUnion, 91 /* [ */, 123, 148, true}, // 162 set-after-op + {doSetOpError, 93 /* ] */, 206, 0, false}, // 163 + {doNOP, 92 /* \ */, 191, 0, true}, // 164 + {doSetLiteral, 255, 141, 0, true}, // 165 + {doSetBeginIntersection1, 91 /* [ */, 123, 148, true}, // 166 set-set-amp + {doSetIntersection2, 38 /* & */, 162, 0, true}, // 167 + {doSetAddAmp, 255, 141, 0, false}, // 168 + {doSetIntersection2, 38 /* & */, 162, 0, true}, // 169 set-lit-amp + {doSetAddAmp, 255, 141, 0, false}, // 170 + {doSetBeginDifference1, 91 /* [ */, 123, 148, true}, // 171 set-set-dash + {doSetDifference2, 45 /* - */, 162, 0, true}, // 172 + {doSetAddDash, 255, 141, 0, false}, // 173 + {doSetDifference2, 45 /* - */, 162, 0, true}, // 174 set-range-dash + {doSetAddDash, 255, 141, 0, false}, // 175 + {doSetIntersection2, 38 /* & */, 162, 0, true}, // 176 set-range-amp + {doSetAddAmp, 255, 141, 0, false}, // 177 + {doSetDifference2, 45 /* - */, 162, 0, true}, // 178 set-lit-dash + {doSetAddDash, 91 /* [ */, 141, 0, false}, // 179 + {doSetAddDash, 93 /* ] */, 141, 0, false}, // 180 + {doNOP, 92 /* \ */, 183, 0, true}, // 181 + {doSetRange, 255, 155, 0, true}, // 182 + {doSetOpError, 115 /* s */, 206, 0, false}, // 183 set-lit-dash-escape + {doSetOpError, 83 /* S */, 206, 0, false}, // 184 + {doSetOpError, 119 /* w */, 206, 0, false}, // 185 + {doSetOpError, 87 /* W */, 206, 0, false}, // 186 + {doSetOpError, 100 /* d */, 206, 0, false}, // 187 + {doSetOpError, 68 /* D */, 206, 0, false}, // 188 + {doSetNamedRange, 78 /* N */, 155, 0, false}, // 189 + {doSetRange, 255, 155, 0, true}, // 190 + {doSetProp, 112 /* p */, 148, 0, false}, // 191 set-escape + {doSetProp, 80 /* P */, 148, 0, false}, // 192 + {doSetNamedChar, 78 /* N */, 141, 0, false}, // 193 + {doSetBackslashs, 115 /* s */, 155, 0, true}, // 194 + {doSetBackslashS, 83 /* S */, 155, 0, true}, // 195 + {doSetBackslashw, 119 /* w */, 155, 0, true}, // 196 + {doSetBackslashW, 87 /* W */, 155, 0, true}, // 197 + {doSetBackslashd, 100 /* d */, 155, 0, true}, // 198 + {doSetBackslashD, 68 /* D */, 155, 0, true}, // 199 + {doSetBackslashh, 104 /* h */, 155, 0, true}, // 200 + {doSetBackslashH, 72 /* H */, 155, 0, true}, // 201 + {doSetBackslashv, 118 /* v */, 155, 0, true}, // 202 + {doSetBackslashV, 86 /* V */, 155, 0, true}, // 203 + {doSetLiteralEscaped, 255, 141, 0, true}, // 204 + {doSetFinish, 255, 14, 0, false}, // 205 set-finish + {doExit, 255, 206, 0, true}, // 206 errorDeath +} diff --git a/go/mysql/icuregex/debug.go b/go/mysql/icuregex/debug.go new file mode 100644 index 00000000000..92c43e704d7 --- /dev/null +++ b/go/mysql/icuregex/debug.go @@ -0,0 +1,151 @@ +/* +© 2016 and later: 
Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex + +import ( + "fmt" + "io" +) + +func (pat *Pattern) Dump(w io.Writer) { + fmt.Fprintf(w, "Original Pattern: \"%s\"\n", pat.pattern) + fmt.Fprintf(w, " Min Match Length: %d\n", pat.minMatchLen) + fmt.Fprintf(w, " Match Start Type: %v\n", pat.startType) + if pat.startType == startString { + fmt.Fprintf(w, " Initial match string: \"%s\"\n", string(pat.literalText[pat.initialStringIdx:pat.initialStringIdx+pat.initialStringLen])) + } else if pat.startType == startSet { + fmt.Fprintf(w, " Match First Chars: %s\n", pat.initialChars.String()) + } else if pat.startType == startChar { + fmt.Fprintf(w, " First char of Match: ") + if pat.initialChar > 0x20 { + fmt.Fprintf(w, "'%c'\n", pat.initialChar) + } else { + fmt.Fprintf(w, "%#x\n", pat.initialChar) + } + } + + fmt.Fprintf(w, "Named Capture Groups:\n") + if len(pat.namedCaptureMap) == 0 { + fmt.Fprintf(w, " None\n") + } else { + for name, number := range pat.namedCaptureMap { + fmt.Fprintf(w, " %d\t%s\n", number, name) + } + } + + fmt.Fprintf(w, "\nIndex Binary Type Operand\n-------------------------------------------\n") + for idx := range pat.compiledPat { + pat.dumpOp(w, idx) + } + fmt.Fprintf(w, "\n\n") +} + +func (pat *Pattern) dumpOp(w io.Writer, index int) { + op := pat.compiledPat[index] + val := op.value() + opType := op.typ() + pinnedType := opType + if int(pinnedType) >= len(urxOpcodeNames) { + pinnedType = 0 + } + + fmt.Fprintf(w, "%4d %08x %-15s ", index, op, urxOpcodeNames[pinnedType]) + + switch opType { + case urxNop, + urxDotany, + urxDotanyAll, + urxFail, + urxCaret, + urxDollar, + urxBackslashG, + urxBackslashX, + urxEnd, + urxDollarM, + urxCaretM: + // Types with no operand field of interest. + + case urxReservedOp, + urxStartCapture, + urxEndCapture, + urxStateSave, + urxJmp, + urxJmpSav, + urxJmpSavX, + urxBackslashB, + urxBackslashBu, + urxBackslashD, + urxBackslashZ, + urxStringLen, + urxCtrInit, + urxCtrInitNg, + utxCtrLoop, + urxCtrLoopNg, + urxRelocOprnd, + urxStoSp, + urxLdSp, + urxBackref, + urxStoInpLoc, + urxJmpx, + urxLaStart, + urxLaEnd, + urxBackrefI, + urxLbStart, + urxLbCont, + urxLbEnd, + urxLbnCount, + urxLbnEnd, + urxLoopC, + urxLoopDotI, + urxBackslashH, + urxBackslashR, + urxBackslashV: + // types with an integer operand field. 
+ fmt.Fprintf(w, "%d", val) + + case urxOnechar, urcOnecharI: + if val < 0x20 { + fmt.Fprintf(w, "%#x", val) + } else { + fmt.Fprintf(w, "'%c'", rune(val)) + } + + case urxString, urxStringI: + lengthOp := pat.compiledPat[index+1] + length := lengthOp.value() + fmt.Fprintf(w, "%q", string(pat.literalText[val:val+length])) + + case urxSetref, urxLoopSrI: + fmt.Fprintf(w, "%s", pat.sets[val].String()) + + case urxStaticSetref, urxStatSetrefN: + if (val & urxNegSet) != 0 { + fmt.Fprintf(w, "NOT ") + val &= ^urxNegSet + } + fmt.Fprintf(w, "%s", staticPropertySets[val].String()) + + default: + fmt.Fprintf(w, "??????") + } + fmt.Fprintf(w, "\n") +} diff --git a/go/mysql/icuregex/error.go b/go/mysql/icuregex/error.go new file mode 100644 index 00000000000..219ddcf602b --- /dev/null +++ b/go/mysql/icuregex/error.go @@ -0,0 +1,149 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex + +import ( + "fmt" + "strings" +) + +type CompileError struct { + Code CompileErrorCode + Line int + Offset int + Context string +} + +func (e *CompileError) Error() string { + var out strings.Builder + switch e.Code { + case InternalError: + out.WriteString("Internal error") + case RuleSyntax: + out.WriteString("Syntax error") + case BadEscapeSequence: + out.WriteString("Bad escape sequence") + case PropertySyntax: + out.WriteString("Property syntax error") + case Unimplemented: + out.WriteString("Unimplemented") + case MismatchedParen: + out.WriteString("Mismatched parentheses") + case NumberTooBig: + out.WriteString("Number too big") + case BadInterval: + out.WriteString("Bad interval") + case MaxLtMin: + out.WriteString("Max less than min") + case InvalidBackRef: + out.WriteString("Invalid back reference") + case InvalidFlag: + out.WriteString("Invalid flag") + case LookBehindLimit: + out.WriteString("Look behind limit") + case MissingCloseBracket: + out.WriteString("Missing closing ]") + case InvalidRange: + out.WriteString("Invalid range") + case PatternTooBig: + out.WriteString("Pattern too big") + case InvalidCaptureGroupName: + out.WriteString("Invalid capture group name") + } + _, _ = fmt.Fprintf(&out, " in regular expression on line %d, character %d: `%s`", e.Line, e.Offset, e.Context) + + return out.String() +} + +type MatchError struct { + Code MatchErrorCode + Pattern string + Position int + Input []rune +} + +const maxMatchInputLength = 20 + +func (e *MatchError) Error() string { + var out strings.Builder + switch e.Code { + case StackOverflow: + out.WriteString("Stack overflow") + case TimeOut: + out.WriteString("Timeout") + } + + input := e.Input + if len(input) > maxMatchInputLength { + var b []rune + start := e.Position - maxMatchInputLength/2 + if start < 0 { + start = 0 + } else 
{ + b = append(b, '.', '.', '.') + } + end := start + maxMatchInputLength + trailing := true + if end > len(input) { + end = len(input) + trailing = false + } + b = append(b, input[start:end]...) + if trailing { + b = append(b, '.', '.', '.') + } + input = b + } + _, _ = fmt.Fprintf(&out, " for expression `%s` at position %d in: %q", e.Pattern, e.Position, string(input)) + + return out.String() +} + +type Code int32 + +type CompileErrorCode int32 + +const ( + InternalError CompileErrorCode = iota + 1 /**< An internal error (bug) was detected. */ + RuleSyntax /**< Syntax error in regexp pattern. */ + BadEscapeSequence /**< Unrecognized backslash escape sequence in pattern */ + PropertySyntax /**< Incorrect Unicode property */ + Unimplemented /**< Use of regexp feature that is not yet implemented. */ + MismatchedParen /**< Incorrectly nested parentheses in regexp pattern. */ + NumberTooBig /**< Decimal number is too large. */ + BadInterval /**< Error in {min,max} interval */ + MaxLtMin /**< In {min,max}, max is less than min. */ + InvalidBackRef /**< Back-reference to a non-existent capture group. */ + InvalidFlag /**< Invalid value for match mode flags. */ + LookBehindLimit /**< Look-Behind pattern matches must have a bounded maximum length. */ + MissingCloseBracket /**< Missing closing bracket on a bracket expression. */ + InvalidRange /**< In a character range [x-y], x is greater than y. */ + PatternTooBig /**< Pattern exceeds limits on size or complexity. @stable ICU 55 */ + InvalidCaptureGroupName /**< Invalid capture group name. @stable ICU 55 */ +) + +type MatchErrorCode int32 + +const ( + StackOverflow MatchErrorCode = iota /**< Regular expression backtrack stack overflow. */ + TimeOut /**< Maximum allowed match time exceeded */ +) diff --git a/go/mysql/icuregex/errors/error.go b/go/mysql/icuregex/errors/error.go new file mode 100644 index 00000000000..f03a5157acf --- /dev/null +++ b/go/mysql/icuregex/errors/error.go @@ -0,0 +1,27 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package errors + +import "errors" + +var ErrIllegalArgument = errors.New("illegal argument") +var ErrUnsupported = errors.New("unsupported") diff --git a/go/mysql/icuregex/icu_test.go b/go/mysql/icuregex/icu_test.go new file mode 100644 index 00000000000..42c98dde5db --- /dev/null +++ b/go/mysql/icuregex/icu_test.go @@ -0,0 +1,422 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. 
+License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex_test + +import ( + "bufio" + "errors" + "fmt" + "io" + "os" + "regexp" + "strconv" + "strings" + "testing" + + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/mysql/icuregex" + "vitess.io/vitess/go/mysql/icuregex/internal/pattern" +) + +var ErrSkip = errors.New("ignored test") + +type Matcher int8 + +const ( + FuncFind Matcher = iota + FuncMatches + FuncLookingAt +) + +type Expectation int8 + +const ( + Unknown Expectation = iota + Expected + NotExpected +) + +type TestPattern struct { + Line string + Lineno int + + Pattern string + Flags icuregex.RegexpFlag + Options struct { + MatchFunc Matcher + FindCount int + MatchOnly bool + MustError bool + Dump bool + HitEnd Expectation + RequireEnd Expectation + } + Input string + Groups []TestGroup +} + +type TestGroup struct { + Start, End int +} + +var parsePattern = regexp.MustCompile(`<(/?)(r|[0-9]+)>`) + +func (tp *TestPattern) parseFlags(line string) (string, error) { + for len(line) > 0 { + switch line[0] { + case '"', '\'', '/': + return line, nil + case ' ', '\t': + case 'i': + tp.Flags |= icuregex.CaseInsensitive + case 'x': + tp.Flags |= icuregex.Comments + case 's': + tp.Flags |= icuregex.DotAll + case 'm': + tp.Flags |= icuregex.Multiline + case 'e': + tp.Flags |= icuregex.ErrorOnUnknownEscapes + case 'D': + tp.Flags |= icuregex.UnixLines + case 'Q': + tp.Flags |= icuregex.Literal + case '2', '3', '4', '5', '6', '7', '8', '9': + tp.Options.FindCount = int(line[0] - '0') + case 'G': + tp.Options.MatchOnly = true + case 'E': + tp.Options.MustError = true + case 'd': + tp.Options.Dump = true + case 'L': + tp.Options.MatchFunc = FuncLookingAt + case 'M': + tp.Options.MatchFunc = FuncMatches + case 'v': + tp.Options.MustError = !icuregex.BreakIteration + case 'a', 'b': + return "", ErrSkip + case 'z': + tp.Options.HitEnd = Expected + case 'Z': + tp.Options.HitEnd = NotExpected + case 'y': + tp.Options.RequireEnd = Expected + case 'Y': + tp.Options.RequireEnd = NotExpected + default: + return "", fmt.Errorf("unexpected modifier '%c'", line[0]) + } + line = line[1:] + } + return "", io.ErrUnexpectedEOF +} + +func (tp *TestPattern) parseMatch(orig string) error { + input, ok := pattern.Unescape(orig) + if !ok { + return fmt.Errorf("failed to unquote input: %s", orig) + } + + var detagged []rune + var last int + + m := parsePattern.FindAllStringSubmatchIndex(input, -1) + for _, g := range m { + detagged = append(detagged, []rune(input[last:g[0]])...) 
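+		// Each <N>...</N> tag in the test input marks the expected span of capture
+		// group N: the tags themselves are stripped here, and the rune offsets of
+		// the surrounding text are recorded as the expected group boundaries.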
+ last = g[1] + + closing := input[g[2]:g[3]] == "/" + groupNum := input[g[4]:g[5]] + if groupNum == "r" { + return ErrSkip + } + num, err := strconv.Atoi(groupNum) + if err != nil { + return fmt.Errorf("bad group number %q: %w", groupNum, err) + } + + if num >= len(tp.Groups) { + grp := make([]TestGroup, num+1) + for i := range grp { + grp[i].Start = -1 + grp[i].End = -1 + } + copy(grp, tp.Groups) + tp.Groups = grp + } + + if closing { + tp.Groups[num].End = len(detagged) + } else { + tp.Groups[num].Start = len(detagged) + } + } + + detagged = append(detagged, []rune(input[last:])...) + tp.Input = string(detagged) + return nil +} + +func ParseTestFile(t testing.TB, filename string) []TestPattern { + f, err := os.Open(filename) + if err != nil { + t.Fatalf("failed to open test data: %v", err) + } + + defer f.Close() + scanner := bufio.NewScanner(f) + var lineno int + var patterns []TestPattern + + errFunc := func(err error) { + if err == ErrSkip { + return + } + t.Errorf("Parse error: %v\n%03d: %s", err, lineno, scanner.Text()) + } + + for scanner.Scan() { + lineno++ + line := scanner.Text() + line = strings.TrimSpace(line) + + if len(line) == 0 || line[0] == '#' { + continue + } + + var tp TestPattern + tp.Line = line + tp.Lineno = lineno + + idx := strings.IndexByte(line[1:], line[0]) + + tp.Pattern = line[1 : idx+1] + line, err = tp.parseFlags(line[idx+2:]) + if err != nil { + errFunc(err) + continue + } + + idx = strings.IndexByte(line[1:], line[0]) + err = tp.parseMatch(line[1 : idx+1]) + if err != nil { + errFunc(err) + continue + } + + patterns = append(patterns, tp) + } + + if err := scanner.Err(); err != nil { + t.Fatal(err) + } + return patterns +} + +func (tp *TestPattern) fail(t testing.TB, msg string, args ...any) bool { + t.Helper() + msg = fmt.Sprintf(msg, args...) 
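+	// The failing pattern is reported together with its line in the test file,
+	// the de-tagged input, and the original test line; returning false lets
+	// callers write `return tp.fail(...)`.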
+ t.Errorf("%s (in line %d)\nregexp: %s\ninput: %q\noriginal: %s", msg, tp.Lineno, tp.Pattern, tp.Input, tp.Line) + return false +} + +func (tp *TestPattern) Test(t testing.TB) bool { + re, err := func() (re *icuregex.Pattern, err error) { + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("PANIC: %v", r) + } + }() + re, err = icuregex.CompileString(tp.Pattern, tp.Flags) + return + }() + if err != nil { + if tp.Options.MustError { + return true + } + + return tp.fail(t, "unexpected parser failure: %v", err) + } + if tp.Options.MustError { + return tp.fail(t, "parse failure expected") + } + + matcher := re.Match(tp.Input) + var isMatch bool + var findCount = tp.Options.FindCount + if findCount == 0 { + findCount = 1 + } + + for i := 0; i < findCount; i++ { + isMatch, err = func() (bool, error) { + defer func() { + if r := recover(); r != nil { + tp.fail(t, "unexpected match failure: %v", r) + } + }() + switch tp.Options.MatchFunc { + case FuncMatches: + return matcher.Matches() + case FuncLookingAt: + return matcher.LookingAt() + case FuncFind: + return matcher.Find() + default: + panic("invalid MatchFunc") + } + }() + } + + require.NoError(t, err) + + if !isMatch && len(tp.Groups) > 0 { + return tp.fail(t, "Match expected, but none found.") + } + if isMatch && len(tp.Groups) == 0 { + return tp.fail(t, "No match expected, but found one at position %d", matcher.Start()) + } + if tp.Options.MatchOnly { + return true + } + + for i := 0; i < matcher.GroupCount(); i++ { + expectedStart := -1 + expectedEnd := -1 + + if i < len(tp.Groups) { + expectedStart = tp.Groups[i].Start + expectedEnd = tp.Groups[i].End + } + if gotStart := matcher.StartForGroup(i); gotStart != expectedStart { + return tp.fail(t, "Incorrect start position for group %d. Expected %d, got %d", i, expectedStart, gotStart) + } + if gotEnd := matcher.EndForGroup(i); gotEnd != expectedEnd { + return tp.fail(t, "Incorrect end position for group %d. Expected %d, got %d", i, expectedEnd, gotEnd) + } + } + + if matcher.GroupCount()+1 < len(tp.Groups) { + return tp.fail(t, "Expected %d capture groups, found %d", len(tp.Groups)-1, matcher.GroupCount()) + } + + if tp.Options.HitEnd == Expected && !matcher.HitEnd() { + return tp.fail(t, "HitEnd() returned false. Expected true") + } + if tp.Options.HitEnd == NotExpected && matcher.HitEnd() { + return tp.fail(t, "HitEnd() returned true. Expected false") + } + + if tp.Options.RequireEnd == Expected && !matcher.RequireEnd() { + return tp.fail(t, "RequireEnd() returned false. Expected true") + } + if tp.Options.RequireEnd == NotExpected && matcher.RequireEnd() { + return tp.fail(t, "RequireEnd() returned true. Expected false") + } + + return true +} + +func TestICU(t *testing.T) { + pats := ParseTestFile(t, "testdata/regextst.txt") + + var valid int + + for _, p := range pats { + if p.Test(t) { + valid++ + } + } + + t.Logf("%d/%d (%.02f)", valid, len(pats), float64(valid)/float64(len(pats))) +} + +func TestICUExtended(t *testing.T) { + // This tests additional cases that aren't covered in the + // copied ICU test suite. 
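+	// The extended file uses the same regextst.txt syntax, so it is parsed and
+	// executed with the same ParseTestFile/Test helpers as TestICU above.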
+ pats := ParseTestFile(t, "testdata/regextst_extended.txt") + + var valid int + + for _, p := range pats { + if p.Test(t) { + valid++ + } + } + + t.Logf("%d/%d (%.02f)", valid, len(pats), float64(valid)/float64(len(pats))) +} + +func TestCornerCases(t *testing.T) { + var cases = []struct { + Pattern string + Input string + Flags icuregex.RegexpFlag + Match bool + }{ + {`xyz$`, "xyz\n", 0, true}, + {`a*+`, "abbxx", 0, true}, + {`(ABC){1,2}+ABC`, "ABCABCABC", 0, true}, + {`(ABC){2,3}+ABC`, "ABCABCABC", 0, false}, + {`(abc)*+a`, "abcabcabc", 0, false}, + {`(abc)*+a`, "abcabcab", 0, true}, + {`a\N{LATIN SMALL LETTER B}c`, "abc", 0, true}, + {`a.b`, "a\rb", icuregex.UnixLines, true}, + {`a.b`, "a\rb", 0, false}, + {`(?d)abc$`, "abc\r", 0, false}, + {`[ \b]`, "b", 0, true}, + {`[abcd-\N{LATIN SMALL LETTER G}]+`, "xyz-abcdefghij-", 0, true}, + {`[[abcd]&&[ac]]+`, "bacacd", 0, true}, + } + + for _, tc := range cases { + t.Run(tc.Pattern, func(t *testing.T) { + _, err := icuregex.CompileString(tc.Pattern, tc.Flags) + if err != nil { + t.Fatal(err) + } + }) + } +} + +func TestOne(t *testing.T) { + const Pattern = `\p{CaseIgnorable}` + const Input = "foo.bar" + const Flags = 0 + + re, err := icuregex.CompileString(Pattern, Flags) + if err != nil { + t.Fatalf("compilation failed: %v", err) + } + + re.Dump(os.Stderr) + + m := icuregex.NewMatcher(re) + m.Dumper(os.Stderr) + m.ResetString(Input) + found, err := m.Find() + require.NoError(t, err) + t.Logf("match = %v", found) +} diff --git a/go/mysql/icuregex/internal/bytestrie/bytes_trie.go b/go/mysql/icuregex/internal/bytestrie/bytes_trie.go new file mode 100644 index 00000000000..aff80dc3e69 --- /dev/null +++ b/go/mysql/icuregex/internal/bytestrie/bytes_trie.go @@ -0,0 +1,354 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package bytestrie + +type BytesTrie struct { + pos []byte + original []byte + remainingMatchLength int32 +} + +func New(pos []byte) BytesTrie { + return BytesTrie{pos: pos, original: pos, remainingMatchLength: -1} +} + +type result int32 + +const ( /** + * The input unit(s) did not continue a matching string. + * Once current()/next() return NO_MATCH, + * all further calls to current()/next() will also return NO_MATCH, + * until the trie is reset to its original state or to a saved state. + * @stable ICU 4.8 + */ + noMatch result = iota + /** + * The input unit(s) continued a matching string + * but there is no value for the string so far. + * (It is a prefix of a longer string.) + * @stable ICU 4.8 + */ + noValue + /** + * The input unit(s) continued a matching string + * and there is a value for the string so far. + * This value will be returned by getValue(). + * No further input byte/unit can continue a matching string. 
+ * @stable ICU 4.8 + */ + finalValue + /** + * The input unit(s) continued a matching string + * and there is a value for the string so far. + * This value will be returned by getValue(). + * Another input byte/unit can continue a matching string. + * @stable ICU 4.8 + */ + intermediateValue +) + +const ( + maxBranchLinearSubNodeLength = 5 + + // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. + minLinearMatch = 0x10 + maxLinearMatchLength = 0x10 + + // 20..ff: Variable-length value node. + // If odd, the value is final. (Otherwise, intermediate value or jump delta.) + // Then shift-right by 1 bit. + // The remaining lead byte value indicates the number of following bytes (0..4) + // and contains the value's top bits. + minValueLead = minLinearMatch + maxLinearMatchLength // 0x20 + // It is a final value if bit 0 is set. + valueIsFinal = 1 + + // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. + minOneByteValueLead = minValueLead / 2 // 0x10 + maxOneByteValue = 0x40 // At least 6 bits in the first byte. + + minTwoByteValueLead = minOneByteValueLead + maxOneByteValue + 1 // 0x51 + maxTwoByteValue = 0x1aff + minThreeByteValueLead = minTwoByteValueLead + (maxTwoByteValue >> 8) + 1 // 0x6c + fourByteValueLead = 0x7e + + // Compact delta integers. + maxOneByteDelta = 0xbf + minTwoByteDeltaLead = maxOneByteDelta + 1 // 0xc0 + minThreeByteDeltaLead = 0xf0 + fourByteDeltaLead = 0xfe +) + +func (bt *BytesTrie) ContainsName(name string) bool { + result := noValue + for _, c := range []byte(name) { + if 'A' <= c && c <= 'Z' { + c += 'a' - 'A' + } + if c == 0x2d || c == 0x5f || c == 0x20 || (0x09 <= c && c <= 0x0d) { + continue + } + if result&1 == 0 { + return false + } + result = bt.next(int32(c)) + } + return result >= finalValue +} + +func (bt *BytesTrie) next(inByte int32) result { + pos := bt.pos + if pos == nil { + return noMatch + } + if inByte < 0 { + inByte += 0x100 + } + length := bt.remainingMatchLength // Actual remaining match length minus 1. + if length >= 0 { + match := inByte == int32(pos[0]) + pos = pos[1:] + // Remaining part of a linear-match node. + if match { + length = length - 1 + bt.remainingMatchLength = length + bt.pos = pos + if length < 0 { + node := int32(pos[0]) + if node >= minValueLead { + return bt.valueResult(node) + } + } + return noValue + } + bt.stop() + return noMatch + } + return bt.nextImpl(pos, inByte) +} + +func (bt *BytesTrie) nextImpl(pos []byte, inByte int32) result { + for { + node := int32(pos[0]) + pos = pos[1:] + if node < minLinearMatch { + return bt.branchNext(pos, node, inByte) + } else if node < minValueLead { + // Match the first of length+1 bytes. + length := node - minLinearMatch // Actual match length minus 1. + match := inByte == int32(pos[0]) + pos = pos[1:] + if match { + length = length - 1 + bt.remainingMatchLength = length + bt.pos = pos + if length < 0 { + node = int32(pos[0]) + if node >= minValueLead { + return bt.valueResult(node) + } + } + return noValue + } + // No match. + break + } else if (node & valueIsFinal) != 0 { + // No further matching bytes. + break + } else { + // Skip intermediate value. + pos = bt.skipValue2(pos, node) + // The next node must not also be a value node. 
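+			// (An intermediate value only records a result for the prefix matched so
+			// far; matching continues with the node that follows the skipped value.)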
+ } + } + bt.stop() + return noMatch +} + +func (bt *BytesTrie) stop() { + bt.pos = nil +} + +func (bt *BytesTrie) valueResult(node int32) result { + return intermediateValue - result(node&valueIsFinal) +} + +func (bt *BytesTrie) branchNext(pos []byte, length int32, inByte int32) result { + // Branch according to the current unit. + if length == 0 { + length = int32(pos[0]) + pos = pos[1:] + } + length++ + // The length of the branch is the number of units to select from. + // The data structure encodes a binary search. + for length > maxBranchLinearSubNodeLength { + p := int32(pos[0]) + pos = pos[1:] + if inByte < p { + length >>= 1 + pos = bt.jumpByDelta(pos) + } else { + length = length - (length >> 1) + pos = bt.skipDelta(pos) + } + } + // Drop down to linear search for the last few bytes. + // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 + // and divides length by 2. + for { + p := int32(pos[0]) + pos = pos[1:] + if inByte == p { + var result result + node := int32(pos[0]) + if (node & valueIsFinal) != 0 { + // Leave the final value for getValue() to read. + result = finalValue + } else { + // Use the non-final value as the jump delta. + pos = pos[1:] + // int32_t delta=readValue(pos, node>>1); + node >>= 1 + var delta int32 + if node < minTwoByteValueLead { + delta = node - minOneByteValueLead + } else if node < minThreeByteValueLead { + delta = ((node - minTwoByteValueLead) << 8) | int32(pos[0]) + pos = pos[1:] + } else if node < fourByteValueLead { + delta = ((node - minThreeByteValueLead) << 16) | (int32(pos[0]) << 8) | int32(pos[1]) + pos = pos[2:] + } else if node == fourByteValueLead { + delta = (int32(pos[0]) << 16) | (int32(pos[1]) << 8) | int32(pos[2]) + pos = pos[3:] + } else { + delta = (int32(pos[0]) << 24) | (int32(pos[1]) << 16) | (int32(pos[2]) << 8) | int32(pos[3]) + pos = pos[4:] + } + // end readValue() + pos = pos[delta:] + node = int32(pos[0]) + if node >= minValueLead { + result = bt.valueResult(node) + } else { + result = noValue + } + } + bt.pos = pos + return result + } + length-- + pos = bt.skipValue1(pos) + if length <= 1 { + break + } + } + p := int32(pos[0]) + pos = pos[1:] + if inByte == p { + bt.pos = pos + node := int32(pos[0]) + if node >= minValueLead { + return bt.valueResult(node) + } + return noValue + } + bt.stop() + return noMatch +} + +func (bt *BytesTrie) skipValue1(pos []byte) []byte { + leadByte := int32(pos[0]) + return bt.skipValue2(pos[1:], leadByte) +} + +func (bt *BytesTrie) skipValue2(pos []byte, leadByte int32) []byte { + if leadByte >= (minTwoByteValueLead << 1) { + if leadByte < (minThreeByteValueLead << 1) { + pos = pos[1:] + } else if leadByte < (fourByteValueLead << 1) { + pos = pos[2:] + } else { + pos = pos[3+((leadByte>>1)&1):] + } + } + return pos +} + +func (bt *BytesTrie) skipDelta(pos []byte) []byte { + delta := int32(pos[0]) + pos = pos[1:] + if delta >= minTwoByteDeltaLead { + if delta < minThreeByteDeltaLead { + pos = pos[1:] + } else if delta < fourByteDeltaLead { + pos = pos[2:] + } else { + pos = pos[3+(delta&1):] + } + } + return pos +} + +func (bt *BytesTrie) jumpByDelta(pos []byte) []byte { + delta := int32(pos[0]) + pos = pos[1:] + if delta < minTwoByteDeltaLead { + // nothing to do + } else if delta < minThreeByteDeltaLead { + delta = ((delta - minTwoByteDeltaLead) << 8) | int32(pos[0]) + pos = pos[1:] + } else if delta < fourByteDeltaLead { + delta = ((delta - minThreeByteDeltaLead) << 16) | (int32(pos[0]) << 8) | int32(pos[1]) + pos = pos[2:] + } else if delta == 
fourByteDeltaLead {
+		delta = (int32(pos[0]) << 16) | (int32(pos[1]) << 8) | int32(pos[2])
+		pos = pos[3:]
+	} else {
+		delta = (int32(pos[0]) << 24) | (int32(pos[1]) << 16) | (int32(pos[2]) << 8) | int32(pos[3])
+		pos = pos[4:]
+	}
+	return pos[delta:]
+}
+
+func (bt *BytesTrie) GetValue() int32 {
+	pos := bt.pos
+	leadByte := int32(pos[0])
+	return bt.readValue(pos[1:], leadByte>>1)
+}
+
+func (bt *BytesTrie) readValue(pos []byte, leadByte int32) int32 {
+	var value int32
+	if leadByte < minTwoByteValueLead {
+		value = leadByte - minOneByteValueLead
+	} else if leadByte < minThreeByteValueLead {
+		value = ((leadByte - minTwoByteValueLead) << 8) | int32(pos[0])
+	} else if leadByte < fourByteValueLead {
+		value = ((leadByte - minThreeByteValueLead) << 16) | (int32(pos[0]) << 8) | int32(pos[1])
+	} else if leadByte == fourByteValueLead {
+		value = (int32(pos[0]) << 16) | (int32(pos[1]) << 8) | int32(pos[2])
+	} else {
+		value = (int32(pos[0]) << 24) | (int32(pos[1]) << 16) | (int32(pos[2]) << 8) | int32(pos[3])
+	}
+	return value
+}
diff --git a/go/mysql/icuregex/internal/icudata/README.md b/go/mysql/icuregex/internal/icudata/README.md
new file mode 100644
index 00000000000..070633b555e
--- /dev/null
+++ b/go/mysql/icuregex/internal/icudata/README.md
@@ -0,0 +1,46 @@
+# ICU data files
+
+These are files copied from the ICU project that contain various types
+of data, like character properties.
+
+## How to update
+
+Not all data files are immediately available in the source code; some
+need to be built first. This applies to the character / word break
+tables.
+
+### Copy from source data
+
+The `icu4c/source/data/in` directory in the source distribution contains
+the following ICU data files we use:
+
+```
+pnames.icu
+ubidi.icu
+ucase.icu
+unames.icu
+ulayout.icu
+uprops.icu
+nfc.nrm
+nfkc.nrm
+nfkc_cf.nrm
+```
+
+The character and word break tables need to be compiled before they can
+be copied.
+
+In `icu4c/source` run:
+
+```bash
+./configure --with-data-packaging=files
+make
+```
+
+This will compile the character and word break data into binary files
+that we can use. Once built, the files we use are available in
+`icu4c/source/data/out/build/icudtl/brkitr`:
+
+```
+char.brk
+word.brk
+```
diff --git a/go/mysql/icuregex/internal/icudata/char.brk b/go/mysql/icuregex/internal/icudata/char.brk
new file mode 100644
index 00000000000..a243ae6580a
Binary files /dev/null and b/go/mysql/icuregex/internal/icudata/char.brk differ
diff --git a/go/mysql/icuregex/internal/icudata/embed.go b/go/mysql/icuregex/internal/icudata/embed.go
new file mode 100644
index 00000000000..2b7e3033a21
--- /dev/null
+++ b/go/mysql/icuregex/internal/icudata/embed.go
@@ -0,0 +1,96 @@
+/*
+© 2016 and later: Unicode, Inc. and others.
+Copyright (C) 2004-2015, International Business Machines Corporation and others.
+Copyright 2023 The Vitess Authors.
+
+This file contains code derived from the Unicode Project's ICU library.
+License & terms of use for the original code: http://www.unicode.org/copyright.html
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package icudata
+
+import _ "embed"
+
+// PNames is the list of property names. It is used,
+// for example, to resolve Unicode property name aliases
+// in regular expressions.
+//
+//go:embed pnames.icu
+var PNames []byte
+
+// UBidi is the list of bidi properties. These are used
+// by Bidi class aliases in regular expressions.
+//
+//go:embed ubidi.icu
+var UBidi []byte
+
+// UCase is the list of case properties. These are used
+// for case folding internally for case insensitive matching.
+//
+//go:embed ucase.icu
+var UCase []byte
+
+// ULayout is used for property checks against the InPC, InSC
+// and VO properties.
+//
+//go:embed ulayout.icu
+var ULayout []byte
+
+// UNames is used for named character references in regular
+// expressions.
+//
+//go:embed unames.icu
+var UNames []byte
+
+// UProps is used for all the character properties. These
+// are used to retrieve properties of characters for character
+// classes, like letters, whitespace, digits, etc.
+//
+//go:embed uprops.icu
+var UProps []byte
+
+// Nfc is the table for character normalization where canonical
+// decomposition is done followed by canonical composition.
+// It is used for composition-related property checks on characters.
+//
+//go:embed nfc.nrm
+var Nfc []byte
+
+// Nfkc is the table for character normalization where compatibility
+// decomposition is done followed by canonical composition.
+// It is used for composition-related property checks on characters.
+//
+//go:embed nfkc.nrm
+var Nfkc []byte
+
+// NfkcCf is the table for character normalization where compatibility
+// decomposition is done followed by canonical composition with
+// case folding.
+// It is used for composition-related property checks on characters.
+//
+//go:embed nfkc_cf.nrm
+var NfkcCf []byte
+
+// BrkChar is used for matching against character break
+// characters in regular expressions.
+//
+//go:embed char.brk
+var BrkChar []byte
+
+// BrkWord is used for matching against word break
+// characters in regular expressions.
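+// Like BrkChar, this break table is built from the ICU sources as described
+// in this package's README.md.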
+// +//go:embed word.brk +var BrkWord []byte diff --git a/go/mysql/icuregex/internal/icudata/nfc.nrm b/go/mysql/icuregex/internal/icudata/nfc.nrm new file mode 100644 index 00000000000..a1254c0aa75 Binary files /dev/null and b/go/mysql/icuregex/internal/icudata/nfc.nrm differ diff --git a/go/mysql/icuregex/internal/icudata/nfkc.nrm b/go/mysql/icuregex/internal/icudata/nfkc.nrm new file mode 100644 index 00000000000..2e6e3dda074 Binary files /dev/null and b/go/mysql/icuregex/internal/icudata/nfkc.nrm differ diff --git a/go/mysql/icuregex/internal/icudata/nfkc_cf.nrm b/go/mysql/icuregex/internal/icudata/nfkc_cf.nrm new file mode 100644 index 00000000000..a3a40833a91 Binary files /dev/null and b/go/mysql/icuregex/internal/icudata/nfkc_cf.nrm differ diff --git a/go/mysql/icuregex/internal/icudata/pnames.icu b/go/mysql/icuregex/internal/icudata/pnames.icu new file mode 100644 index 00000000000..58af6c0157a Binary files /dev/null and b/go/mysql/icuregex/internal/icudata/pnames.icu differ diff --git a/go/mysql/icuregex/internal/icudata/ubidi.icu b/go/mysql/icuregex/internal/icudata/ubidi.icu new file mode 100644 index 00000000000..bc85f3d3502 Binary files /dev/null and b/go/mysql/icuregex/internal/icudata/ubidi.icu differ diff --git a/go/mysql/icuregex/internal/icudata/ucase.icu b/go/mysql/icuregex/internal/icudata/ucase.icu new file mode 100644 index 00000000000..011e6053f79 Binary files /dev/null and b/go/mysql/icuregex/internal/icudata/ucase.icu differ diff --git a/go/mysql/icuregex/internal/icudata/ulayout.icu b/go/mysql/icuregex/internal/icudata/ulayout.icu new file mode 100644 index 00000000000..598d347cc1e Binary files /dev/null and b/go/mysql/icuregex/internal/icudata/ulayout.icu differ diff --git a/go/mysql/icuregex/internal/icudata/unames.icu b/go/mysql/icuregex/internal/icudata/unames.icu new file mode 100644 index 00000000000..55a2267fd5b Binary files /dev/null and b/go/mysql/icuregex/internal/icudata/unames.icu differ diff --git a/go/mysql/icuregex/internal/icudata/uprops.icu b/go/mysql/icuregex/internal/icudata/uprops.icu new file mode 100644 index 00000000000..245db9a0584 Binary files /dev/null and b/go/mysql/icuregex/internal/icudata/uprops.icu differ diff --git a/go/mysql/icuregex/internal/icudata/word.brk b/go/mysql/icuregex/internal/icudata/word.brk new file mode 100644 index 00000000000..80460c60128 Binary files /dev/null and b/go/mysql/icuregex/internal/icudata/word.brk differ diff --git a/go/mysql/icuregex/internal/normalizer/constants.go b/go/mysql/icuregex/internal/normalizer/constants.go new file mode 100644 index 00000000000..3c2de588952 --- /dev/null +++ b/go/mysql/icuregex/internal/normalizer/constants.go @@ -0,0 +1,122 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package normalizer + +const ( + // Fixed norm16 values. + minYesYesWithCC = 0xfe02 + jamoVt = 0xfe00 + minNormalMaybeYes = 0xfc00 + jamoL = 2 // offset=1 hasCompBoundaryAfter=false + inert = 1 // offset=0 hasCompBoundaryAfter=true + + // norm16 bit 0 is comp-boundary-after. + hasCompBoundaryAfter = 1 + offsetShift = 1 + + // For algorithmic one-way mappings, norm16 bits 2..1 indicate the + // tccc (0, 1, >1) for quick FCC boundary-after tests. + deltaTccc0 = 0 + deltaTccc1 = 2 + deltaTcccGt1 = 4 + deltaTcccMask = 6 + deltaShift = 3 + + maxDelta = 0x40 +) + +const ( + jamoLBase rune = 0x1100 /* "lead" jamo */ + jamoLEnd rune = 0x1112 + jamoVBase rune = 0x1161 /* "vowel" jamo */ + jamoVEnd rune = 0x1175 + jamoTBase rune = 0x11a7 /* "trail" jamo */ + jamoTEnd rune = 0x11c2 + + hangulBase rune = 0xac00 + hangulEnd rune = 0xd7a3 + + jamoLCount rune = 19 + jamoVCount rune = 21 + jamoTCount rune = 28 + + hangulCount = jamoLCount * jamoVCount * jamoTCount + hangulLimit = hangulBase + hangulCount +) + +const ( + mappingHasCccLcccWord = 0x80 + mappingHasRawMapping = 0x40 + // unused bit 0x20, + mappingLengthMask = 0x1f +) + +/** + * Constants for normalization modes. + * @deprecated ICU 56 Use unorm2.h instead. + */ +type Mode int32 + +const ( + /** No decomposition/composition. @deprecated ICU 56 Use unorm2.h instead. */ + NormNone Mode = 1 + /** Canonical decomposition. @deprecated ICU 56 Use unorm2.h instead. */ + NormNfd Mode = 2 + /** Compatibility decomposition. @deprecated ICU 56 Use unorm2.h instead. */ + NormNfkd Mode = 3 + /** Canonical decomposition followed by canonical composition. @deprecated ICU 56 Use unorm2.h instead. */ + NormNfc Mode = 4 + /** Default normalization. @deprecated ICU 56 Use unorm2.h instead. */ + NormDefault Mode = NormNfc + /** Compatibility decomposition followed by canonical composition. @deprecated ICU 56 Use unorm2.h instead. */ + NormNfkc Mode = 5 + /** "Fast C or D" form. @deprecated ICU 56 Use unorm2.h instead. */ + NormFcd Mode = 6 +) + +/** + * Result values for normalization quick check functions. + * For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms + * @stable ICU 2.0 + */ +type CheckResult int + +const ( + /** + * The input string is not in the normalization form. + * @stable ICU 2.0 + */ + No CheckResult = iota + /** + * The input string is in the normalization form. + * @stable ICU 2.0 + */ + Yes + /** + * The input string may or may not be in the normalization form. + * This value is only returned for composition forms like NFC and FCC, + * when a backward-combining character is found for which the surrounding text + * would have to be analyzed further. + * @stable ICU 2.0 + */ + Maybe +) diff --git a/go/mysql/icuregex/internal/normalizer/normalizer.go b/go/mysql/icuregex/internal/normalizer/normalizer.go new file mode 100644 index 00000000000..c13a4878deb --- /dev/null +++ b/go/mysql/icuregex/internal/normalizer/normalizer.go @@ -0,0 +1,482 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package normalizer + +import ( + "errors" + "sync" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/uset" + "vitess.io/vitess/go/mysql/icuregex/internal/utf16" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +type Normalizer struct { + minDecompNoCP rune + minCompNoMaybeCP rune + minLcccCP rune + + // Norm16 value thresholds for quick check combinations and types of extra data. + minYesNo uint16 + minYesNoMappingsOnly uint16 + minNoNo uint16 + minNoNoCompBoundaryBefore uint16 + minNoNoCompNoMaybeCC uint16 + minNoNoEmpty uint16 + limitNoNo uint16 + centerNoNoDelta uint16 + minMaybeYes uint16 + + normTrie *utrie.UcpTrie + + maybeYesCompositions []uint16 + extraData []uint16 // mappings and/or compositions for yesYes, yesNo & noNo characters + smallFCD []uint8 // [0x100] one bit per 32 BMP code points, set if any FCD!=0 +} + +var nfc *Normalizer +var nfkc *Normalizer + +var normalizerOnce sync.Once + +func loadNormalizer() { + normalizerOnce.Do(func() { + nfc = &Normalizer{} + if err := nfc.load(icudata.Nfc); err != nil { + panic(err) + } + + nfkc = &Normalizer{} + if err := nfkc.load(icudata.Nfkc); err != nil { + panic(err) + } + }) +} + +const ixNormTrieOffset = 0 +const ixExtraDataOffset = 1 +const ixSmallFcdOffset = 2 +const ixReserved3Offset = 3 +const ixTotalSize = 7 + +const ixMinDecompNoCp = 8 +const ixMinCompNoMaybeCp = 9 + +/** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ +const ixMinYesNo = 10 + +/** Mappings are comp-normalized. */ +const ixMinNoNo = 11 +const ixLimitNoNo = 12 +const ixMinMaybeYes = 13 + +/** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ +const ixMinYesNoMappingsOnly = 14 + +/** Mappings are not comp-normalized but have a comp boundary before. */ +const ixMinNoNoCompBoundaryBefore = 15 + +/** Mappings do not have a comp boundary before. */ +const ixMinNoNoCompNoMaybeCc = 16 + +/** Mappings to the empty string. 
*/ +const ixMinNoNoEmpty = 17 + +const ixMinLcccCp = 18 +const ixCount = 20 + +func (n *Normalizer) load(data []byte) error { + bytes := udata.NewBytes(data) + + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.Size >= 20 && + info.IsBigEndian == 0 && + info.CharsetFamily == 0 && + info.DataFormat[0] == 0x4e && /* dataFormat="unam" */ + info.DataFormat[1] == 0x72 && + info.DataFormat[2] == 0x6d && + info.DataFormat[3] == 0x32 && + info.FormatVersion[0] == 4 + }) + if err != nil { + return err + } + + indexesLength := int32(bytes.Uint32()) / 4 + if indexesLength <= ixMinLcccCp { + return errors.New("normalizer2 data: not enough indexes") + } + indexes := make([]int32, indexesLength) + indexes[0] = indexesLength * 4 + for i := int32(1); i < indexesLength; i++ { + indexes[i] = bytes.Int32() + } + + n.minDecompNoCP = indexes[ixMinDecompNoCp] + n.minCompNoMaybeCP = indexes[ixMinCompNoMaybeCp] + n.minLcccCP = indexes[ixMinLcccCp] + + n.minYesNo = uint16(indexes[ixMinYesNo]) + n.minYesNoMappingsOnly = uint16(indexes[ixMinYesNoMappingsOnly]) + n.minNoNo = uint16(indexes[ixMinNoNo]) + n.minNoNoCompBoundaryBefore = uint16(indexes[ixMinNoNoCompBoundaryBefore]) + n.minNoNoCompNoMaybeCC = uint16(indexes[ixMinNoNoCompNoMaybeCc]) + n.minNoNoEmpty = uint16(indexes[ixMinNoNoEmpty]) + n.limitNoNo = uint16(indexes[ixLimitNoNo]) + n.minMaybeYes = uint16(indexes[ixMinMaybeYes]) + + n.centerNoNoDelta = uint16(indexes[ixMinMaybeYes]>>deltaShift) - maxDelta - 1 + + offset := indexes[ixNormTrieOffset] + nextOffset := indexes[ixExtraDataOffset] + triePosition := bytes.Position() + + n.normTrie, err = utrie.UcpTrieFromBytes(bytes) + if err != nil { + return err + } + + trieLength := bytes.Position() - triePosition + if trieLength > nextOffset-offset { + return errors.New("normalizer2 data: not enough bytes for normTrie") + } + bytes.Skip((nextOffset - offset) - trieLength) // skip padding after trie bytes + + // Read the composition and mapping data. + offset = nextOffset + nextOffset = indexes[ixSmallFcdOffset] + numChars := (nextOffset - offset) / 2 + if numChars != 0 { + n.maybeYesCompositions = bytes.Uint16Slice(numChars) + n.extraData = n.maybeYesCompositions[((minNormalMaybeYes - n.minMaybeYes) >> offsetShift):] + } + + // smallFCD: new in formatVersion 2 + n.smallFCD = bytes.Uint8Slice(0x100) + return nil +} + +func Nfc() *Normalizer { + loadNormalizer() + return nfc +} + +func Nfkc() *Normalizer { + loadNormalizer() + return nfkc +} + +func (n *Normalizer) AddPropertyStarts(u *uset.UnicodeSet) { + var start, end rune + var value uint32 + for { + end, value = nfc.normTrie.GetRange(start, utrie.UcpMapRangeFixedLeadSurrogates, inert, nil) + if end < 0 { + break + } + u.AddRune(start) + if start != end && n.isAlgorithmicNoNo(uint16(value)) && (value&deltaTcccMask) > deltaTccc1 { + // Range of code points with same-norm16-value algorithmic decompositions. + // They might have different non-zero FCD16 values. 
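+			// Walk the rest of the range and add another start point whenever the
+			// FCD16 value changes, so that every span between consecutive start
+			// points has a uniform FCD16 value.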
+ prevFCD16 := n.GetFCD16(start) + for { + start++ + if start > end { + break + } + fcd16 := n.GetFCD16(start) + if fcd16 != prevFCD16 { + u.AddRune(start) + prevFCD16 = fcd16 + } + } + } + start = end + 1 + } + + // add Hangul LV syllables and LV+1 because of skippables + for c := hangulBase; c < hangulLimit; c += jamoTCount { + u.AddRune(c) + u.AddRune(c + 1) + } + u.AddRune(hangulLimit) +} + +func (n *Normalizer) isAlgorithmicNoNo(norm16 uint16) bool { + return n.limitNoNo <= norm16 && norm16 < n.minMaybeYes +} + +func (n *Normalizer) GetFCD16(c rune) uint16 { + if c < n.minDecompNoCP { + return 0 + } else if c <= 0xffff { + if !n.singleLeadMightHaveNonZeroFCD16(c) { + return 0 + } + } + return n.getFCD16FromNormData(c) +} + +func (n *Normalizer) singleLeadMightHaveNonZeroFCD16(lead rune) bool { + // 0<=lead<=0xffff + bits := n.smallFCD[lead>>8] + if bits == 0 { + return false + } + return ((bits >> ((lead >> 5) & 7)) & 1) != 0 +} + +func (n *Normalizer) getFCD16FromNormData(c rune) uint16 { + norm16 := n.getNorm16(c) + if norm16 >= n.limitNoNo { + if norm16 >= minNormalMaybeYes { + // combining mark + norm16 = uint16(n.getCCFromNormalYesOrMaybe(norm16)) + return norm16 | (norm16 << 8) + } else if norm16 >= n.minMaybeYes { + return 0 + } else { // isDecompNoAlgorithmic(norm16) + deltaTrailCC := norm16 & deltaTcccMask + if deltaTrailCC <= deltaTccc1 { + return deltaTrailCC >> offsetShift + } + // Maps to an isCompYesAndZeroCC. + c = n.mapAlgorithmic(c, norm16) + norm16 = n.getRawNorm16(c) + } + } + + if norm16 <= n.minYesNo || n.isHangulLVT(norm16) { + // no decomposition or Hangul syllable, all zeros + return 0 + } + // c decomposes, get everything from the variable-length extra data + mapping := n.getMapping(norm16) + firstUnit := mapping[1] + if firstUnit&mappingHasCccLcccWord != 0 { + norm16 |= mapping[0] & 0xff00 + } + return norm16 +} + +func (n *Normalizer) getMapping(norm16 uint16) []uint16 { + return n.extraData[(norm16>>offsetShift)-1:] +} + +func (n *Normalizer) getNorm16(c rune) uint16 { + if utf16.IsLead(c) { + return inert + } + return n.getRawNorm16(c) +} + +func (n *Normalizer) getRawNorm16(c rune) uint16 { + return uint16(n.normTrie.Get(c)) +} + +func (n *Normalizer) getCCFromNormalYesOrMaybe(norm16 uint16) uint8 { + return uint8(norm16 >> offsetShift) +} + +func (n *Normalizer) mapAlgorithmic(c rune, norm16 uint16) rune { + return c + rune(norm16>>deltaShift) - rune(n.centerNoNoDelta) +} + +func (n *Normalizer) isHangulLV(norm16 uint16) bool { + return norm16 == n.minYesNo +} + +func (n *Normalizer) isHangulLVT(norm16 uint16) bool { + return norm16 == n.hangulLVT() +} + +func (n *Normalizer) hangulLVT() uint16 { + return n.minYesNoMappingsOnly | hasCompBoundaryAfter +} + +func (n *Normalizer) getComposeQuickCheck(c rune) CheckResult { + return n.getCompQuickCheck(n.getNorm16(c)) +} + +func (n *Normalizer) getDecomposeQuickCheck(c rune) CheckResult { + if n.isDecompYes(n.getNorm16(c)) { + return Yes + } + return No +} + +func QuickCheck(c rune, mode Mode) CheckResult { + if mode <= NormNone || NormFcd <= mode { + return Yes + } + switch mode { + case NormNfc: + return Nfc().getComposeQuickCheck(c) + case NormNfd: + return Nfc().getDecomposeQuickCheck(c) + case NormNfkc: + return Nfkc().getComposeQuickCheck(c) + case NormNfkd: + return Nfkc().getDecomposeQuickCheck(c) + default: + return Maybe + } +} + +func IsInert(c rune, mode Mode) bool { + switch mode { + case NormNfc: + return Nfc().isCompInert(c) + case NormNfd: + return Nfc().isDecompInert(c) + case NormNfkc: + 
return Nfkc().isCompInert(c) + case NormNfkd: + return Nfkc().isDecompInert(c) + default: + return true + } +} + +func (n *Normalizer) isDecompYes(norm16 uint16) bool { + return norm16 < n.minYesNo || n.minMaybeYes <= norm16 +} + +func (n *Normalizer) getCompQuickCheck(norm16 uint16) CheckResult { + if norm16 < n.minNoNo || minYesYesWithCC <= norm16 { + return Yes + } else if n.minMaybeYes <= norm16 { + return Maybe + } else { + return No + } +} + +func (n *Normalizer) isMaybeOrNonZeroCC(norm16 uint16) bool { + return norm16 >= n.minMaybeYes +} + +func (n *Normalizer) isDecompNoAlgorithmic(norm16 uint16) bool { + return norm16 >= n.limitNoNo +} + +func (n *Normalizer) IsCompNo(c rune) bool { + norm16 := n.getNorm16(c) + return n.minNoNo <= norm16 && norm16 < n.minMaybeYes +} + +func (n *Normalizer) Decompose(c rune) []rune { + norm16 := n.getNorm16(c) + if c < n.minDecompNoCP || n.isMaybeOrNonZeroCC(norm16) { + // c does not decompose + return nil + } + var decomp []rune + + if n.isDecompNoAlgorithmic(norm16) { + // Maps to an isCompYesAndZeroCC. + c = n.mapAlgorithmic(c, norm16) + decomp = append(decomp, c) + // The mapping might decompose further. + norm16 = n.getRawNorm16(c) + } + if norm16 < n.minYesNo { + return decomp + } else if n.isHangulLV(norm16) || n.isHangulLVT(norm16) { + // Hangul syllable: decompose algorithmically + parts := hangulDecompose(c) + for len(parts) > 0 { + c = rune(parts[0]) + decomp = append(decomp, c) + parts = parts[1:] + } + return decomp + } + // c decomposes, get everything from the variable-length extra data + mapping := n.getMapping(norm16) + length := mapping[1] & mappingLengthMask + mapping = mapping[2 : 2+length] + + for len(mapping) > 0 { + c, mapping = utf16.NextUnsafe(mapping) + decomp = append(decomp, c) + } + + return decomp +} + +func hangulDecompose(c rune) []uint16 { + c -= hangulBase + c2 := c % jamoTCount + c /= jamoTCount + var buffer []uint16 + buffer = append(buffer, uint16(jamoLBase+c/jamoVCount)) + buffer = append(buffer, uint16(jamoVBase+c%jamoVCount)) + if c2 != 0 { + buffer = append(buffer, uint16(jamoTBase+c2)) + } + return buffer +} + +func (n *Normalizer) isCompInert(c rune) bool { + norm16 := n.getNorm16(c) + return n.isCompYesAndZeroCC(norm16) && (norm16&hasCompBoundaryAfter) != 0 +} + +func (n *Normalizer) isDecompInert(c rune) bool { + return n.isDecompYesAndZeroCC(n.getNorm16(c)) +} + +func (n *Normalizer) isCompYesAndZeroCC(norm16 uint16) bool { + return norm16 < n.minNoNo +} + +func (n *Normalizer) isDecompYesAndZeroCC(norm16 uint16) bool { + return norm16 < n.minYesNo || + norm16 == jamoVt || + (n.minMaybeYes <= norm16 && norm16 <= minNormalMaybeYes) +} + +func (n *Normalizer) CombiningClass(c rune) uint8 { + return n.getCC(n.getNorm16(c)) +} + +func (n *Normalizer) getCC(norm16 uint16) uint8 { + if norm16 >= minNormalMaybeYes { + return n.getCCFromNormalYesOrMaybe(norm16) + } + if norm16 < n.minNoNo || n.limitNoNo <= norm16 { + return 0 + } + return n.getCCFromNoNo(norm16) + +} + +func (n *Normalizer) getCCFromNoNo(norm16 uint16) uint8 { + mapping := n.getMapping(norm16) + if mapping[1]&mappingHasCccLcccWord != 0 { + return uint8(mapping[0]) + } + return 0 +} diff --git a/go/mysql/icuregex/internal/pattern/unescape.go b/go/mysql/icuregex/internal/pattern/unescape.go new file mode 100644 index 00000000000..e4a554ff612 --- /dev/null +++ b/go/mysql/icuregex/internal/pattern/unescape.go @@ -0,0 +1,314 @@ +/* +© 2016 and later: Unicode, Inc. and others. 
+Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package pattern + +import ( + "strings" + "unicode/utf8" + + "vitess.io/vitess/go/mysql/icuregex/internal/utf16" +) + +/* Convert one octal digit to a numeric value 0..7, or -1 on failure */ +func _digit8(c rune) rune { + if c >= 0x0030 && c <= 0x0037 { + return (c - 0x0030) + } + return -1 +} + +/* Convert one hex digit to a numeric value 0..F, or -1 on failure */ +func _digit16(c rune) rune { + if c >= 0x0030 && c <= 0x0039 { + return (c - 0x0030) + } + if c >= 0x0041 && c <= 0x0046 { + return (c - (0x0041 - 10)) + } + if c >= 0x0061 && c <= 0x0066 { + return (c - (0x0061 - 10)) + } + return -1 +} + +var unscapeMap = []byte{ + /*" 0x22, 0x22 */ + /*' 0x27, 0x27 */ + /*? 0x3F, 0x3F */ + /*\ 0x5C, 0x5C */ + /*a*/ 0x61, 0x07, + /*b*/ 0x62, 0x08, + /*e*/ 0x65, 0x1b, + /*f*/ 0x66, 0x0c, + /*n*/ 0x6E, 0x0a, + /*r*/ 0x72, 0x0d, + /*t*/ 0x74, 0x09, + /*v*/ 0x76, 0x0b, +} + +func Unescape(str string) (string, bool) { + var idx int + if idx = strings.IndexByte(str, '\\'); idx < 0 { + return str, true + } + + var result strings.Builder + result.WriteString(str[:idx]) + str = str[idx:] + + for len(str) > 0 { + if str[0] == '\\' { + var r rune + r, str = UnescapeAt(str[1:]) + if r < 0 { + return "", false + } + result.WriteRune(r) + } else { + result.WriteByte(str[0]) + str = str[1:] + } + } + return result.String(), true +} + +func UnescapeAt(str string) (rune, string) { + c, w := utf8.DecodeRuneInString(str) + str = str[w:] + if c == utf8.RuneError && (w == 0 || w == 1) { + return -1, str + } + + var minDig, maxDig, n int + var braces bool + var bitsPerDigit = 4 + var result rune + + switch c { + case 'u': + minDig = 4 + maxDig = 4 + case 'U': + minDig = 8 + maxDig = 8 + case 'x': + minDig = 1 + if len(str) > 0 && str[0] == '{' { + str = str[1:] + braces = true + maxDig = 8 + } else { + maxDig = 2 + } + default: + if dig := _digit8(c); dig >= 0 { + minDig = 1 + maxDig = 4 + n = 1 + bitsPerDigit = 3 + result = dig + } + } + + if minDig != 0 { + for n < maxDig && len(str) > 0 { + c, w = utf8.DecodeRuneInString(str) + if c == utf8.RuneError && w == 1 { + return -1, str + } + + var dig rune + if bitsPerDigit == 3 { + dig = _digit8(c) + } else { + dig = _digit16(c) + } + if dig < 0 { + break + } + result = (result << bitsPerDigit) | dig + str = str[w:] + n++ + } + if n < minDig { + return -1, str + } + if braces { + if c != '}' { + return -1, str + } + str = str[1:] + } + if result < 0 || result > utf8.MaxRune { + return -1, str + } + if len(str) > 0 && utf16.IsLead(result) { + c, w = utf8.DecodeRuneInString(str) + if c == utf8.RuneError && (w == 0 || w == 1) { + return -1, str + } + if c == '\\' { + var str2 string + c, str2 = UnescapeAt(str[1:]) + if utf16.IsTrail(c) { + result = 
utf16.DecodeRune(result, c) + str = str2 + } + } + } + return result, str + } + + if c < utf8.RuneSelf { + for i := 0; i < len(unscapeMap); i += 2 { + if byte(c) == unscapeMap[i] { + return rune(unscapeMap[i+1]), str + } + if byte(c) < unscapeMap[i] { + break + } + } + } + + if c == 'c' && len(str) > 0 { + c, w = utf8.DecodeRuneInString(str) + if c == utf8.RuneError && (w == 0 || w == 1) { + return -1, str + } + return 0x1f & c, str[w:] + } + + return c, str +} + +func UnescapeAtRunes(str []rune) (rune, []rune) { + if len(str) == 0 { + return -1, str + } + + c := str[0] + str = str[1:] + if c == utf8.RuneError { + return -1, str + } + + var minDig, maxDig, n int + var braces bool + var bitsPerDigit = 4 + var result rune + + switch c { + case 'u': + minDig = 4 + maxDig = 4 + case 'U': + minDig = 8 + maxDig = 8 + case 'x': + minDig = 1 + if len(str) > 0 && str[0] == '{' { + str = str[1:] + braces = true + maxDig = 8 + } else { + maxDig = 2 + } + default: + if dig := _digit8(c); dig >= 0 { + minDig = 1 + maxDig = 4 + n = 1 + bitsPerDigit = 3 + result = dig + } + } + + if minDig != 0 { + for n < maxDig && len(str) > 0 { + c = str[0] + if c == utf8.RuneError { + return -1, str + } + + var dig rune + if bitsPerDigit == 3 { + dig = _digit8(c) + } else { + dig = _digit16(c) + } + if dig < 0 { + break + } + result = (result << bitsPerDigit) | dig + str = str[1:] + n++ + } + if n < minDig { + return -1, str + } + if braces { + if c != '}' { + return -1, str + } + str = str[1:] + } + if result < 0 || result > utf8.MaxRune { + return -1, str + } + if len(str) > 0 && utf16.IsLead(result) { + c = str[0] + if c == utf8.RuneError { + return -1, str + } + if c == '\\' { + var str2 []rune + c, str2 = UnescapeAtRunes(str[1:]) + if utf16.IsTrail(c) { + result = utf16.DecodeRune(result, c) + str = str2 + } + } + } + return result, str + } + + if c < utf8.RuneSelf { + for i := 0; i < len(unscapeMap); i += 2 { + if byte(c) == unscapeMap[i] { + return rune(unscapeMap[i+1]), str + } + if byte(c) < unscapeMap[i] { + break + } + } + } + + if c == 'c' && len(str) > 0 { + c = str[0] + if c == utf8.RuneError { + return -1, str + } + return 0x1f & c, str[1:] + } + + return c, str +} diff --git a/go/mysql/icuregex/internal/pattern/unescape_test.go b/go/mysql/icuregex/internal/pattern/unescape_test.go new file mode 100644 index 00000000000..0bb76c2bfdb --- /dev/null +++ b/go/mysql/icuregex/internal/pattern/unescape_test.go @@ -0,0 +1,48 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
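The \u, \U, \x{...} and octal branches of UnescapeAt all share one accumulator: each digit shifts the running value left by bitsPerDigit and ORs the digit in. A minimal standalone model of that rule for hex escapes (illustrative only, not the package API):

```go
package main

import "fmt"

// hexValue reports the value of one hex digit, or -1 (mirrors _digit16 above).
func hexValue(c rune) rune {
	switch {
	case c >= '0' && c <= '9':
		return c - '0'
	case c >= 'a' && c <= 'f':
		return c - 'a' + 10
	case c >= 'A' && c <= 'F':
		return c - 'A' + 10
	}
	return -1
}

// accumulate applies the same rule UnescapeAt uses for \u, \U and \x{...}:
// result = (result << bitsPerDigit) | digit, with 4 bits per hex digit.
func accumulate(digits string) rune {
	var result rune
	for _, c := range digits {
		d := hexValue(c)
		if d < 0 {
			break
		}
		result = result<<4 | d
	}
	return result
}

func main() {
	fmt.Printf("%U %U\n", accumulate("0041"), accumulate("1F600")) // U+0041 U+1F600
}
```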
+*/ + +package pattern + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestUnescapeAt(t *testing.T) { + r, str := UnescapeAt("ud800\\ud800\\udc00") + assert.Equal(t, rune(0xd800), r) + assert.Equal(t, "\\ud800\\udc00", str) + + r, str = UnescapeAt(str[1:]) + assert.Equal(t, rune(0x00010000), r) + assert.Equal(t, "", str) +} + +func TestUnescapeAtRunes(t *testing.T) { + r, str := UnescapeAtRunes([]rune("ud800\\ud800\\udc00")) + assert.Equal(t, rune(0xd800), r) + assert.Equal(t, []rune("\\ud800\\udc00"), str) + + r, str = UnescapeAtRunes(str[1:]) + assert.Equal(t, rune(0x00010000), r) + assert.Equal(t, []rune(""), str) +} diff --git a/go/mysql/icuregex/internal/pattern/utils.go b/go/mysql/icuregex/internal/pattern/utils.go new file mode 100644 index 00000000000..4dcf55e9f42 --- /dev/null +++ b/go/mysql/icuregex/internal/pattern/utils.go @@ -0,0 +1,111 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package pattern + +import ( + "strings" + "unicode/utf8" +) + +var patternPropsLatin1 = [256]uint8{ + // WS: 9..D + 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // WS: 20 Syntax: 21..2F + 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + // Syntax: 3A..40 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Syntax: 5B..5E + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, + // Syntax: 60 + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Syntax: 7B..7E + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, + // WS: 85 + 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Syntax: A1..A7, A9, AB, AC, AE + 0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0, + // Syntax: B0, B1, B6, BB, BF + 3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Syntax: D7 + 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Syntax: F7 + 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, +} + +func IsWhitespace(c rune) bool { + if c < 0 { + return false + } else if c <= 0xff { + return (patternPropsLatin1[c]>>2)&1 != 0 + } else if 0x200e <= c && c <= 0x2029 { + return c <= 0x200f || 0x2028 <= c + } else { + return false + } +} + +func SkipWhitespace(str string) string { + for { + r, w := utf8.DecodeRuneInString(str) + if r == utf8.RuneError && (w == 0 || w == 1) { + return str[w:] + } + if !IsWhitespace(r) { + return str + } + str = str[w:] + } +} + +func IsUnprintable(c rune) bool { + return !(c >= 0x20 && c <= 0x7E) +} + +// "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" +var digits = [...]byte{ + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, + 65, 
66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, + 85, 86, 87, 88, 89, 90, +} + +func EscapeUnprintable(w *strings.Builder, c rune) { + w.WriteByte('\\') + if (c & ^0xFFFF) != 0 { + w.WriteByte('U') + w.WriteByte(digits[0xF&(c>>28)]) + w.WriteByte(digits[0xF&(c>>24)]) + w.WriteByte(digits[0xF&(c>>20)]) + w.WriteByte(digits[0xF&(c>>16)]) + } else { + w.WriteByte('u') + } + w.WriteByte(digits[0xF&(c>>12)]) + w.WriteByte(digits[0xF&(c>>8)]) + w.WriteByte(digits[0xF&(c>>4)]) + w.WriteByte(digits[0xF&c]) +} diff --git a/go/mysql/icuregex/internal/ubidi/ubidi.go b/go/mysql/icuregex/internal/ubidi/ubidi.go new file mode 100644 index 00000000000..195e2b1a6dd --- /dev/null +++ b/go/mysql/icuregex/internal/ubidi/ubidi.go @@ -0,0 +1,461 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ubidi + +import ( + "errors" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +const ( + ixIndexTop = iota + ixLength + ixTrieSize + ixMirrorLength + + ixJgStart + ixJgLimit + ixJgStart2 /* new in format version 2.2, ICU 54 */ + ixJgLimit2 + + maxValuesIndex + ixTop +) + +var ubidi struct { + indexes []int32 + trie *utrie.UTrie2 + mirrors []uint32 + jg []uint8 + jg2 []uint8 +} + +func readData(bytes *udata.Bytes) error { + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.DataFormat[0] == 0x42 && + info.DataFormat[1] == 0x69 && + info.DataFormat[2] == 0x44 && + info.DataFormat[3] == 0x69 && + info.FormatVersion[0] == 2 + }) + if err != nil { + return err + } + + count := int32(bytes.Uint32()) + if count < ixTop { + return errors.New("indexes[0] too small in ucase.icu") + } + + ubidi.indexes = make([]int32, count) + ubidi.indexes[0] = count + + for i := int32(1); i < count; i++ { + ubidi.indexes[i] = int32(bytes.Uint32()) + } + + ubidi.trie, err = utrie.UTrie2FromBytes(bytes) + if err != nil { + return err + } + + expectedTrieLength := ubidi.indexes[ixTrieSize] + trieLength := ubidi.trie.SerializedLength() + + if trieLength > expectedTrieLength { + return errors.New("ucase.icu: not enough bytes for the trie") + } + + bytes.Skip(expectedTrieLength - trieLength) + + if n := ubidi.indexes[ixMirrorLength]; n > 0 { + ubidi.mirrors = bytes.Uint32Slice(n) + } + if n := ubidi.indexes[ixJgLimit] - ubidi.indexes[ixJgStart]; n > 0 { + ubidi.jg = bytes.Uint8Slice(n) + } + if n := ubidi.indexes[ixJgLimit2] - ubidi.indexes[ixJgStart2]; n > 0 { + ubidi.jg2 = bytes.Uint8Slice(n) + } + + return nil +} + +func init() { + b := udata.NewBytes(icudata.UBidi) + if err := readData(b); err != nil { + panic(err) + } +} + +const ( + /* UBIDI_CLASS_SHIFT=0, */ /* 
bidi class: 5 bits (4..0) */ + jtShift = 5 /* joining type: 3 bits (7..5) */ + + bptShift = 8 /* Bidi_Paired_Bracket_Type(bpt): 2 bits (9..8) */ + + joinControlShift = 10 + bidiControlShift = 11 + + isMirroredShift = 12 /* 'is mirrored' */ +) + +/** + * Bidi Paired Bracket Type constants. + * + * @see UCHAR_BIDI_PAIRED_BRACKET_TYPE + * @stable ICU 52 + */ +type UPairedBracketType int32 + +/* + * Note: UBidiPairedBracketType constants are parsed by preparseucd.py. + * It matches lines like + * U_BPT_ + */ +const ( + /** Not a paired bracket. @stable ICU 52 */ + BptNone UPairedBracketType = iota + /** Open paired bracket. @stable ICU 52 */ + BptOpen + /** Close paired bracket. @stable ICU 52 */ + BptClose +) + +const classMask = 0x0000001f +const jtMask = 0x000000e0 +const bptMask = 0x00000300 + +/** + * Joining Type constants. + * + * @see UCHAR_JOINING_TYPE + * @stable ICU 2.2 + */ +type JoiningType int32 + +/* + * Note: UJoiningType constants are parsed by preparseucd.py. + * It matches lines like + * U_JT_ + */ +const ( + JtNonJoining JoiningType = iota /*[U]*/ + JtJoinCausing /*[C]*/ + JtDualJoining /*[D]*/ + JtLeftJoining /*[L]*/ + JtRightJoining /*[R]*/ + JtTransparent /*[T]*/ +) + +/** + * Joining Group constants. + * + * @see UCHAR_JOINING_GROUP + * @stable ICU 2.2 + */ +type JoiningGroup int32 + +/* + * Note: UJoiningGroup constants are parsed by preparseucd.py. + * It matches lines like + * U_JG_ + */ +const ( + JgNoJoiningGroup JoiningGroup = iota + JgAin + JgAlaph + JgAlef + JgBeh + JgBeth + JgDal + JgDalathRish + JgE + JgFeh + JgFinalSemkath + JgGaf + JgGamal + JgHah + JgTehMarbutaGoal /**< @stable ICU 4.6 */ + JgHe + JgHeh + JgHehGoal + JgHeth + JgKaf + JgKaph + JgKnottedHeh + JgLam + JgLamadh + JgMeem + JgMim + JgNoon + JgNun + JgPe + JgQaf + JgQaph + JgReh + JgReversedPe + JgSad + JgSadhe + JgSeen + JgSemkath + JgShin + JgSwashKaf + JgSyriacWaw + JgTah + JgTaw + JgTehMarbuta + JgTeth + JgWaw + JgYeh + JgYehBarree + JgYehWithTail + JgYudh + JgYudhHe + JgZain + JgFe /**< @stable ICU 2.6 */ + JgKhaph /**< @stable ICU 2.6 */ + JgZhain /**< @stable ICU 2.6 */ + JgBurushashkiYehBarree /**< @stable ICU 4.0 */ + JgFarsiYeh /**< @stable ICU 4.4 */ + JgNya /**< @stable ICU 4.4 */ + JgRohingyaYeh /**< @stable ICU 49 */ + JgManichaeanAleph /**< @stable ICU 54 */ + JgManichaeanAyin /**< @stable ICU 54 */ + JgManichaeanBeth /**< @stable ICU 54 */ + JgManichaeanDaleth /**< @stable ICU 54 */ + JgManichaeanDhamedh /**< @stable ICU 54 */ + JgManichaeanFive /**< @stable ICU 54 */ + JgManichaeanGimel /**< @stable ICU 54 */ + JgManichaeanHeth /**< @stable ICU 54 */ + JgManichaeanHundred /**< @stable ICU 54 */ + JgManichaeanKaph /**< @stable ICU 54 */ + JgManichaeanLamedh /**< @stable ICU 54 */ + JgManichaeanMem /**< @stable ICU 54 */ + JgManichaeanNun /**< @stable ICU 54 */ + JgManichaeanOne /**< @stable ICU 54 */ + JgManichaeanPe /**< @stable ICU 54 */ + JgManichaeanQoph /**< @stable ICU 54 */ + JgManichaeanResh /**< @stable ICU 54 */ + JgManichaeanSadhe /**< @stable ICU 54 */ + JgManichaeanSamekh /**< @stable ICU 54 */ + JgManichaeanTaw /**< @stable ICU 54 */ + JgManichaeanTen /**< @stable ICU 54 */ + JgManichaeanTeth /**< @stable ICU 54 */ + JgManichaeanThamedh /**< @stable ICU 54 */ + JgManichaeanTwenty /**< @stable ICU 54 */ + JgManichaeanWaw /**< @stable ICU 54 */ + JgManichaeanYodh /**< @stable ICU 54 */ + JgManichaeanZayin /**< @stable ICU 54 */ + JgStraightWaw /**< @stable ICU 54 */ + JgAfricanFeh /**< @stable ICU 58 */ + JgAfricanNoon /**< @stable ICU 58 */ + JgAfricanQaf /**< @stable 
ICU 58 */ + + JgMalayalamBha /**< @stable ICU 60 */ + JgMalayalamJa /**< @stable ICU 60 */ + JgMalayalamLla /**< @stable ICU 60 */ + JgMalayalamLlla /**< @stable ICU 60 */ + JgMalayalamNga /**< @stable ICU 60 */ + JgMalayalamNna /**< @stable ICU 60 */ + JgMalayalamNnna /**< @stable ICU 60 */ + JgMalayalamNya /**< @stable ICU 60 */ + JgMalayalamRa /**< @stable ICU 60 */ + JgMalayalamSsa /**< @stable ICU 60 */ + JgMalayalamTta /**< @stable ICU 60 */ + + JgHanafiRohingyaKinnaYa /**< @stable ICU 62 */ + JgHanafiRohingyaPa /**< @stable ICU 62 */ + + JgThinYeh /**< @stable ICU 70 */ + JgVerticalTail /**< @stable ICU 70 */ +) + +/** + * This specifies the language directional property of a character set. + * @stable ICU 2.0 + */ +type CharDirection int32 + +/* + * Note: UCharDirection constants and their API comments are parsed by preparseucd.py. + * It matches pairs of lines like + * / ** comment... * / + * U_<[A-Z_]+> = , + */ + +const ( + /** L @stable ICU 2.0 */ + LeftToRight CharDirection = 0 + /** R @stable ICU 2.0 */ + RightToLeft CharDirection = 1 + /** EN @stable ICU 2.0 */ + EuropeanNumber CharDirection = 2 + /** ES @stable ICU 2.0 */ + EuropeanNumberSeparator CharDirection = 3 + /** ET @stable ICU 2.0 */ + EuropeanNumberTerminator CharDirection = 4 + /** AN @stable ICU 2.0 */ + ArabicNumber CharDirection = 5 + /** CS @stable ICU 2.0 */ + CommonNumberSeparator CharDirection = 6 + /** B @stable ICU 2.0 */ + BlockSeparator CharDirection = 7 + /** S @stable ICU 2.0 */ + SegmentSeparator CharDirection = 8 + /** WS @stable ICU 2.0 */ + WhiteSpaceNeutral CharDirection = 9 + /** ON @stable ICU 2.0 */ + OtherNeutral CharDirection = 10 + /** LRE @stable ICU 2.0 */ + LeftToRightEmbedding CharDirection = 11 + /** LRO @stable ICU 2.0 */ + LeftToRightOverride CharDirection = 12 + /** AL @stable ICU 2.0 */ + RightToLeftArabic CharDirection = 13 + /** RLE @stable ICU 2.0 */ + RightToLeftEmbedding CharDirection = 14 + /** RLO @stable ICU 2.0 */ + RightToLeftOverride CharDirection = 15 + /** PDF @stable ICU 2.0 */ + PopDirectionalFormat CharDirection = 16 + /** NSM @stable ICU 2.0 */ + DirNonSpacingMark CharDirection = 17 + /** BN @stable ICU 2.0 */ + BoundaryNeutral CharDirection = 18 + /** FSI @stable ICU 52 */ + StrongIsolate CharDirection = 19 + /** LRI @stable ICU 52 */ + LeftToRightIsolate CharDirection = 20 + /** RLI @stable ICU 52 */ + RightToLeftIsolate CharDirection = 21 + /** PDI @stable ICU 52 */ + PopDirectionalIsolate CharDirection = 22 +) + +type propertySet interface { + AddRune(ch rune) + AddRuneRange(from rune, to rune) +} + +func AddPropertyStarts(sa propertySet) { + /* add the start code point of each same-value range of the trie */ + ubidi.trie.Enum(nil, func(start, _ rune, _ uint32) bool { + sa.AddRune(start) + return true + }) + + /* add the code points from the bidi mirroring table */ + length := ubidi.indexes[ixMirrorLength] + for i := int32(0); i < length; i++ { + c := mirrorCodePoint(rune(ubidi.mirrors[i])) + sa.AddRuneRange(c, c+1) + } + + /* add the code points from the Joining_Group array where the value changes */ + start := ubidi.indexes[ixJgStart] + limit := ubidi.indexes[ixJgLimit] + jgArray := ubidi.jg[:] + for { + prev := uint8(0) + for start < limit { + jg := jgArray[0] + jgArray = jgArray[1:] + if jg != prev { + sa.AddRune(start) + prev = jg + } + start++ + } + if prev != 0 { + /* add the limit code point if the last value was not 0 (it is now start==limit) */ + sa.AddRune(limit) + } + if limit == ubidi.indexes[ixJgLimit] { + /* switch to the second Joining_Group 
range */ + start = ubidi.indexes[ixJgStart2] + limit = ubidi.indexes[ixJgLimit2] + jgArray = ubidi.jg2[:] + } else { + break + } + } + + /* add code points with hardcoded properties, plus the ones following them */ + + /* (none right now) */ +} + +func HasFlag(props uint16, shift int) bool { + return ((props >> shift) & 1) != 0 +} + +func mirrorCodePoint(m rune) rune { + return m & 0x1fffff +} + +func IsJoinControl(c rune) bool { + props := ubidi.trie.Get16(c) + return HasFlag(props, joinControlShift) +} + +func JoinType(c rune) JoiningType { + props := ubidi.trie.Get16(c) + return JoiningType((props & jtMask) >> jtShift) +} + +func JoinGroup(c rune) JoiningGroup { + start := ubidi.indexes[ixJgStart] + limit := ubidi.indexes[ixJgLimit] + if start <= c && c < limit { + return JoiningGroup(ubidi.jg[c-start]) + } + start = ubidi.indexes[ixJgStart2] + limit = ubidi.indexes[ixJgLimit2] + if start <= c && c < limit { + return JoiningGroup(ubidi.jg2[c-start]) + } + return JgNoJoiningGroup +} + +func IsMirrored(c rune) bool { + props := ubidi.trie.Get16(c) + return HasFlag(props, isMirroredShift) +} + +func IsBidiControl(c rune) bool { + props := ubidi.trie.Get16(c) + return HasFlag(props, bidiControlShift) +} + +func PairedBracketType(c rune) UPairedBracketType { + props := ubidi.trie.Get16(c) + return UPairedBracketType((props & bptMask) >> bptShift) +} + +func Class(c rune) CharDirection { + props := ubidi.trie.Get16(c) + return CharDirection(props & classMask) +} diff --git a/go/mysql/icuregex/internal/ucase/fold.go b/go/mysql/icuregex/internal/ucase/fold.go new file mode 100644 index 00000000000..88d4f026c65 --- /dev/null +++ b/go/mysql/icuregex/internal/ucase/fold.go @@ -0,0 +1,243 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ucase + +import ( + "math/bits" + + "vitess.io/vitess/go/mysql/icuregex/internal/utf16" +) + +func FoldRunes(str []rune) []rune { + out := make([]rune, 0, len(str)) + for _, c := range str { + r, exp := FullFolding(c) + if exp == nil { + out = append(out, r) + continue + } + + for len(exp) > 0 { + r, exp = utf16.NextUnsafe(exp) + out = append(out, r) + } + } + return out +} + +/* + - Case folding is similar to lowercasing. + - The result may be a simple mapping, i.e., a single code point, or + - a full mapping, i.e., a string. + - If the case folding for a code point is the same as its simple (1:1) lowercase mapping, + - then only the lowercase mapping is stored. + * + - Some special cases are hardcoded because their conditions cannot be + - parsed and processed from CaseFolding.txt. + * + - Unicode 3.2 CaseFolding.txt specifies for its status field: + +# C: common case folding, common mappings shared by both simple and full mappings. 
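IsJoinControl, JoinType, JoinGroup, PairedBracketType and Class in ubidi.go above all read the same 16-bit word out of the trie and unpack different bit fields from it. A small sketch of that unpacking; the masks and shifts mirror the constants above, while the sample property word is invented:

```go
package main

import "fmt"

// Field layout of the ubidi property word, as in the constants above:
// bits 4..0 bidi class, 7..5 joining type, 9..8 paired bracket type,
// bit 10 Join_Control, bit 11 Bidi_Control, bit 12 Bidi_Mirrored.
const (
	classMask      = 0x001f
	jtMask         = 0x00e0
	jtShift        = 5
	bptMask        = 0x0300
	bptShift       = 8
	joinControlBit = 1 << 10
	isMirroredBit  = 1 << 12
)

func main() {
	// Hypothetical property word: class 13 (AL), joining type 2 (dual),
	// bracket type 1 (open), mirrored.
	props := uint16(13 | 2<<jtShift | 1<<bptShift | isMirroredBit)

	fmt.Println("class:", props&classMask)                   // 13
	fmt.Println("joining type:", (props&jtMask)>>jtShift)    // 2
	fmt.Println("bracket type:", (props&bptMask)>>bptShift)  // 1
	fmt.Println("join control:", props&joinControlBit != 0)  // false
	fmt.Println("mirrored:", props&isMirroredBit != 0)       // true
}
```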
+# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces. +# S: simple case folding, mappings to single characters where different from F. +# T: special case for uppercase I and dotted uppercase I +# - For non-Turkic languages, this mapping is normally not used. +# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. +# +# Usage: +# A. To do a simple case folding, use the mappings with status C + S. +# B. To do a full case folding, use the mappings with status C + F. +# +# The mappings with status T can be used or omitted depending on the desired case-folding +# behavior. (The default option is to exclude them.) + + - Unicode 3.2 has 'T' mappings as follows: + +0049; T; 0131; # LATIN CAPITAL LETTER I +0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE + + - while the default mappings for these code points are: + +0049; C; 0069; # LATIN CAPITAL LETTER I +0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE + + - U+0130 has no simple case folding (simple-case-folds to itself). +*/ +func Fold(c rune) rune { + props := ucase.trie.Get16(c) + if !hasException(props) { + if isUpperOrTitle(props) { + c += getDelta(props) + } + } else { + pe := getExceptions(props) + excWord := pe[0] + pe = pe[1:] + if (excWord & excConditionalFold) != 0 { + /* special case folding mappings, hardcoded */ + /* default mappings */ + if c == 0x49 { + /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ + return 0x69 + } else if c == 0x130 { + /* no simple case folding for U+0130 */ + return c + } + } + if (excWord & excNoSimpleCaseFolding) != 0 { + return c + } + if hasSlot(excWord, excDelta) && isUpperOrTitle(props) { + var delta int32 + delta, _ = getSlotValue(excWord, excDelta, pe) + if excWord&excDeltaIsNegative == 0 { + return c + delta + } + return c - delta + } + + var idx int32 + if hasSlot(excWord, excFold) { + idx = excFold + } else if hasSlot(excWord, excLower) { + idx = excLower + } else { + return c + } + c, _ = getSlotValue(excWord, idx, pe) + } + return c +} + +func FullFolding(c rune) (rune, []uint16) { + result := c + props := ucase.trie.Get16(c) + + if !hasException(props) { + if isUpperOrTitle(props) { + result = c + getDelta(props) + } + return result, nil + } + + pe := getExceptions(props) + excWord := pe[0] + pe = pe[1:] + var idx int32 + + if excWord&excConditionalFold != 0 { + /* use hardcoded conditions and mappings */ + /* default mappings */ + if c == 0x49 { + /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ + return 0x69, nil + } else if c == 0x130 { + /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ + return -1, []uint16{0x69, 0x307} + } + } else if hasSlot(excWord, excFullMappings) { + full, pe := getSlotValue(excWord, excFullMappings, pe) + + /* start of full case mapping strings */ + pe = pe[1:] + + /* skip the lowercase result string */ + pe = pe[full&fullLower:] + full = (full >> 4) & 0xf + + if full != 0 { + /* set the output pointer to the result string */ + return -1, pe[:full] + } + } + + if excWord&excNoSimpleCaseFolding != 0 { + return result, nil + } + if hasSlot(excWord, excDelta) && isUpperOrTitle(props) { + delta, _ := getSlotValue(excWord, excDelta, pe) + if excWord&excDeltaIsNegative == 0 { + return c + delta, nil + } + return c - delta, nil + } + if hasSlot(excWord, excFold) { + idx = excFold + } else if hasSlot(excWord, excLower) { + idx = excLower + } else { + return c, nil + } + result, _ = getSlotValue(excWord, idx, pe) + return result, 
nil +} + +const ( + excLower = iota + excFold + excUpper + excTitle + excDelta + exc5 /* reserved */ + excClosure + excFullMappings +) + +const ( + /* complex/conditional mappings */ + excConditionalSpecial = 0x4000 + excConditionalFold = 0x8000 + excNoSimpleCaseFolding = 0x200 + excDeltaIsNegative = 0x400 + excSensitive = 0x800 + + excDoubleSlots = 0x100 +) + +func isUpperOrTitle(props uint16) bool { + return props&2 != 0 +} + +func getDelta(props uint16) rune { + return rune(int16(props) >> 7) +} + +func getExceptions(props uint16) []uint16 { + return ucase.exceptions[props>>4:] +} + +func hasSlot(flags uint16, idx int32) bool { + return (flags & (1 << idx)) != 0 +} + +func slotOffset(flags uint16, idx int32) int { + return bits.OnesCount8(uint8(flags & ((1 << idx) - 1))) +} + +func getSlotValue(excWord uint16, idx int32, pExc16 []uint16) (int32, []uint16) { + if excWord&excDoubleSlots == 0 { + pExc16 = pExc16[slotOffset(excWord, idx):] + return int32(pExc16[0]), pExc16 + } + pExc16 = pExc16[2*slotOffset(excWord, idx):] + return (int32(pExc16[0]) << 16) | int32(pExc16[1]), pExc16[1:] +} diff --git a/go/mysql/icuregex/internal/ucase/ucase.go b/go/mysql/icuregex/internal/ucase/ucase.go new file mode 100644 index 00000000000..9fb8407ea66 --- /dev/null +++ b/go/mysql/icuregex/internal/ucase/ucase.go @@ -0,0 +1,425 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
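slotOffset above locates an optional slot inside the exception data by counting how many lower-numbered slots are present, i.e. a popcount over the flag bits below idx. A tiny standalone illustration of that indexing rule with an invented flag word:

```go
package main

import (
	"fmt"
	"math/bits"
)

// Slot numbers as in the const block above.
const (
	excLower = iota
	excFold
	excUpper
	excTitle
	excDelta
)

// slotOffset mirrors the helper above: present slots are stored densely, so
// the offset of slot idx is the number of present slots with a smaller number.
func slotOffset(flags uint16, idx uint) int {
	below := (uint16(1) << idx) - 1 // mask of all slot bits lower than idx
	return bits.OnesCount16(flags & below)
}

func main() {
	// Invented exception word: only the lower, upper and title slots exist.
	flags := uint16(1<<excLower | 1<<excUpper | 1<<excTitle)

	fmt.Println(slotOffset(flags, excLower)) // 0: nothing below it
	fmt.Println(slotOffset(flags, excUpper)) // 1: only excLower is present below
	fmt.Println(slotOffset(flags, excTitle)) // 2: excLower and excUpper precede it
	fmt.Println(slotOffset(flags, excDelta)) // 3: would follow the three present slots
}
```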
+*/ + +package ucase + +import ( + "errors" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/utf16" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +var ucase struct { + trie *utrie.UTrie2 + exceptions []uint16 + unfold []uint16 +} + +const ( + ixIndexTop = 0 + ixLength = 1 + ixTrieSize = 2 + ixExcLength = 3 + ixUnfoldLength = 4 + ixMaxFullLength = 15 + ixTop = 16 +) + +func readData(bytes *udata.Bytes) error { + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.DataFormat[0] == 0x63 && + info.DataFormat[1] == 0x41 && + info.DataFormat[2] == 0x53 && + info.DataFormat[3] == 0x45 && + info.FormatVersion[0] == 4 + }) + if err != nil { + return err + } + + count := int32(bytes.Uint32()) + if count < ixTop { + return errors.New("indexes[0] too small in ucase.icu") + } + + indexes := make([]int32, count) + indexes[0] = count + + for i := int32(1); i < count; i++ { + indexes[i] = int32(bytes.Uint32()) + } + + ucase.trie, err = utrie.UTrie2FromBytes(bytes) + if err != nil { + return err + } + + expectedTrieLength := indexes[ixTrieSize] + trieLength := ucase.trie.SerializedLength() + + if trieLength > expectedTrieLength { + return errors.New("ucase.icu: not enough bytes for the trie") + } + + bytes.Skip(expectedTrieLength - trieLength) + + if n := indexes[ixExcLength]; n > 0 { + ucase.exceptions = bytes.Uint16Slice(n) + } + if n := indexes[ixUnfoldLength]; n > 0 { + ucase.unfold = bytes.Uint16Slice(n) + } + + return nil +} + +func init() { + b := udata.NewBytes(icudata.UCase) + if err := readData(b); err != nil { + panic(err) + } +} + +type propertySet interface { + AddRune(ch rune) +} + +func AddPropertyStarts(sa propertySet) { + /* add the start code point of each same-value range of the trie */ + ucase.trie.Enum(nil, func(start, _ rune, _ uint32) bool { + sa.AddRune(start) + return true + }) + + /* add code points with hardcoded properties, plus the ones following them */ + + /* (none right now, see comment below) */ + + /* + * Omit code points with hardcoded specialcasing properties + * because we do not build property UnicodeSets for them right now. + */ +} + +const ( + fullMappingsMaxLength = (4 * 0xf) + closureMaxLength = 0xf + + fullLower = 0xf + fullFolding = 0xf0 + fullUpper = 0xf00 + fullTitle = 0xf000 +) + +func AddCaseClosure(c rune, sa propertySet) { + /* + * Hardcode the case closure of i and its relatives and ignore the + * data file data for these characters. + * The Turkic dotless i and dotted I with their case mapping conditions + * and case folding option make the related characters behave specially. + * This code matches their closure behavior to their case folding behavior. 
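The comment above explains why the case closure of i, I, dotted İ and dotless ı is hardcoded instead of taken from the data file. A toy illustration of the behaviour that hardcoding produces, with a throwaway set type standing in for the propertySet interface (none of this is the package's API):

```go
package main

import "fmt"

// runeSet is a stand-in for the propertySet interface used above.
type runeSet map[rune]bool

func (s runeSet) AddRune(c rune) { s[c] = true }

// addCaseClosureOfI reproduces only the hardcoded branch described above:
// plain i and I close over each other, dotted İ (U+0130) only closes over a
// multi-character string (which the regex engine drops), and dotless ı
// (U+0131) is in a class by itself.
func addCaseClosureOfI(c rune, s runeSet) {
	switch c {
	case 'I':
		s.AddRune('i')
	case 'i':
		s.AddRune('I')
	case 0x130, 0x131:
		// no single-code-point closure
	}
}

func main() {
	for _, c := range []rune{'I', 'i', 0x130, 0x131} {
		s := runeSet{}
		addCaseClosureOfI(c, s)
		fmt.Printf("%c -> %v\n", c, s)
	}
}
```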
+ */ + + switch c { + case 0x49: + /* regular i and I are in one equivalence class */ + sa.AddRune(0x69) + return + case 0x69: + sa.AddRune(0x49) + return + case 0x130: + /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */ + // the Regex engine calls removeAllStrings() on all UnicodeSets, so we don't need to insert them + // sa->addString(sa->set, iDot, 2); + return + case 0x131: + /* dotless i is in a class by itself */ + return + default: + /* otherwise use the data file data */ + break + } + + props := ucase.trie.Get16(c) + if !hasException(props) { + if getPropsType(props) != None { + /* add the one simple case mapping, no matter what type it is */ + delta := getDelta(props) + if delta != 0 { + sa.AddRune(c + delta) + } + } + } else { + /* + * c has exceptions, so there may be multiple simple and/or + * full case mappings. Add them all. + */ + pe := getExceptions(props) + excWord := pe[0] + pe = pe[1:] + var idx int32 + var closure []uint16 + + /* add all simple case mappings */ + for idx = excLower; idx <= excTitle; idx++ { + if hasSlot(excWord, idx) { + c, _ = getSlotValue(excWord, idx, pe) + sa.AddRune(c) + } + } + if hasSlot(excWord, excDelta) { + delta, _ := getSlotValue(excWord, excDelta, pe) + if excWord&excDeltaIsNegative == 0 { + sa.AddRune(c + delta) + } else { + sa.AddRune(c - delta) + } + } + + /* get the closure string pointer & length */ + if hasSlot(excWord, excClosure) { + closureLength, pe1 := getSlotValue(excWord, excClosure, pe) + closureLength &= closureMaxLength /* higher bits are reserved */ + closure = pe1[1 : 1+closureLength] /* behind this slot, unless there are full case mappings */ + } + + /* add the full case folding */ + if hasSlot(excWord, excFullMappings) { + fullLength, pe1 := getSlotValue(excWord, excFullMappings, pe) + + /* start of full case mapping strings */ + pe1 = pe1[1:] + + fullLength &= 0xffff /* bits 16 and higher are reserved */ + + /* skip the lowercase result string */ + pe1 = pe1[fullLength&fullLower:] + fullLength >>= 4 + + /* skip adding the case folding strings */ + length := fullLength & 0xf + pe1 = pe1[length:] + + /* skip the uppercase and titlecase strings */ + fullLength >>= 4 + pe1 = pe1[fullLength&0xf:] + fullLength >>= 4 + pe1 = pe1[fullLength:] + + closure = pe1[:len(closure)] + } + + /* add each code point in the closure string */ + for len(closure) > 0 { + c, closure = utf16.NextUnsafe(closure) + sa.AddRune(c) + } + } +} + +const dotMask = 0x60 + +const ( + noDot = 0 /* normal characters with cc=0 */ + softDotted = 0x20 /* soft-dotted characters with cc=0 */ + above = 0x40 /* "above" accents with cc=230 */ + otherAccent = 0x60 /* other accent character (0> excDotShift) & dotMask) +} + +func IsCaseSensitive(c rune) bool { + props := ucase.trie.Get16(c) + if !hasException(props) { + return (props & sensitive) != 0 + } + pe := getExceptions(props) + return (pe[0] & excSensitive) != 0 +} + +func ToFullLower(c rune) rune { + // The sign of the result has meaning, input must be non-negative so that it can be returned as is. 
+ result := c + props := ucase.trie.Get16(c) + if !hasException(props) { + if isUpperOrTitle(props) { + result = c + getDelta(props) + } + } else { + pe := getExceptions(props) + excWord := pe[0] + pe = pe[1:] + + if excWord&excConditionalSpecial != 0 { + /* use hardcoded conditions and mappings */ + if c == 0x130 { + return 2 + } + /* no known conditional special case mapping, use a normal mapping */ + } else if hasSlot(excWord, excFullMappings) { + full, _ := getSlotValue(excWord, excFullMappings, pe) + full = full & fullLower + if full != 0 { + /* return the string length */ + return full + } + } + + if hasSlot(excWord, excDelta) && isUpperOrTitle(props) { + delta, _ := getSlotValue(excWord, excDelta, pe) + if (excWord & excDeltaIsNegative) == 0 { + return c + delta + } + return c - delta + } + if hasSlot(excWord, excLower) { + result, _ = getSlotValue(excWord, excLower, pe) + } + } + + if result == c { + return ^result + } + return result +} + +func ToFullUpper(c rune) rune { + return toUpperOrTitle(c, true) +} + +func ToFullTitle(c rune) rune { + return toUpperOrTitle(c, false) +} + +func toUpperOrTitle(c rune, upperNotTitle bool) rune { + result := c + props := ucase.trie.Get16(c) + if !hasException(props) { + if getPropsType(props) == Lower { + result = c + getDelta(props) + } + } else { + pe := getExceptions(props) + excWord := pe[0] + pe = pe[1:] + + if excWord&excConditionalSpecial != 0 { + if c == 0x0587 { + return 2 + } + /* no known conditional special case mapping, use a normal mapping */ + } else if hasSlot(excWord, excFullMappings) { + full, _ := getSlotValue(excWord, excFullMappings, pe) + + /* skip the lowercase and case-folding result strings */ + full >>= 8 + + if upperNotTitle { + full &= 0xf + } else { + /* skip the uppercase result string */ + full = (full >> 4) & 0xf + } + + if full != 0 { + /* return the string length */ + return full + } + } + + if hasSlot(excWord, excDelta) && getPropsType(props) == Lower { + delta, _ := getSlotValue(excWord, excDelta, pe) + if (excWord & excDeltaIsNegative) == 0 { + return c + delta + } + return c - delta + } + var idx int32 + if !upperNotTitle && hasSlot(excWord, excTitle) { + idx = excTitle + } else if hasSlot(excWord, excUpper) { + /* here, titlecase is same as uppercase */ + idx = excUpper + } else { + return ^c + } + result, _ = getSlotValue(excWord, idx, pe) + } + + if result == c { + return ^result + } + return result +} + +func GetTypeOrIgnorable(c rune) int32 { + props := ucase.trie.Get16(c) + return int32(props & 7) +} + +type Type int32 + +const ( + None Type = iota + Lower + Upper + Title +) + +const typeMask = 3 + +func GetType(c rune) Type { + props := ucase.trie.Get16(c) + return getPropsType(props) +} + +func getPropsType(props uint16) Type { + return Type(props & typeMask) +} diff --git a/go/mysql/icuregex/internal/uchar/constants.go b/go/mysql/icuregex/internal/uchar/constants.go new file mode 100644 index 00000000000..1ab96751b5c --- /dev/null +++ b/go/mysql/icuregex/internal/uchar/constants.go @@ -0,0 +1,240 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
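ToFullLower and toUpperOrTitle above overload their return value: the bitwise complement of the input means the character maps to itself, a small positive value is the length of a full mapping string, and anything else is the mapped code point. A hedged sketch of how a caller might branch on that convention; the 31-unit cutoff is an assumption borrowed from ICU, not something stated in this file:

```go
package main

import "fmt"

// maxFullMappingLength is an assumption borrowed from ICU (full case mappings
// are at most 31 UTF-16 units); the package may use a different cutoff.
const maxFullMappingLength = 31

// describe interprets a result using the ToFullLower/toUpperOrTitle convention:
// ^c (negative) means "maps to itself", a small positive value is the length
// of a full mapping string, anything else is the mapped code point.
func describe(c, res rune) string {
	switch {
	case res < 0:
		return fmt.Sprintf("%U maps to itself", ^res)
	case res <= maxFullMappingLength:
		return fmt.Sprintf("%U has a full mapping of %d units", c, res)
	default:
		return fmt.Sprintf("%U maps to %U", c, res)
	}
}

func main() {
	fmt.Println(describe('A', 'a'))  // simple mapping: A -> a
	fmt.Println(describe('a', ^'a')) // already lowercase: maps to itself
	fmt.Println(describe(0x130, 2))  // İ lowercases to a 2-unit string
}
```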
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uchar + +import "golang.org/x/exp/constraints" + +func uMask[T constraints.Integer](x T) uint32 { + return 1 << x +} + +type Category int8 + +const ( + /* + * Note: UCharCategory constants and their API comments are parsed by preparseucd.py. + * It matches pairs of lines like + * / ** comment... * / + * U_<[A-Z_]+> = , + */ + + /** Non-category for unassigned and non-character code points. @stable ICU 2.0 */ + Unassigned Category = 0 + /** Cn "Other, Not Assigned (no characters in [UnicodeData.txt] have this property)" (same as U_UNASSIGNED!) @stable ICU 2.0 */ + GeneralOtherTypes Category = iota - 1 + /** Lu @stable ICU 2.0 */ + UppercaseLetter + /** Ll @stable ICU 2.0 */ + LowercaseLetter + /** Lt @stable ICU 2.0 */ + TitlecaseLetter + /** Lm @stable ICU 2.0 */ + ModifierLetter + /** Lo @stable ICU 2.0 */ + OtherLetter + /** Mn @stable ICU 2.0 */ + NonSpacingMask + /** Me @stable ICU 2.0 */ + EnclosingMark + /** Mc @stable ICU 2.0 */ + CombiningSpacingMask + /** Nd @stable ICU 2.0 */ + DecimalDigitNumber + /** Nl @stable ICU 2.0 */ + LetterNumber + /** No @stable ICU 2.0 */ + OtherNumber + /** Zs @stable ICU 2.0 */ + SpaceSeparator + /** Zl @stable ICU 2.0 */ + LineSeparator + /** Zp @stable ICU 2.0 */ + ParagraphSeparator + /** Cc @stable ICU 2.0 */ + ControlChar + /** Cf @stable ICU 2.0 */ + FormatChar + /** Co @stable ICU 2.0 */ + PrivateUseChar + /** Cs @stable ICU 2.0 */ + Surrogate + /** Pd @stable ICU 2.0 */ + DashPunctuation + /** Ps @stable ICU 2.0 */ + StartPunctuation + /** Pe @stable ICU 2.0 */ + EndPunctuation + /** Pc @stable ICU 2.0 */ + ConnectorPunctuation + /** Po @stable ICU 2.0 */ + OtherPunctuation + /** Sm @stable ICU 2.0 */ + MathSymbol + /** Sc @stable ICU 2.0 */ + CurrencySymbol + /** Sk @stable ICU 2.0 */ + ModifierSymbol + /** So @stable ICU 2.0 */ + OtherSymbol + /** Pi @stable ICU 2.0 */ + InitialPunctuation + /** Pf @stable ICU 2.0 */ + FinalPunctuation + /** + * One higher than the last enum UCharCategory constant. + * This numeric value is stable (will not change), see + * http://www.unicode.org/policies/stability_policy.html#Property_Value + * + * @stable ICU 2.0 + */ + CharCategoryCount +) + +var ( + GcCnMask = uMask(GeneralOtherTypes) + + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcLuMask = uMask(UppercaseLetter) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcLlMask = uMask(LowercaseLetter) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcLtMask = uMask(TitlecaseLetter) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcLmMask = uMask(ModifierLetter) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcLoMask = uMask(OtherLetter) + + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcMnMask = uMask(NonSpacingMask) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcMeMask = uMask(EnclosingMark) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcMcMask = uMask(CombiningSpacingMask) + + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcNdMask = uMask(DecimalDigitNumber) + /** Mask constant for a UCharCategory. 
@stable ICU 2.1 */ + GcNlMask = uMask(LetterNumber) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcNoMask = uMask(OtherNumber) + + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcZsMask = uMask(SpaceSeparator) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcZlMask = uMask(LineSeparator) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcZpMask = uMask(ParagraphSeparator) + + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcCcMask = uMask(ControlChar) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcCfMask = uMask(FormatChar) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcCoMask = uMask(PrivateUseChar) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcCsMask = uMask(Surrogate) + + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcPdMask = uMask(DashPunctuation) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcPsMask = uMask(StartPunctuation) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcPeMask = uMask(EndPunctuation) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcPcMask = uMask(ConnectorPunctuation) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcPoMask = uMask(OtherPunctuation) + + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcSmMask = uMask(MathSymbol) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcScMask = uMask(CurrencySymbol) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcSkMask = uMask(ModifierSymbol) + /** Mask constant for a UCharCategory. @stable ICU 2.1 */ + GcSoMask = uMask(OtherSymbol) + + /** Mask constant for multiple UCharCategory bits (L Letters). @stable ICU 2.1 */ + GcLMask = (GcLuMask | GcLlMask | GcLtMask | GcLmMask | GcLoMask) + + /** Mask constant for multiple UCharCategory bits (LC Cased Letters). @stable ICU 2.1 */ + GcLcMask = (GcLuMask | GcLlMask | GcLtMask) + + /** Mask constant for multiple UCharCategory bits (M Marks). @stable ICU 2.1 */ + GcMMask = (GcMnMask | GcMeMask | GcMcMask) + + /** Mask constant for multiple UCharCategory bits (N Numbers). @stable ICU 2.1 */ + GcNMask = (GcNdMask | GcNlMask | GcNoMask) + + /** Mask constant for multiple UCharCategory bits (Z Separators). @stable ICU 2.1 */ + GcZMask = (GcZsMask | GcZlMask | GcZpMask) +) + +const upropsAgeShift = 24 +const maxVersionLength = 4 +const versionDelimiter = '.' + +type UVersionInfo [maxVersionLength]uint8 + +const ( + /** No numeric value. 
*/ + UPropsNtvNone = 0 + /** Decimal digits: nv=0..9 */ + UPropsNtvDecimalStart = 1 + /** Other digits: nv=0..9 */ + UPropsNtvDigitStart = 11 + /** Small integers: nv=0..154 */ + UPropsNtvNumericStart = 21 + /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */ + UPropsNtvFractionStart = 0xb0 + /** + * Large integers: + * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33) + * (only one significant decimal digit) + */ + UPropsNtvLargeStart = 0x1e0 + /** + * Sexagesimal numbers: + * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4) + */ + UPropsNtvBase60Start = 0x300 + /** + * Fraction-20 values: + * frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640 + * numerator: num = 2*(frac20&3)+1 + * denominator: den = 20<<(frac20>>2) + */ + UPropsNtvFraction20Start = UPropsNtvBase60Start + 36 // 0x300+9*4=0x324 + /** + * Fraction-32 values: + * frac32 = ntv-0x34c = 0..15 -> 1|3|5|7 / 32|64|128|256 + * numerator: num = 2*(frac32&3)+1 + * denominator: den = 32<<(frac32>>2) + */ + UPropsNtvFraction32Start = UPropsNtvFraction20Start + 24 // 0x324+6*4=0x34c + /** No numeric value (yet). */ + UPropsNtvReservedStart = UPropsNtvFraction32Start + 16 // 0x34c+4*4=0x35c + + UPropsNtvMaxSmallInt = UPropsNtvFractionStart - UPropsNtvNumericStart - 1 +) + +const noNumericValue = -123456789.0 diff --git a/go/mysql/icuregex/internal/uchar/uchar.go b/go/mysql/icuregex/internal/uchar/uchar.go new file mode 100644 index 00000000000..a2c758ea1c0 --- /dev/null +++ b/go/mysql/icuregex/internal/uchar/uchar.go @@ -0,0 +1,405 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
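The Gc*Mask values above are one-bit-per-category masks, so a general-category test becomes a single AND against a combined mask (IsGraphPOSIX in uchar.go below relies on the same trick). A standalone sketch of the idiom with a local stand-in for uMask; the category numbers match the Category constants above:

```go
package main

import "fmt"

// A few category values as in the Category enum above.
const (
	uppercaseLetter = 1
	lowercaseLetter = 2
	decimalDigit    = 9
	spaceSeparator  = 12
)

// mask turns a category number into its single-bit mask, like uMask above.
func mask(cat uint) uint32 { return 1 << cat }

func main() {
	// Combined "letter" mask, analogous to GcLMask (simplified here to Lu|Ll).
	letters := mask(uppercaseLetter) | mask(lowercaseLetter)

	for _, cat := range []uint{uppercaseLetter, decimalDigit, spaceSeparator} {
		fmt.Printf("category %d is letter: %v\n", cat, mask(cat)&letters != 0)
	}
}
```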
+*/ + +package uchar + +import ( + "errors" + "strconv" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +var uprops struct { + trie *utrie.UTrie2 + trie2 *utrie.UTrie2 + vectorsColumns int32 + vectors []uint32 + scriptExtensions []uint16 +} + +func readData(bytes *udata.Bytes) error { + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.DataFormat[0] == 0x55 && + info.DataFormat[1] == 0x50 && + info.DataFormat[2] == 0x72 && + info.DataFormat[3] == 0x6f && + info.FormatVersion[0] == 7 + }) + if err != nil { + return err + } + + propertyOffset := bytes.Int32() + /* exceptionOffset = */ bytes.Int32() + /* caseOffset = */ bytes.Int32() + additionalOffset := bytes.Int32() + additionalVectorsOffset := bytes.Int32() + uprops.vectorsColumns = bytes.Int32() + scriptExtensionsOffset := bytes.Int32() + reservedOffset7 := bytes.Int32() + /* reservedOffset8 = */ bytes.Int32() + /* dataTopOffset = */ bytes.Int32() + _ = bytes.Int32() + _ = bytes.Int32() + bytes.Skip((16 - 12) << 2) + + uprops.trie, err = utrie.UTrie2FromBytes(bytes) + if err != nil { + return err + } + + expectedTrieLength := (propertyOffset - 16) * 4 + trieLength := uprops.trie.SerializedLength() + + if trieLength > expectedTrieLength { + return errors.New("ucase.icu: not enough bytes for the trie") + } + + bytes.Skip(expectedTrieLength - trieLength) + bytes.Skip((additionalOffset - propertyOffset) * 4) + + if uprops.vectorsColumns > 0 { + uprops.trie2, err = utrie.UTrie2FromBytes(bytes) + if err != nil { + return err + } + + expectedTrieLength = (additionalVectorsOffset - additionalOffset) * 4 + trieLength = uprops.trie2.SerializedLength() + + if trieLength > expectedTrieLength { + return errors.New("ucase.icu: not enough bytes for the trie") + } + + bytes.Skip(expectedTrieLength - trieLength) + uprops.vectors = bytes.Uint32Slice(scriptExtensionsOffset - additionalVectorsOffset) + } + + if n := (reservedOffset7 - scriptExtensionsOffset) * 2; n > 0 { + uprops.scriptExtensions = bytes.Uint16Slice(n) + } + + return nil +} + +func init() { + b := udata.NewBytes(icudata.UProps) + if err := readData(b); err != nil { + panic(err) + } +} + +type PropertySet interface { + AddRune(ch rune) +} + +func VecAddPropertyStarts(sa PropertySet) { + uprops.trie2.Enum(nil, func(start, _ rune, _ uint32) bool { + sa.AddRune(start) + return true + }) +} + +const ( + tab = 0x0009 + lf = 0x000a + ff = 0x000c + cr = 0x000d + nbsp = 0x00a0 + cgj = 0x034f + figuresp = 0x2007 + hairsp = 0x200a + zwnj = 0x200c + zwj = 0x200d + rlm = 0x200f + nnbsp = 0x202f + zwnbsp = 0xfef +) + +func AddPropertyStarts(sa PropertySet) { + /* add the start code point of each same-value range of the main trie */ + uprops.trie.Enum(nil, func(start, _ rune, _ uint32) bool { + sa.AddRune(start) + return true + }) + + /* add code points with hardcoded properties, plus the ones following them */ + + /* add for u_isblank() */ + sa.AddRune(tab) + sa.AddRune(tab + 1) + + /* add for IS_THAT_CONTROL_SPACE() */ + sa.AddRune(cr + 1) /* range TAB..CR */ + sa.AddRune(0x1c) + sa.AddRune(0x1f + 1) + sa.AddRune(0x85) // NEXT LINE (NEL) + sa.AddRune(0x85 + 1) + + /* add for u_isIDIgnorable() what was not added above */ + sa.AddRune(0x7f) /* range DEL..NBSP-1, NBSP added below */ + sa.AddRune(hairsp) + sa.AddRune(rlm + 1) + sa.AddRune(0x206a) // INHIBIT SYMMETRIC SWAPPING + sa.AddRune(0x206f + 1) // NOMINAL DIGIT SHAPES + sa.AddRune(zwnbsp) + sa.AddRune(zwnbsp 
+ 1) + + /* add no-break spaces for u_isWhitespace() what was not added above */ + sa.AddRune(nbsp) + sa.AddRune(nbsp + 1) + sa.AddRune(figuresp) + sa.AddRune(figuresp + 1) + sa.AddRune(nnbsp) + sa.AddRune(nnbsp + 1) + + /* add for u_digit() */ + sa.AddRune('a') + sa.AddRune('z' + 1) + sa.AddRune('A') + sa.AddRune('Z' + 1) + // fullwidth + sa.AddRune('a') + sa.AddRune('z' + 1) + sa.AddRune('A') + sa.AddRune('Z' + 1) + + /* add for u_isxdigit() */ + sa.AddRune('f' + 1) + sa.AddRune('F' + 1) + // fullwidth + sa.AddRune('f' + 1) + sa.AddRune('F' + 1) + + /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */ + sa.AddRune(0x2060) /* range 2060..206f */ + sa.AddRune(0xfff0) + sa.AddRune(0xfffb + 1) + sa.AddRune(0xe0000) + sa.AddRune(0xe0fff + 1) + + /* add for UCHAR_GRAPHEME_BASE and others */ + sa.AddRune(cgj) + sa.AddRune(cgj + 1) +} + +func CharType(c rune) Category { + props := uprops.trie.Get16(c) + return getCategory(props) +} + +func GetProperties(c rune) uint16 { + return uprops.trie.Get16(c) +} + +func getCategory(props uint16) Category { + return Category(props & 0x1f) +} + +func GetUnicodeProperties(c rune, column int) uint32 { + if column >= int(uprops.vectorsColumns) { + return 0 + } + vecIndex := uprops.trie2.Get16(c) + return uprops.vectors[int(vecIndex)+column] +} + +func ScriptExtension(idx uint32) uint16 { + return uprops.scriptExtensions[idx] +} + +func ScriptExtensions(idx uint32) []uint16 { + return uprops.scriptExtensions[idx:] +} + +func IsDigit(c rune) bool { + return CharType(c) == DecimalDigitNumber +} + +func IsPOSIXPrint(c rune) bool { + return CharType(c) == SpaceSeparator || IsGraphPOSIX(c) +} + +func IsGraphPOSIX(c rune) bool { + props := uprops.trie.Get16(c) + /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */ + /* comparing ==0 returns FALSE for the categories mentioned */ + return uMask(getCategory(props))&(GcCcMask|GcCsMask|GcCnMask|GcZMask) == 0 +} + +func IsXDigit(c rune) bool { + /* check ASCII and Fullwidth ASCII a-fA-F */ + if (c <= 0x66 && c >= 0x41 && (c <= 0x46 || c >= 0x61)) || + (c >= 0xff21 && c <= 0xff46 && (c <= 0xff26 || c >= 0xff41)) { + return true + } + return IsDigit(c) +} + +func IsBlank(c rune) bool { + if c <= 0x9f { + return c == 9 || c == 0x20 /* TAB or SPACE */ + } + /* Zs */ + return CharType(c) == SpaceSeparator +} + +func CharAge(c rune) UVersionInfo { + version := GetUnicodeProperties(c, 0) >> upropsAgeShift + return UVersionInfo{uint8(version >> 4), uint8(version & 0xf), 0, 0} +} + +func VersionFromString(str string) (version UVersionInfo) { + part := 0 + for len(str) > 0 && part < maxVersionLength { + if str[0] == versionDelimiter { + str = str[1:] + } + str, version[part] = parseInt(str) + part++ + } + return +} + +// parseInt is simplified but aims to mimic strtoul usage +// as it is used for ICU version parsing. 
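CharAge above packs the Unicode version in which a character was assigned into two nibbles, and VersionFromString parses dotted strings into the same four-byte form, so an "introduced by Unicode X.Y" test is a field-by-field comparison. A sketch of that comparison; the array layout follows UVersionInfo above and the helper name is invented:

```go
package main

import "fmt"

// versionInfo mirrors UVersionInfo above: major, minor, milli, micro.
type versionInfo [4]uint8

// atLeast reports whether version a is >= b, comparing fields in order,
// which is how an age check is usually phrased.
func atLeast(a, b versionInfo) bool {
	for i := range a {
		if a[i] != b[i] {
			return a[i] > b[i]
		}
	}
	return true
}

func main() {
	// Hypothetical ages, in the nibble-decoded form CharAge returns:
	// a character introduced in Unicode 3.2 vs one from 11.0.
	age32 := versionInfo{3, 2, 0, 0}
	age110 := versionInfo{11, 0, 0, 0}
	cutoff := versionInfo{6, 0, 0, 0}

	fmt.Println(atLeast(age32, cutoff))  // false: 3.2 predates 6.0
	fmt.Println(atLeast(age110, cutoff)) // true: 11.0 is at least 6.0
}
```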
+func parseInt(str string) (string, uint8) { + if str == "" { + return str, 0 + } + + start := 0 + end := 0 +whitespace: + for i := 0; i < len(str); i++ { + switch str[i] { + case ' ', '\f', '\n', '\r', '\t', '\v': + start++ + continue + default: + break whitespace + } + } + str = str[start:] + + for i := 0; i < len(str); i++ { + if str[i] < '0' || str[i] > '9' { + end = i + break + } + end++ + } + + val, err := strconv.ParseUint(str[start:end], 10, 8) + if err != nil { + return str[end:], 0 + } + return str[end:], uint8(val) +} + +const upropsNumericTypeValueShift = 6 + +func NumericTypeValue(c rune) uint16 { + props := uprops.trie.Get16(c) + return props >> upropsNumericTypeValueShift +} + +func NumericValue(c rune) float64 { + ntv := int32(NumericTypeValue(c)) + + if ntv == UPropsNtvNone { + return noNumericValue + } else if ntv < UPropsNtvDigitStart { + /* decimal digit */ + return float64(ntv - UPropsNtvDecimalStart) + } else if ntv < UPropsNtvNumericStart { + /* other digit */ + return float64(ntv - UPropsNtvDigitStart) + } else if ntv < UPropsNtvFractionStart { + /* small integer */ + return float64(ntv - UPropsNtvNumericStart) + } else if ntv < UPropsNtvLargeStart { + /* fraction */ + numerator := (ntv >> 4) - 12 + denominator := (ntv & 0xf) + 1 + return float64(numerator) / float64(denominator) + } else if ntv < UPropsNtvBase60Start { + /* large, single-significant-digit integer */ + mant := (ntv >> 5) - 14 + exp := (ntv & 0x1f) + 2 + numValue := float64(mant) + + /* multiply by 10^exp without math.h */ + for exp >= 4 { + numValue *= 10000. + exp -= 4 + } + switch exp { + case 3: + numValue *= 1000.0 + case 2: + numValue *= 100.0 + case 1: + numValue *= 10.0 + case 0: + default: + } + + return numValue + } else if ntv < UPropsNtvFraction20Start { + /* sexagesimal (base 60) integer */ + numValue := (ntv >> 2) - 0xbf + exp := (ntv & 3) + 1 + + switch exp { + case 4: + numValue *= 60 * 60 * 60 * 60 + case 3: + numValue *= 60 * 60 * 60 + case 2: + numValue *= 60 * 60 + case 1: + numValue *= 60 + case 0: + default: + } + + return float64(numValue) + } else if ntv < UPropsNtvFraction32Start { + // fraction-20 e.g. 3/80 + frac20 := ntv - UPropsNtvFraction20Start // 0..0x17 + numerator := 2*(frac20&3) + 1 + denominator := 20 << (frac20 >> 2) + return float64(numerator) / float64(denominator) + } else if ntv < UPropsNtvReservedStart { + // fraction-32 e.g. 3/64 + frac32 := ntv - UPropsNtvFraction32Start // 0..15 + numerator := 2*(frac32&3) + 1 + denominator := 32 << (frac32 >> 2) + return float64(numerator) / float64(denominator) + } else { + /* reserved */ + return noNumericValue + } +} diff --git a/go/mysql/icuregex/internal/udata/udata.go b/go/mysql/icuregex/internal/udata/udata.go new file mode 100644 index 00000000000..f20f8be1efa --- /dev/null +++ b/go/mysql/icuregex/internal/udata/udata.go @@ -0,0 +1,155 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
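NumericValue above decodes the packed numeric-type/value field with several arithmetic schemes (digits, fractions, single-significant-digit large integers, base 60, and the fraction-20/32 forms). A few worked decodings of hand-picked ntv codes under the encodings documented in constants.go; this replays three branches for illustration and is not the package API:

```go
package main

import "fmt"

// decode replays three branches of NumericValue above for hand-picked ntv
// codes; it is an illustration of the encodings, not the package API.
func decode(ntv int32) float64 {
	switch {
	case ntv >= 1 && ntv < 11: // decimal digit: nv = ntv-1
		return float64(ntv - 1)
	case ntv >= 0xb0 && ntv < 0x1e0: // fraction: ((ntv>>4)-12) / ((ntv&0xf)+1)
		return float64((ntv>>4)-12) / float64((ntv&0xf)+1)
	case ntv >= 0x1e0 && ntv < 0x300: // large: ((ntv>>5)-14) * 10^((ntv&0x1f)+2)
		value := float64((ntv >> 5) - 14)
		for exp := (ntv & 0x1f) + 2; exp > 0; exp-- {
			value *= 10
		}
		return value
	}
	return -1 // other encodings not reproduced here
}

func main() {
	fmt.Println(decode(8))     // 7: the digit seven
	fmt.Println(decode(0xd1))  // 0.5: numerator 1, denominator 2
	fmt.Println(decode(0x1e0)) // 100: mantissa 1, exponent 2
}
```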
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package udata + +import ( + "encoding/binary" + "errors" + "unsafe" +) + +type DataInfo struct { + /** sizeof(UDataInfo) + * @stable ICU 2.0 */ + Size uint16 + + /** unused, set to 0 + * @stable ICU 2.0*/ + ReservedWord uint16 + + /* platform data properties */ + /** 0 for little-endian machine, 1 for big-endian + * @stable ICU 2.0 */ + IsBigEndian uint8 + + /** see U_CHARSET_FAMILY values in utypes.h + * @stable ICU 2.0*/ + CharsetFamily uint8 + + /** sizeof(UChar), one of { 1, 2, 4 } + * @stable ICU 2.0*/ + SizeofUChar uint8 + + /** unused, set to 0 + * @stable ICU 2.0*/ + ReservedByte uint8 + + /** data format identifier + * @stable ICU 2.0*/ + DataFormat [4]uint8 + + /** versions: [0] major [1] minor [2] milli [3] micro + * @stable ICU 2.0*/ + FormatVersion [4]uint8 + + /** versions: [0] major [1] minor [2] milli [3] micro + * @stable ICU 2.0*/ + DataVersion [4]uint8 +} + +type Bytes struct { + buf []byte + orig []byte + enc binary.ByteOrder +} + +func NewBytes(b []byte) *Bytes { + return &Bytes{buf: b, orig: b, enc: binary.LittleEndian} +} + +func (b *Bytes) ReadHeader(isValid func(info *DataInfo) bool) error { + type MappedData struct { + headerSize uint16 + magic1 uint8 + magic2 uint8 + } + + type DataHeader struct { + dataHeader MappedData + info DataInfo + } + + data := unsafe.SliceData(b.buf) + header := (*DataHeader)(unsafe.Pointer(data)) + + if header.dataHeader.magic1 != 0xda || header.dataHeader.magic2 != 0x27 { + return errors.New("invalid magic number") + } + + if header.info.IsBigEndian != 0 { + return errors.New("unsupported: BigEndian data source") + } + + if !isValid(&header.info) { + return errors.New("failed to validate data header") + } + + b.buf = b.buf[header.dataHeader.headerSize:] + return nil +} + +func (b *Bytes) Uint8() uint8 { + u := b.buf[0] + b.buf = b.buf[1:] + return u +} +func (b *Bytes) Uint16() uint16 { + u := b.enc.Uint16(b.buf) + b.buf = b.buf[2:] + return u +} + +func (b *Bytes) Uint16Slice(size int32) []uint16 { + s := unsafe.Slice((*uint16)(unsafe.Pointer(unsafe.SliceData(b.buf))), size) + b.buf = b.buf[2*size:] + return s +} + +func (b *Bytes) Uint32Slice(size int32) []uint32 { + s := unsafe.Slice((*uint32)(unsafe.Pointer(unsafe.SliceData(b.buf))), size) + b.buf = b.buf[4*size:] + return s +} + +func (b *Bytes) Uint32() uint32 { + u := b.enc.Uint32(b.buf) + b.buf = b.buf[4:] + return u +} + +func (b *Bytes) Int32() int32 { + return int32(b.Uint32()) +} + +func (b *Bytes) Skip(size int32) { + b.buf = b.buf[size:] +} + +func (b *Bytes) Uint8Slice(n int32) []uint8 { + s := b.buf[:n] + b.buf = b.buf[n:] + return s +} + +func (b *Bytes) Position() int32 { + return int32(len(b.orig) - len(b.buf)) +} diff --git a/go/mysql/icuregex/internal/ulayout/ulayout.go b/go/mysql/icuregex/internal/ulayout/ulayout.go new file mode 100644 index 00000000000..dbf21d9460b --- /dev/null +++ b/go/mysql/icuregex/internal/ulayout/ulayout.go @@ -0,0 +1,128 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. 
+ +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ulayout + +import ( + "errors" + "sync" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/utrie" +) + +var inpcTrie *utrie.UcpTrie +var inscTrie *utrie.UcpTrie +var voTrie *utrie.UcpTrie + +const ( + ixInpcTrieTop = 1 + ixInscTrieTop = 2 + ixVoTrieTop = 3 + + ixCount = 12 +) + +func InpcTrie() *utrie.UcpTrie { + loadLayouts() + return inpcTrie +} + +func InscTrie() *utrie.UcpTrie { + loadLayouts() + return inscTrie +} + +func VoTrie() *utrie.UcpTrie { + loadLayouts() + return voTrie +} + +var layoutsOnce sync.Once + +func loadLayouts() { + layoutsOnce.Do(func() { + b := udata.NewBytes(icudata.ULayout) + if err := readData(b); err != nil { + panic(err) + } + }) +} + +func readData(bytes *udata.Bytes) error { + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.DataFormat[0] == 0x4c && + info.DataFormat[1] == 0x61 && + info.DataFormat[2] == 0x79 && + info.DataFormat[3] == 0x6f && + info.FormatVersion[0] == 1 + }) + if err != nil { + return err + } + + startPos := bytes.Position() + indexesLength := int32(bytes.Uint32()) // inIndexes[IX_INDEXES_LENGTH] + if indexesLength < ixCount { + return errors.New("text layout properties data: not enough indexes") + } + index := make([]int32, indexesLength) + index[0] = indexesLength + for i := int32(1); i < indexesLength; i++ { + index[i] = int32(bytes.Uint32()) + } + + offset := indexesLength * 4 + top := index[ixInpcTrieTop] + trieSize := top - offset + if trieSize >= 16 { + inpcTrie, err = utrie.UcpTrieFromBytes(bytes) + if err != nil { + return err + } + } + + pos := bytes.Position() - startPos + bytes.Skip(top - pos) + offset = top + top = index[ixInscTrieTop] + trieSize = top - offset + if trieSize >= 16 { + inscTrie, err = utrie.UcpTrieFromBytes(bytes) + if err != nil { + return err + } + } + + pos = bytes.Position() - startPos + bytes.Skip(top - pos) + offset = top + top = index[ixVoTrieTop] + trieSize = top - offset + if trieSize >= 16 { + voTrie, err = utrie.UcpTrieFromBytes(bytes) + if err != nil { + return err + } + } + return nil +} diff --git a/go/mysql/icuregex/internal/unames/unames.go b/go/mysql/icuregex/internal/unames/unames.go new file mode 100644 index 00000000000..45920be8292 --- /dev/null +++ b/go/mysql/icuregex/internal/unames/unames.go @@ -0,0 +1,471 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package unames + +import ( + "bytes" + "strconv" + "strings" + "sync" + + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" +) + +var charNamesOnce sync.Once +var charNames *unames + +type unames struct { + tokens []uint16 + tokenStrings []uint8 + groups []uint16 + groupNames []uint8 + algNames []algorithmicRange +} + +func loadCharNames() { + charNamesOnce.Do(func() { + b := udata.NewBytes(icudata.UNames) + if err := b.ReadHeader(func(info *udata.DataInfo) bool { + return info.Size >= 20 && + info.IsBigEndian == 0 && + info.CharsetFamily == 0 && + info.DataFormat[0] == 0x75 && /* dataFormat="unam" */ + info.DataFormat[1] == 0x6e && + info.DataFormat[2] == 0x61 && + info.DataFormat[3] == 0x6d && + info.FormatVersion[0] == 1 + }); err != nil { + panic(err) + } + + tokenStringOffset := int32(b.Uint32() - 16) + groupsOffset := int32(b.Uint32() - 16) + groupStringOffset := int32(b.Uint32() - 16) + algNamesOffset := int32(b.Uint32() - 16) + charNames = &unames{ + tokens: b.Uint16Slice(tokenStringOffset / 2), + tokenStrings: b.Uint8Slice(groupsOffset - tokenStringOffset), + groups: b.Uint16Slice((groupStringOffset - groupsOffset) / 2), + groupNames: b.Uint8Slice(algNamesOffset - groupStringOffset), + } + + algCount := b.Uint32() + charNames.algNames = make([]algorithmicRange, 0, algCount) + + for i := uint32(0); i < algCount; i++ { + ar := algorithmicRange{ + start: b.Uint32(), + end: b.Uint32(), + typ: b.Uint8(), + variant: b.Uint8(), + } + size := b.Uint16() + switch ar.typ { + case 0: + ar.s = b.Uint8Slice(int32(size) - 12) + case 1: + ar.factors = b.Uint16Slice(int32(ar.variant)) + ar.s = b.Uint8Slice(int32(size) - 12 - int32(ar.variant)*2) + } + charNames.algNames = append(charNames.algNames, ar) + } + }) +} + +func (names *unames) getGroupName(group []uint16) []uint8 { + return names.groupNames[names.getGroupOffset(group):] +} + +type NameChoice int32 + +const ( + UnicodeCharName NameChoice = iota + /** + * The Unicode_1_Name property value which is of little practical value. + * Beginning with ICU 49, ICU APIs return an empty string for this name choice. + * @deprecated ICU 49 + */ + Unicode10CharName + /** Standard or synthetic character name. @stable ICU 2.0 */ + ExtendedCharName + /** Corrected name from NameAliases.txt. 
@stable ICU 4.4 */ + CharNameAlias +) + +type algorithmicRange struct { + start, end uint32 + typ, variant uint8 + factors []uint16 + s []uint8 +} + +func (ar *algorithmicRange) findAlgName(otherName string) rune { + switch ar.typ { + case 0: + s := ar.s + + for s[0] != 0 && len(otherName) > 0 { + if s[0] != otherName[0] { + return -1 + } + s = s[1:] + otherName = otherName[1:] + } + + var code rune + count := int(ar.variant) + for i := 0; i < count && len(otherName) > 0; i++ { + c := rune(otherName[0]) + otherName = otherName[1:] + if '0' <= c && c <= '9' { + code = (code << 4) | (c - '0') + } else if 'A' <= c && c <= 'F' { + code = (code << 4) | (c - 'A' + 10) + } else { + return -1 + } + } + + if len(otherName) == 0 && ar.start <= uint32(code) && uint32(code) <= ar.end { + return code + } + case 1: + factors := ar.factors + s := ar.s + + for s[0] != 0 && len(otherName) > 0 { + if s[0] != otherName[0] { + return -1 + } + s = s[1:] + otherName = otherName[1:] + } + s = s[1:] + + start := rune(ar.start) + limit := rune(ar.end + 1) + + var indexes [8]uint16 + var buf strings.Builder + var elements [8][]byte + var elementBases [8][]byte + + ar.writeFactorSuffix0(factors, s, &buf, &elements, &elementBases) + if buf.String() == otherName { + return start + } + + for start+1 < limit { + start++ + i := len(factors) + + for { + i-- + idx := indexes[i] + 1 + if idx < factors[i] { + indexes[i] = idx + s = elements[i] + s = s[bytes.IndexByte(s, 0)+1:] + elements[i] = s + break + } + + indexes[i] = 0 + elements[i] = elementBases[i] + } + + t := otherName + for i = 0; i < len(factors); i++ { + s = elements[i] + + for s[0] != 0 && len(t) > 0 { + if s[0] != t[0] { + s = nil + i = 99 + break + } + s = s[1:] + t = t[1:] + } + } + if i < 99 && len(t) == 0 { + return start + } + } + } + return -1 +} + +func (ar *algorithmicRange) writeFactorSuffix0(factors []uint16, s []uint8, buf *strings.Builder, elements, elementBases *[8][]byte) { + /* write each element */ + for i := 0; i < len(factors); i++ { + (*elements)[i] = s + (*elementBases)[i] = s + + nul := bytes.IndexByte(s, 0) + buf.Write(s[:nul]) + s = s[nul+1:] + + factor := int(factors[i] - 1) + for factor > 0 { + s = s[bytes.IndexByte(s, 0)+1:] + factor-- + } + } +} + +func CharForName(nameChoice NameChoice, name string) rune { + loadCharNames() + + lower := strings.ToLower(name) + upper := strings.ToUpper(name) + + if lower[0] == '<' { + if nameChoice == ExtendedCharName && lower[len(lower)-1] == '>' { + if limit := strings.LastIndexByte(lower, '-'); limit >= 2 { + cp, err := strconv.ParseUint(lower[limit+1:len(lower)-1], 16, 32) + if err != nil || cp > 0x10ffff { + return -1 + } + return rune(cp) + } + } + return -1 + } + + for _, ar := range charNames.algNames { + if cp := ar.findAlgName(upper); cp != -1 { + return cp + } + } + + return charNames.enumNames(0, 0x10ffff+1, upper, nameChoice) +} + +const groupShift = 5 +const linesPerGroup = 1 << groupShift +const groupMask = linesPerGroup - 1 + +const ( + groupMsb = iota + groupOffsetHigh + groupOffsetLow + groupLength +) + +func (names *unames) enumNames(start, limit rune, otherName string, nameChoice NameChoice) rune { + startGroupMSB := uint16(start >> groupShift) + endGroupMSB := uint16((limit - 1) >> groupShift) + + group := names.getGroup(start) + + if startGroupMSB < group[groupMsb] && nameChoice == ExtendedCharName { + extLimit := rune(group[groupMsb]) << groupShift + if extLimit > limit { + extLimit = limit + } + start = extLimit + } + + if startGroupMSB == endGroupMSB { + if startGroupMSB == 
group[groupMsb] { + return names.enumGroupNames(group, start, limit-1, otherName, nameChoice) + } + } else { + if startGroupMSB == group[groupMsb] { + if start&groupMask != 0 { + if cp := names.enumGroupNames(group, start, (rune(startGroupMSB)< group[groupMsb] { + group = group[groupLength:] + } + + for len(group) > 0 && group[groupMsb] < endGroupMSB { + start = rune(group[groupMsb]) << groupShift + if cp := names.enumGroupNames(group, start, start+linesPerGroup-1, otherName, nameChoice); cp != -1 { + return cp + } + group = group[groupLength:] + } + + if len(group) > 0 && group[groupMsb] == endGroupMSB { + return names.enumGroupNames(group, (limit-1)&^groupMask, limit-1, otherName, nameChoice) + } + } + + return -1 +} + +func (names *unames) getGroup(code rune) []uint16 { + groups := names.groups + groupMSB := uint16(code >> groupShift) + + start := 0 + groupCount := int(groups[0]) + limit := groupCount + groups = groups[1:] + + for start < limit-1 { + number := (start + limit) / 2 + if groupMSB < groups[number*groupLength+groupMsb] { + limit = number + } else { + start = number + } + } + + return groups[start*groupLength : (groupCount-start)*groupLength] +} + +func (names *unames) getGroupOffset(group []uint16) uint32 { + return (uint32(group[groupOffsetHigh]) << 16) | uint32(group[groupOffsetLow]) +} + +func (names *unames) enumGroupNames(group []uint16, start, end rune, otherName string, choice NameChoice) rune { + var offsets [linesPerGroup + 2]uint16 + var lengths [linesPerGroup + 2]uint16 + + s := names.getGroupName(group) + s = expandGroupLengths(s, offsets[:0], lengths[:0]) + + for start < end { + name := s[offsets[start&groupMask]:] + nameLen := lengths[start&groupMask] + if names.compareName(name[:nameLen], choice, otherName) { + return start + } + start++ + } + return -1 +} + +func expandGroupLengths(s []uint8, offsets []uint16, lengths []uint16) []uint8 { + /* read the lengths of the 32 strings in this group and get each string's offset */ + var i, offset, length uint16 + var lengthByte uint8 + + /* all 32 lengths must be read to get the offset of the first group string */ + for i < linesPerGroup { + lengthByte = s[0] + s = s[1:] + + /* read even nibble - MSBs of lengthByte */ + if length >= 12 { + /* double-nibble length spread across two bytes */ + length = ((length&0x3)<<4 | uint16(lengthByte)>>4) + 12 + lengthByte &= 0xf + } else if (lengthByte /* &0xf0 */) >= 0xc0 { + /* double-nibble length spread across this one byte */ + length = (uint16(lengthByte) & 0x3f) + 12 + } else { + /* single-nibble length in MSBs */ + length = uint16(lengthByte) >> 4 + lengthByte &= 0xf + } + + offsets = append(offsets, offset) + lengths = append(lengths, length) + + offset += length + i++ + + /* read odd nibble - LSBs of lengthByte */ + if (lengthByte & 0xf0) == 0 { + /* this nibble was not consumed for a double-nibble length above */ + length = uint16(lengthByte) + if length < 12 { + /* single-nibble length in LSBs */ + offsets = append(offsets, offset) + lengths = append(lengths, length) + + offset += length + i++ + } + } else { + length = 0 /* prevent double-nibble detection in the next iteration */ + } + } + + /* now, s is at the first group string */ + return s +} + +func (names *unames) compareName(name []byte, choice NameChoice, otherName string) bool { + tokens := names.tokens + + tokenCount := tokens[0] + tokens = tokens[1:] + + otherNameLen := len(otherName) + + for len(name) > 0 && len(otherName) > 0 { + c := name[0] + name = name[1:] + + if uint16(c) >= tokenCount { + if c != ';' 
{ + if c != otherName[0] { + return false + } + otherName = otherName[1:] + } else { + break + } + } else { + token := tokens[c] + if int16(token) == -2 { + token = tokens[int(c)<<8|int(name[0])] + name = name[1:] + } + if int16(token) == -1 { + if c != ';' { + if c != otherName[0] { + return false + } + otherName = otherName[1:] + } else { + if len(otherName) == otherNameLen && choice == ExtendedCharName { + if ';' >= tokenCount || int16(tokens[';']) == -1 { + continue + } + } + break + } + } else { + tokenString := names.tokenStrings[token:] + for tokenString[0] != 0 && len(otherName) > 0 { + if tokenString[0] != otherName[0] { + return false + } + tokenString = tokenString[1:] + otherName = otherName[1:] + } + } + } + } + + return len(otherName) == 0 +} diff --git a/go/mysql/icuregex/internal/unames/unames_test.go b/go/mysql/icuregex/internal/unames/unames_test.go new file mode 100644 index 00000000000..f15353eef8d --- /dev/null +++ b/go/mysql/icuregex/internal/unames/unames_test.go @@ -0,0 +1,64 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package unames + +import ( + "testing" +) + +func TestCharForName(t *testing.T) { + var TestNames = []struct { + code rune + name, oldName, extName string + }{ + {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A"}, + {0x01a2, "LATIN CAPITAL LETTER OI", "", "LATIN CAPITAL LETTER OI"}, + {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "", "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK"}, + {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "", "TIBETAN MARK BSKA- SHOG GI MGO RGYAN"}, + {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401"}, + {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED"}, + {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA"}, + {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH"}, + {0xd800, "", "", ""}, + {0xdc00, "", "", ""}, + {0xff08, "FULLWIDTH LEFT PARENTHESIS", "", "FULLWIDTH LEFT PARENTHESIS"}, + {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN"}, + {0xffff, "", "", ""}, + {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "", "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS"}, + {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456"}, + } + + for _, tn := range TestNames { + if tn.name != "" { + r := CharForName(UnicodeCharName, tn.name) + if r != tn.code { + t.Errorf("CharFromName(U_UNICODE_CHAR_NAME, %q) = '%c' (U+%d), expected %c (U+%d)", tn.name, r, r, tn.code, tn.code) + } + } + if tn.extName != "" { + r := CharForName(ExtendedCharName, tn.extName) + if r != tn.code { + t.Errorf("CharFromName(U_EXTENDED_CHAR_NAME, %q) = '%c' (U+%d), expected %c (U+%d)", tn.extName, r, r, tn.code, tn.code) + } + } + } +} diff --git a/go/mysql/icuregex/internal/uprops/constants.go b/go/mysql/icuregex/internal/uprops/constants.go new file mode 100644 index 00000000000..3cfe250599a --- /dev/null +++ b/go/mysql/icuregex/internal/uprops/constants.go @@ -0,0 +1,613 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uprops + +type Property int32 + +const ( + /* + * Note: UProperty constants are parsed by preparseucd.py. + * It matches lines like + * UCHAR_=, + */ + + /* Note: Place UCHAR_ALPHABETIC before UCHAR_BINARY_START so that + debuggers display UCHAR_ALPHABETIC as the symbolic name for 0, + rather than UCHAR_BINARY_START. Likewise for other *_START + identifiers. */ + + /** Binary property Alphabetic. Same as u_isUAlphabetic, different from u_isalpha. + Lu+Ll+Lt+Lm+Lo+Nl+Other_Alphabetic @stable ICU 2.1 */ + UCharAlphabetic Property = 0 + /** First constant for binary Unicode properties. @stable ICU 2.1 */ + UCharBinaryStart = UCharAlphabetic + /** Binary property ASCII_Hex_Digit. 
0-9 A-F a-f @stable ICU 2.1 */ + UCharASCIIHexDigit Property = 1 + /** Binary property Bidi_Control. + Format controls which have specific functions + in the Bidi Algorithm. @stable ICU 2.1 */ + UCharBidiControl Property = 2 + /** Binary property Bidi_Mirrored. + Characters that may change display in RTL text. + Same as u_isMirrored. + See Bidi Algorithm, UTR 9. @stable ICU 2.1 */ + UCharBidiMirrored Property = 3 + /** Binary property Dash. Variations of dashes. @stable ICU 2.1 */ + UCharDash Property = 4 + /** Binary property Default_Ignorable_Code_Point (new in Unicode 3.2). + Ignorable in most processing. + <2060..206F, FFF0..FFFB, E0000..E0FFF>+Other_Default_Ignorable_Code_Point+(Cf+Cc+Cs-White_Space) @stable ICU 2.1 */ + UCharDefaultIgnorableCodePoint Property = 5 + /** Binary property Deprecated (new in Unicode 3.2). + The usage of deprecated characters is strongly discouraged. @stable ICU 2.1 */ + UCharDeprecated Property = 6 + /** Binary property Diacritic. Characters that linguistically modify + the meaning of another character to which they apply. @stable ICU 2.1 */ + UCharDiacritic Property = 7 + /** Binary property Extender. + Extend the value or shape of a preceding alphabetic character, + e.g., length and iteration marks. @stable ICU 2.1 */ + UCharExtender Property = 8 + /** Binary property Full_Composition_Exclusion. + CompositionExclusions.txt+Singleton Decompositions+ + Non-Starter Decompositions. @stable ICU 2.1 */ + UCharFullCompositionExclusion Property = 9 + /** Binary property Grapheme_Base (new in Unicode 3.2). + For programmatic determination of grapheme cluster boundaries. + [0..10FFFF]-Cc-Cf-Cs-Co-Cn-Zl-Zp-Grapheme_Link-Grapheme_Extend-CGJ @stable ICU 2.1 */ + UCharGraphemeBase Property = 10 + /** Binary property Grapheme_Extend (new in Unicode 3.2). + For programmatic determination of grapheme cluster boundaries. + Me+Mn+Mc+Other_Grapheme_Extend-Grapheme_Link-CGJ @stable ICU 2.1 */ + UCharGraphemeExtend Property = 11 + /** Binary property Grapheme_Link (new in Unicode 3.2). + For programmatic determination of grapheme cluster boundaries. @stable ICU 2.1 */ + UCharGraphemeLink Property = 12 + /** Binary property Hex_Digit. + Characters commonly used for hexadecimal numbers. @stable ICU 2.1 */ + UCharHexDigit Property = 13 + /** Binary property Hyphen. Dashes used to mark connections + between pieces of words, plus the Katakana middle dot. @stable ICU 2.1 */ + UCharHyphen Property = 14 + /** Binary property ID_Continue. + Characters that can continue an identifier. + DerivedCoreProperties.txt also says "NOTE: Cf characters should be filtered out." + ID_Start+Mn+Mc+Nd+Pc @stable ICU 2.1 */ + UCharIDContinue Property = 15 + /** Binary property ID_Start. + Characters that can start an identifier. + Lu+Ll+Lt+Lm+Lo+Nl @stable ICU 2.1 */ + UCharIDStart Property = 16 + /** Binary property Ideographic. + CJKV ideographs. @stable ICU 2.1 */ + UCharIdeographic Property = 17 + /** Binary property IDS_Binary_Operator (new in Unicode 3.2). + For programmatic determination of + Ideographic Description Sequences. @stable ICU 2.1 */ + UCharIdsBinaryOperator Property = 18 + /** Binary property IDS_Trinary_Operator (new in Unicode 3.2). + For programmatic determination of + Ideographic Description Sequences. @stable ICU 2.1 */ + UCharIdsTrinaryOperator Property = 19 + /** Binary property Join_Control. + Format controls for cursive joining and ligation. @stable ICU 2.1 */ + UCharJoinControl Property = 20 + /** Binary property Logical_Order_Exception (new in Unicode 3.2). 
+ Characters that do not use logical order and + require special handling in most processing. @stable ICU 2.1 */ + UCharLogicalOrderException Property = 21 + /** Binary property Lowercase. Same as u_isULowercase, different from u_islower. + Ll+Other_Lowercase @stable ICU 2.1 */ + UCharLowercase Property = 22 + /** Binary property Math. Sm+Other_Math @stable ICU 2.1 */ + UCharMath Property = 23 + /** Binary property Noncharacter_Code_Point. + Code points that are explicitly defined as illegal + for the encoding of characters. @stable ICU 2.1 */ + UCharNoncharacterCodePoint Property = 24 + /** Binary property Quotation_Mark. @stable ICU 2.1 */ + UCharQuotationMark Property = 25 + /** Binary property Radical (new in Unicode 3.2). + For programmatic determination of + Ideographic Description Sequences. @stable ICU 2.1 */ + UCharRadical Property = 26 + /** Binary property Soft_Dotted (new in Unicode 3.2). + Characters with a "soft dot", like i or j. + An accent placed on these characters causes + the dot to disappear. @stable ICU 2.1 */ + UCharSoftDotted Property = 27 + /** Binary property Terminal_Punctuation. + Punctuation characters that generally mark + the end of textual units. @stable ICU 2.1 */ + UCharTerminalPunctuation Property = 28 + /** Binary property Unified_Ideograph (new in Unicode 3.2). + For programmatic determination of + Ideographic Description Sequences. @stable ICU 2.1 */ + UCharUnifiedIdeograph Property = 29 + /** Binary property Uppercase. Same as u_isUUppercase, different from u_isupper. + Lu+Other_Uppercase @stable ICU 2.1 */ + UCharUppercase Property = 30 + /** Binary property White_Space. + Same as u_isUWhiteSpace, different from u_isspace and u_isWhitespace. + Space characters+TAB+CR+LF-ZWSP-ZWNBSP @stable ICU 2.1 */ + UCharWhiteSpace Property = 31 + /** Binary property XID_Continue. + ID_Continue modified to allow closure under + normalization forms NFKC and NFKD. @stable ICU 2.1 */ + UCharXidContinue Property = 32 + /** Binary property XID_Start. ID_Start modified to allow + closure under normalization forms NFKC and NFKD. @stable ICU 2.1 */ + UCharXidStart Property = 33 + /** Binary property Case_Sensitive. Either the source of a case + mapping or _in_ the target of a case mapping. Not the same as + the general category Cased_Letter. @stable ICU 2.6 */ + UCharCaseSensitive Property = 34 + /** Binary property STerm (new in Unicode 4.0.1). + Sentence Terminal. Used in UAX #29: Text Boundaries + (http://www.unicode.org/reports/tr29/) + @stable ICU 3.0 */ + UCharSTerm Property = 35 + /** Binary property Variation_Selector (new in Unicode 4.0.1). + Indicates all those characters that qualify as Variation Selectors. + For details on the behavior of these characters, + see StandardizedVariants.html and 15.6 Variation Selectors. + @stable ICU 3.0 */ + UCharVariationSelector Property = 36 + /** Binary property NFD_Inert. + ICU-specific property for characters that are inert under NFD, + i.e., they do not interact with adjacent characters. + See the documentation for the Normalizer2 class and the + Normalizer2::isInert() method. + @stable ICU 3.0 */ + UCharNfdInert Property = 37 + /** Binary property NFKD_Inert. + ICU-specific property for characters that are inert under NFKD, + i.e., they do not interact with adjacent characters. + See the documentation for the Normalizer2 class and the + Normalizer2::isInert() method. + @stable ICU 3.0 */ + UCharNfkdInert Property = 38 + /** Binary property NFC_Inert. 
+ ICU-specific property for characters that are inert under NFC, + i.e., they do not interact with adjacent characters. + See the documentation for the Normalizer2 class and the + Normalizer2::isInert() method. + @stable ICU 3.0 */ + UCharNfcInert Property = 39 + /** Binary property NFKC_Inert. + ICU-specific property for characters that are inert under NFKC, + i.e., they do not interact with adjacent characters. + See the documentation for the Normalizer2 class and the + Normalizer2::isInert() method. + @stable ICU 3.0 */ + UCharNfkcInert Property = 40 + /** Binary Property Segment_Starter. + ICU-specific property for characters that are starters in terms of + Unicode normalization and combining character sequences. + They have ccc=0 and do not occur in non-initial position of the + canonical decomposition of any character + (like a-umlaut in NFD and a Jamo T in an NFD(Hangul LVT)). + ICU uses this property for segmenting a string for generating a set of + canonically equivalent strings, e.g. for canonical closure while + processing collation tailoring rules. + @stable ICU 3.0 */ + UCharSegmentStarter Property = 41 + /** Binary property Pattern_Syntax (new in Unicode 4.1). + See UAX #31 Identifier and Pattern Syntax + (http://www.unicode.org/reports/tr31/) + @stable ICU 3.4 */ + UCharPatternSyntax Property = 42 + /** Binary property Pattern_White_Space (new in Unicode 4.1). + See UAX #31 Identifier and Pattern Syntax + (http://www.unicode.org/reports/tr31/) + @stable ICU 3.4 */ + UCharPatternWhiteSpace Property = 43 + /** Binary property alnum (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCharPosixAlnum Property = 44 + /** Binary property blank (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCharPosixBlank Property = 45 + /** Binary property graph (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCharPosixGraph Property = 46 + /** Binary property print (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCharPosixPrint Property = 47 + /** Binary property xdigit (a C/POSIX character class). + Implemented according to the UTS #18 Annex C Standard Recommendation. + See the uchar.h file documentation. + @stable ICU 3.4 */ + UCharPosixXdigit Property = 48 + /** Binary property Cased. For Lowercase, Uppercase and Titlecase characters. @stable ICU 4.4 */ + UCharCased Property = 49 + /** Binary property Case_Ignorable. Used in context-sensitive case mappings. @stable ICU 4.4 */ + UCharCaseIgnorable Property = 50 + /** Binary property Changes_When_Lowercased. @stable ICU 4.4 */ + UCharChangesWhenLowercased Property = 51 + /** Binary property Changes_When_Uppercased. @stable ICU 4.4 */ + UCharChangesWhenUppercased Property = 52 + /** Binary property Changes_When_Titlecased. @stable ICU 4.4 */ + UCharChangesWhenTitlecased Property = 53 + /** Binary property Changes_When_Casefolded. @stable ICU 4.4 */ + UCharChangesWhenCasefolded Property = 54 + /** Binary property Changes_When_Casemapped. @stable ICU 4.4 */ + UCharChangesWhenCasemapped Property = 55 + /** Binary property Changes_When_NFKC_Casefolded. 
@stable ICU 4.4 */ + UCharChangesWhenNfkcCasefolded Property = 56 + /** + * Binary property Emoji. + * See http://www.unicode.org/reports/tr51/#Emoji_Properties + * + * @stable ICU 57 + */ + UCharEmoji Property = 57 + /** + * Binary property Emoji_Presentation. + * See http://www.unicode.org/reports/tr51/#Emoji_Properties + * + * @stable ICU 57 + */ + UCharEmojiPresentation Property = 58 + /** + * Binary property Emoji_Modifier. + * See http://www.unicode.org/reports/tr51/#Emoji_Properties + * + * @stable ICU 57 + */ + UCharEmojiModifier Property = 59 + /** + * Binary property Emoji_Modifier_Base. + * See http://www.unicode.org/reports/tr51/#Emoji_Properties + * + * @stable ICU 57 + */ + UCharEmojiModifierBase Property = 60 + /** + * Binary property Emoji_Component. + * See http://www.unicode.org/reports/tr51/#Emoji_Properties + * + * @stable ICU 60 + */ + UCharEmojiComponent Property = 61 + /** + * Binary property Regional_Indicator. + * @stable ICU 60 + */ + UCharRegionalIndicator Property = 62 + /** + * Binary property Prepended_Concatenation_Mark. + * @stable ICU 60 + */ + UCharPrependedConcatenationMark Property = 63 + /** + * Binary property Extended_Pictographic. + * See http://www.unicode.org/reports/tr51/#Emoji_Properties + * + * @stable ICU 62 + */ + UCharExtendedPictographic Property = 64 + + /** Enumerated property Bidi_Class. + Same as u_charDirection, returns UCharDirection values. @stable ICU 2.2 */ + UCharBidiClass Property = 0x1000 + /** First constant for enumerated/integer Unicode properties. @stable ICU 2.2 */ + UCharIntStart = UCharBidiClass + /** Enumerated property Block. + Same as ublock_getCode, returns UBlockCode values. @stable ICU 2.2 */ + UCharBlock Property = 0x1001 + /** Enumerated property Canonical_Combining_Class. + Same as u_getCombiningClass, returns 8-bit numeric values. @stable ICU 2.2 */ + UCharCanonicalCombiningClass Property = 0x1002 + /** Enumerated property Decomposition_Type. + Returns UDecompositionType values. @stable ICU 2.2 */ + UCharDecompositionType Property = 0x1003 + /** Enumerated property East_Asian_Width. + See http://www.unicode.org/reports/tr11/ + Returns UEastAsianWidth values. @stable ICU 2.2 */ + UCharEastAsianWidth Property = 0x1004 + /** Enumerated property General_Category. + Same as u_charType, returns UCharCategory values. @stable ICU 2.2 */ + UCharGeneralCategory Property = 0x1005 + /** Enumerated property Joining_Group. + Returns UJoiningGroup values. @stable ICU 2.2 */ + UCharJoiningGroup Property = 0x1006 + /** Enumerated property Joining_Type. + Returns UJoiningType values. @stable ICU 2.2 */ + UCharJoiningType Property = 0x1007 + /** Enumerated property Line_Break. + Returns ULineBreak values. @stable ICU 2.2 */ + UCharLineBreak Property = 0x1008 + /** Enumerated property Numeric_Type. + Returns UNumericType values. @stable ICU 2.2 */ + UCharNumericType Property = 0x1009 + /** Enumerated property Script. + Same as uscript_getScript, returns UScriptCode values. @stable ICU 2.2 */ + UCharScript Property = 0x100A + /** Enumerated property Hangul_Syllable_Type, new in Unicode 4. + Returns UHangulSyllableType values. @stable ICU 2.6 */ + UCharHangulSyllableType Property = 0x100B + /** Enumerated property NFD_Quick_Check. + Returns UNormalizationCheckResult values. @stable ICU 3.0 */ + UCharNfdQuickCheck Property = 0x100C + /** Enumerated property NFKD_Quick_Check. + Returns UNormalizationCheckResult values. @stable ICU 3.0 */ + UCharNfkdQuickCheck Property = 0x100D + /** Enumerated property NFC_Quick_Check. 
+ Returns UNormalizationCheckResult values. @stable ICU 3.0 */ + UCharNfcQuickCheck Property = 0x100E + /** Enumerated property NFKC_Quick_Check. + Returns UNormalizationCheckResult values. @stable ICU 3.0 */ + UCharNfkcQuickCheck Property = 0x100F + /** Enumerated property Lead_Canonical_Combining_Class. + ICU-specific property for the ccc of the first code point + of the decomposition, or lccc(c)=ccc(NFD(c)[0]). + Useful for checking for canonically ordered text; + see UNORM_FCD and http://www.unicode.org/notes/tn5/#FCD . + Returns 8-bit numeric values like UCHAR_CANONICAL_COMBINING_CLASS. @stable ICU 3.0 */ + UCharLeadCanonicalCombiningClass Property = 0x1010 + /** Enumerated property Trail_Canonical_Combining_Class. + ICU-specific property for the ccc of the last code point + of the decomposition, or tccc(c)=ccc(NFD(c)[last]). + Useful for checking for canonically ordered text; + see UNORM_FCD and http://www.unicode.org/notes/tn5/#FCD . + Returns 8-bit numeric values like UCHAR_CANONICAL_COMBINING_CLASS. @stable ICU 3.0 */ + UCharTrailCanonicalCombiningClass Property = 0x1011 + /** Enumerated property Grapheme_Cluster_Break (new in Unicode 4.1). + Used in UAX #29: Text Boundaries + (http://www.unicode.org/reports/tr29/) + Returns UGraphemeClusterBreak values. @stable ICU 3.4 */ + UCharGraphemeClusterBreak Property = 0x1012 + /** Enumerated property Sentence_Break (new in Unicode 4.1). + Used in UAX #29: Text Boundaries + (http://www.unicode.org/reports/tr29/) + Returns USentenceBreak values. @stable ICU 3.4 */ + UCharSentenceBreak Property = 0x1013 + /** Enumerated property Word_Break (new in Unicode 4.1). + Used in UAX #29: Text Boundaries + (http://www.unicode.org/reports/tr29/) + Returns UWordBreakValues values. @stable ICU 3.4 */ + UCharWordBreak Property = 0x1014 + /** Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). + Used in UAX #9: Unicode Bidirectional Algorithm + (http://www.unicode.org/reports/tr9/) + Returns UBidiPairedBracketType values. @stable ICU 52 */ + UCharBidiPairedBracketType Property = 0x1015 + /** + * Enumerated property Indic_Positional_Category. + * New in Unicode 6.0 as provisional property Indic_Matra_Category; + * renamed and changed to informative in Unicode 8.0. + * See http://www.unicode.org/reports/tr44/#IndicPositionalCategory.txt + * @stable ICU 63 + */ + UCharIndicPositionalCategory Property = 0x1016 + /** + * Enumerated property Indic_Syllabic_Category. + * New in Unicode 6.0 as provisional; informative since Unicode 8.0. + * See http://www.unicode.org/reports/tr44/#IndicSyllabicCategory.txt + * @stable ICU 63 + */ + UCharIndicSyllableCategory Property = 0x1017 + /** + * Enumerated property Vertical_Orientation. + * Used for UAX #50 Unicode Vertical Text Layout (https://www.unicode.org/reports/tr50/). + * New as a UCD property in Unicode 10.0. + * @stable ICU 63 + */ + UCharVerticalOrientation Property = 0x1018 + + /** Bitmask property General_Category_Mask. + This is the General_Category property returned as a bit mask. + When used in u_getIntPropertyValue(c), same as U_MASK(u_charType(c)), + returns bit masks for UCharCategory values where exactly one bit is set. + When used with u_getPropertyValueName() and u_getPropertyValueEnum(), + a multi-bit mask is used for sets of categories like "Letters". + Mask values should be cast to uint32_t. + @stable ICU 2.4 */ + UCharGeneralCategoryMask Property = 0x2000 + /** First constant for bit-mask Unicode properties. 
@stable ICU 2.4 */ + UCharMaskStart = UCharGeneralCategoryMask + /** Double property Numeric_Value. + Corresponds to u_getNumericValue. @stable ICU 2.4 */ + UCharNumericValue Property = 0x3000 + /** First constant for double Unicode properties. @stable ICU 2.4 */ + UCharDoubleStart = UCharNumericValue + /** String property Age. + Corresponds to u_charAge. @stable ICU 2.4 */ + UCharAge Property = 0x4000 + /** First constant for string Unicode properties. @stable ICU 2.4 */ + UCharStringStart = UCharAge + /** String property Bidi_Mirroring_Glyph. + Corresponds to u_charMirror. @stable ICU 2.4 */ + UCharBidiMirroringGlyph Property = 0x4001 + /** String property Case_Folding. + Corresponds to u_strFoldCase in ustring.h. @stable ICU 2.4 */ + UCharCaseFolding Property = 0x4002 + /** String property Lowercase_Mapping. + Corresponds to u_strToLower in ustring.h. @stable ICU 2.4 */ + UCharLowercaseMapping Property = 0x4004 + /** String property Name. + Corresponds to u_charName. @stable ICU 2.4 */ + UCharName Property = 0x4005 + /** String property Simple_Case_Folding. + Corresponds to u_foldCase. @stable ICU 2.4 */ + UCharSimpleCaseFolding Property = 0x4006 + /** String property Simple_Lowercase_Mapping. + Corresponds to u_tolower. @stable ICU 2.4 */ + UCharSimpleLowercaseMapping Property = 0x4007 + /** String property Simple_Titlecase_Mapping. + Corresponds to u_totitle. @stable ICU 2.4 */ + UcharSimpleTitlecaseMapping Property = 0x4008 + /** String property Simple_Uppercase_Mapping. + Corresponds to u_toupper. @stable ICU 2.4 */ + UCharSimpleUppercaseMapping Property = 0x4009 + /** String property Titlecase_Mapping. + Corresponds to u_strToTitle in ustring.h. @stable ICU 2.4 */ + UCharTitlecaseMapping Property = 0x400A + /** String property Uppercase_Mapping. + Corresponds to u_strToUpper in ustring.h. @stable ICU 2.4 */ + UCharUppercaseMapping Property = 0x400C + /** String property Bidi_Paired_Bracket (new in Unicode 6.3). + Corresponds to u_getBidiPairedBracket. @stable ICU 52 */ + UCharBidiPairedBracket Property = 0x400D + + /** Miscellaneous property Script_Extensions (new in Unicode 6.0). + Some characters are commonly used in multiple scripts. + For more information, see UAX #24: http://www.unicode.org/reports/tr24/. + Corresponds to uscript_hasScript and uscript_getScriptExtensions in uscript.h. + @stable ICU 4.6 */ + UCharScriptExtensions Property = 0x7000 + /** First constant for Unicode properties with unusual value types. @stable ICU 4.6 */ + UCharOtherPropertyStart = UCharScriptExtensions + + /** Represents a nonexistent or invalid property or property value. @stable ICU 2.4 */ + UCharInvalidCode Property = -1 +) + +const ( + uCharBinaryLimit = 65 + uCharIntLimit = 0x1019 + uCharMaskLimit = 0x2001 + uCharStringLimit = 0x400E +) + +/* + * Properties in vector word 1 + * Each bit encodes one binary property. 
+ * The following constants represent the bit number, use 1<= 0 { + set.AddRuneRange(startHasProperty, c-1) + startHasProperty = -1 + } + } + } + if startHasProperty >= 0 { + set.AddRuneRange(startHasProperty, uset.MaxValue) + } + + inclusionsForProperty[prop] = set + return set, nil +} + +func getInclusionsForIntProperty(prop Property) (*uset.UnicodeSet, error) { + if inc, ok := inclusionsForProperty[prop]; ok { + return inc, nil + } + + src := prop.source() + incl, err := getInclusionsForSource(src) + if err != nil { + return nil, err + } + + intPropIncl := uset.New() + intPropIncl.AddRune(0) + + numRanges := incl.RangeCount() + prevValue := int32(0) + + for i := 0; i < numRanges; i++ { + rangeEnd := incl.RangeEnd(i) + for c := incl.RangeStart(i); c <= rangeEnd; c++ { + value := getIntPropertyValue(c, prop) + if value != prevValue { + intPropIncl.AddRune(c) + prevValue = value + } + } + } + + inclusionsForProperty[prop] = intPropIncl + return intPropIncl, nil +} + +func ApplyIntPropertyValue(u *uset.UnicodeSet, prop Property, value int32) error { + switch { + case prop == UCharGeneralCategoryMask: + inclusions, err := getInclusionsForProperty(prop) + if err != nil { + return err + } + u.ApplyFilter(inclusions, func(ch rune) bool { + return (uMask(uchar.CharType(ch)) & uint32(value)) != 0 + }) + case prop == UCharScriptExtensions: + inclusions, err := getInclusionsForProperty(prop) + if err != nil { + return err + } + u.ApplyFilter(inclusions, func(ch rune) bool { + return uscriptHasScript(ch, code(value)) + }) + case 0 <= prop && prop < uCharBinaryLimit: + if value == 0 || value == 1 { + set, err := getInclusionsForBinaryProperty(prop) + if err != nil { + return err + } + u.CopyFrom(set) + if value == 0 { + u.Complement() + } + } else { + u.Clear() + } + + case UCharIntStart <= prop && prop < uCharIntLimit: + inclusions, err := getInclusionsForProperty(prop) + if err != nil { + return err + } + u.ApplyFilter(inclusions, func(ch rune) bool { + return getIntPropertyValue(ch, prop) == value + }) + default: + return errors.ErrUnsupported + } + return nil +} + +func mungeCharName(charname string) string { + out := make([]byte, 0, len(charname)) + for _, ch := range []byte(charname) { + j := len(out) + if ch == ' ' && (j == 0 || out[j-1] == ' ') { + continue + } + out = append(out, ch) + } + return string(out) +} + +func ApplyPropertyPattern(u *uset.UnicodeSet, pat string) error { + if len(pat) < 5 { + return errors.ErrIllegalArgument + } + + var posix, isName, invert bool + + if isPOSIXOpen(pat) { + posix = true + pat = pattern.SkipWhitespace(pat[2:]) + if len(pat) > 0 && pat[0] == '^' { + pat = pat[1:] + invert = true + } + } else if isPerlOpen(pat) || isNameOpen(pat) { + c := pat[1] + invert = c == 'P' + isName = c == 'N' + pat = pattern.SkipWhitespace(pat[2:]) + if len(pat) == 0 || pat[0] != '{' { + return errors.ErrIllegalArgument + } + pat = pat[1:] + } else { + return errors.ErrIllegalArgument + } + + var closePos int + if posix { + closePos = strings.Index(pat, ":]") + } else { + closePos = strings.IndexByte(pat, '}') + } + if closePos < 0 { + return errors.ErrIllegalArgument + } + + equals := strings.IndexByte(pat, '=') + var propName, valueName string + if equals >= 0 && equals < closePos && !isName { + propName = pat[:equals] + valueName = pat[equals+1 : closePos] + } else { + propName = pat[:closePos] + if isName { + valueName = propName + propName = "na" + } + } + + if err := ApplyPropertyAlias(u, propName, valueName); err != nil { + return err + } + if invert { + u.Complement() + 
} + return nil +} + +func isPOSIXOpen(pattern string) bool { + return pattern[0] == '[' && pattern[1] == ':' +} + +func isNameOpen(pattern string) bool { + return pattern[0] == '\\' && pattern[1] == 'N' +} + +func isPerlOpen(pattern string) bool { + return pattern[0] == '\\' && (pattern[1] == 'p' || pattern[1] == 'P') +} + +func ApplyPropertyAlias(u *uset.UnicodeSet, prop, value string) error { + var p Property + var v int32 + var invert bool + + if len(value) > 0 { + p = getPropertyEnum(prop) + if p == -1 { + return errors.ErrIllegalArgument + } + if p == UCharGeneralCategory { + p = UCharGeneralCategoryMask + } + + if (p >= UCharBinaryStart && p < uCharBinaryLimit) || + (p >= UCharIntStart && p < uCharIntLimit) || + (p >= UCharMaskStart && p < uCharMaskLimit) { + v = getPropertyValueEnum(p, value) + if v == -1 { + // Handle numeric CCC + if p == UCharCanonicalCombiningClass || + p == UCharTrailCanonicalCombiningClass || + p == UCharLeadCanonicalCombiningClass { + val, err := strconv.ParseUint(value, 10, 8) + if err != nil { + return errors.ErrIllegalArgument + } + v = int32(val) + } else { + return errors.ErrIllegalArgument + } + } + } else { + switch p { + case UCharNumericValue: + val, err := strconv.ParseFloat(value, 64) + if err != nil { + return errors.ErrIllegalArgument + } + incl, err := getInclusionsForProperty(p) + if err != nil { + return err + } + u.ApplyFilter(incl, func(ch rune) bool { + return uchar.NumericValue(ch) == val + }) + return nil + case UCharName: + // Must munge name, since u_charFromName() does not do + // 'loose' matching. + charName := mungeCharName(value) + ch := unames.CharForName(unames.ExtendedCharName, charName) + if ch < 0 { + return errors.ErrIllegalArgument + } + u.Clear() + u.AddRune(ch) + return nil + case UCharAge: + // Must munge name, since u_versionFromString() does not do + // 'loose' matching. + charName := mungeCharName(value) + version := uchar.VersionFromString(charName) + incl, err := getInclusionsForProperty(p) + if err != nil { + return err + } + u.ApplyFilter(incl, func(ch rune) bool { + return uchar.CharAge(ch) == version + }) + return nil + case UCharScriptExtensions: + v = getPropertyValueEnum(UCharScript, value) + if v == -1 { + return errors.ErrIllegalArgument + } + default: + // p is a non-binary, non-enumerated property that we + // don't support (yet). + return errors.ErrIllegalArgument + } + } + } else { + // value is empty. Interpret as General Category, Script, or + // Binary property. 
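+		// Illustrative examples of how an empty value is resolved: "Lu" is
+		// treated as a General_Category mask, "Greek" as a Script, and
+		// "Alphabetic" as a binary property.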
+ p = UCharGeneralCategoryMask + v = getPropertyValueEnum(p, prop) + if v == -1 { + p = UCharScript + v = getPropertyValueEnum(p, prop) + if v == -1 { + p = getPropertyEnum(prop) + if p >= UCharBinaryStart && p < uCharBinaryLimit { + v = 1 + } else if 0 == comparePropertyNames("ANY", prop) { + u.Clear() + u.AddRuneRange(uset.MinValue, uset.MaxValue) + return nil + } else if 0 == comparePropertyNames("ASCII", prop) { + u.Clear() + u.AddRuneRange(0, 0x7F) + return nil + } else if 0 == comparePropertyNames("Assigned", prop) { + // [:Assigned:]=[:^Cn:] + p = UCharGeneralCategoryMask + v = int32(uchar.GcCnMask) + invert = true + } else { + return errors.ErrIllegalArgument + } + } + } + } + + err := ApplyIntPropertyValue(u, p, v) + if err != nil { + return err + } + if invert { + u.Complement() + } + return nil +} + +func AddULayoutPropertyStarts(src propertySource, u *uset.UnicodeSet) { + var trie *utrie.UcpTrie + switch src { + case srcInpc: + trie = ulayout.InpcTrie() + case srcInsc: + trie = ulayout.InscTrie() + case srcVo: + trie = ulayout.VoTrie() + default: + panic("unreachable") + } + + // Add the start code point of each same-value range of the trie. + var start, end rune + for { + end, _ = trie.GetRange(start, utrie.UcpMapRangeNormal, 0, nil) + if end < 0 { + break + } + u.AddRune(start) + start = end + 1 + } +} + +func AddCategory(u *uset.UnicodeSet, mask uint32) error { + set := uset.New() + err := ApplyIntPropertyValue(set, UCharGeneralCategoryMask, int32(mask)) + if err != nil { + return err + } + u.AddAll(set) + return nil +} + +func NewUnicodeSetFomPattern(pattern string, flags uset.USet) (*uset.UnicodeSet, error) { + u := uset.New() + if err := ApplyPropertyPattern(u, pattern); err != nil { + return nil, err + } + if flags&uset.CaseInsensitive != 0 { + u.CloseOver(uset.CaseInsensitive) + } + return u, nil +} + +func MustNewUnicodeSetFomPattern(pattern string, flags uset.USet) *uset.UnicodeSet { + u, err := NewUnicodeSetFomPattern(pattern, flags) + if err != nil { + panic(err) + } + return u +} diff --git a/go/mysql/icuregex/internal/uprops/uprops.go b/go/mysql/icuregex/internal/uprops/uprops.go new file mode 100644 index 00000000000..ddf0989b5d8 --- /dev/null +++ b/go/mysql/icuregex/internal/uprops/uprops.go @@ -0,0 +1,269 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package uprops + +import ( + "fmt" + + "vitess.io/vitess/go/mysql/icuregex/internal/bytestrie" + "vitess.io/vitess/go/mysql/icuregex/internal/icudata" + "vitess.io/vitess/go/mysql/icuregex/internal/uchar" + "vitess.io/vitess/go/mysql/icuregex/internal/udata" +) + +var pnames struct { + valueMaps []uint32 + byteTrie []uint8 +} + +const ( + ixValueMapsOffset = 0 + ixByteTriesOffset = 1 + ixNameGroupsOffset = 2 + ixReserved3Offset = 3 +) + +func readData(bytes *udata.Bytes) error { + err := bytes.ReadHeader(func(info *udata.DataInfo) bool { + return info.DataFormat[0] == 0x70 && + info.DataFormat[1] == 0x6e && + info.DataFormat[2] == 0x61 && + info.DataFormat[3] == 0x6d && + info.FormatVersion[0] == 2 + }) + if err != nil { + return err + } + + count := bytes.Int32() / 4 + if count < 8 { + return fmt.Errorf("indexes[0] too small in ucase.icu") + } + + indexes := make([]int32, count) + indexes[0] = count * 4 + + for i := int32(1); i < count; i++ { + indexes[i] = bytes.Int32() + } + + offset := indexes[ixValueMapsOffset] + nextOffset := indexes[ixByteTriesOffset] + numInts := (nextOffset - offset) / 4 + + pnames.valueMaps = bytes.Uint32Slice(numInts) + + offset = nextOffset + nextOffset = indexes[ixNameGroupsOffset] + numBytes := nextOffset - offset + + pnames.byteTrie = bytes.Uint8Slice(numBytes) + return nil +} + +func init() { + b := udata.NewBytes(icudata.PNames) + if err := readData(b); err != nil { + panic(err) + } +} + +func (prop Property) source() propertySource { + if prop < UCharBinaryStart { + return srcNone /* undefined */ + } else if prop < uCharBinaryLimit { + bprop := binProps[prop] + if bprop.mask != 0 { + return srcPropsvec + } + return bprop.column + } else if prop < UCharIntStart { + return srcNone /* undefined */ + } else if prop < uCharIntLimit { + iprop := intProps[prop-UCharIntStart] + if iprop.mask != 0 { + return srcPropsvec + } + return iprop.column + } else if prop < UCharStringStart { + switch prop { + case UCharGeneralCategoryMask, + UCharNumericValue: + return srcChar + + default: + return srcNone + } + } else if prop < uCharStringLimit { + switch prop { + case UCharAge: + return srcPropsvec + + case UCharBidiMirroringGlyph: + return srcBidi + + case UCharCaseFolding, + UCharLowercaseMapping, + UCharSimpleCaseFolding, + UCharSimpleLowercaseMapping, + UcharSimpleTitlecaseMapping, + UCharSimpleUppercaseMapping, + UCharTitlecaseMapping, + UCharUppercaseMapping: + return srcCase + + /* UCHAR_ISO_COMMENT, UCHAR_UNICODE_1_NAME (deprecated) */ + case UCharName: + return srcNames + + default: + return srcNone + } + } else { + switch prop { + case UCharScriptExtensions: + return srcPropsvec + default: + return srcNone /* undefined */ + } + } +} + +func getPropertyEnum(alias string) Property { + return Property(getPropertyOrValueEnum(0, alias)) +} + +func getPropertyValueEnum(prop Property, alias string) int32 { + valueMapIdx := findProperty(prop) + if valueMapIdx == 0 { + return -1 + } + + valueMapIdx = int32(pnames.valueMaps[valueMapIdx+1]) + if valueMapIdx == 0 { + return -1 + } + // valueMapIndex is the start of the property's valueMap, + // where the first word is the BytesTrie offset. 
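+	//
+	// Illustrative usage, assuming standard Unicode value aliases:
+	//
+	//	getPropertyValueEnum(UCharScript, "Greek")           // -> the Script code for Greek
+	//	getPropertyValueEnum(UCharGeneralCategoryMask, "Lu") // -> the General_Category mask for Lu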
+ return getPropertyOrValueEnum(int32(pnames.valueMaps[valueMapIdx]), alias) +} + +func findProperty(prop Property) int32 { + var i = int32(1) + for numRanges := int32(pnames.valueMaps[0]); numRanges > 0; numRanges-- { + start := int32(pnames.valueMaps[i]) + limit := int32(pnames.valueMaps[i+1]) + i += 2 + if int32(prop) < start { + break + } + if int32(prop) < limit { + return i + (int32(prop)-start)*2 + } + i += (limit - start) * 2 + } + return 0 +} + +func getPropertyOrValueEnum(offset int32, alias string) int32 { + trie := bytestrie.New(pnames.byteTrie[offset:]) + if trie.ContainsName(alias) { + return trie.GetValue() + } + return -1 +} + +func comparePropertyNames(name1, name2 string) int { + next := func(s string) (byte, string) { + for len(s) > 0 && (s[0] == 0x2d || s[0] == 0x5f || s[0] == 0x20 || (0x09 <= s[0] && s[0] <= 0x0d)) { + s = s[1:] + } + if len(s) == 0 { + return 0, "" + } + c := s[0] + s = s[1:] + if 'A' <= c && c <= 'Z' { + c += 'a' - 'A' + } + return c, s + } + + var r1, r2 byte + for { + r1, name1 = next(name1) + r2, name2 = next(name2) + + if r1 == 0 && r2 == 0 { + return 0 + } + + /* Compare the lowercased characters */ + if r1 != r2 { + return int(r1) - int(r2) + } + } +} + +func getIntPropertyValue(c rune, which Property) int32 { + if which < UCharIntStart { + if UCharBinaryStart <= which && which < uCharBinaryLimit { + prop := binProps[which] + if prop.contains == nil { + return 0 + } + if prop.contains(prop, c, which) { + return 1 + } + return 0 + } + } else if which < uCharIntLimit { + iprop := intProps[which-UCharIntStart] + return iprop.getValue(iprop, c, which) + } else if which == UCharGeneralCategoryMask { + return int32(uMask(uchar.CharType(c))) + } + return 0 // undefined +} + +func mergeScriptCodeOrIndex(scriptX uint32) uint32 { + return ((scriptX & scriptHighMask) >> scriptHighShift) | + (scriptX & scriptLowMask) +} + +func script(c rune) int32 { + if c > 0x10ffff { + return -1 + } + scriptX := uchar.GetUnicodeProperties(c, 0) & scriptXMask + codeOrIndex := mergeScriptCodeOrIndex(scriptX) + + if scriptX < scriptXWithCommon { + return int32(codeOrIndex) + } else if scriptX < scriptXWithInherited { + return 0 + } else if scriptX < scriptXWithOther { + return 1 + } else { + return int32(uchar.ScriptExtension(codeOrIndex)) + } +} diff --git a/go/mysql/icuregex/internal/uprops/uprops_binary.go b/go/mysql/icuregex/internal/uprops/uprops_binary.go new file mode 100644 index 00000000000..855da92b3b6 --- /dev/null +++ b/go/mysql/icuregex/internal/uprops/uprops_binary.go @@ -0,0 +1,239 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package uprops + +import ( + "golang.org/x/exp/constraints" + "golang.org/x/exp/slices" + + "vitess.io/vitess/go/mysql/icuregex/internal/normalizer" + "vitess.io/vitess/go/mysql/icuregex/internal/ubidi" + "vitess.io/vitess/go/mysql/icuregex/internal/ucase" + "vitess.io/vitess/go/mysql/icuregex/internal/uchar" +) + +type binaryProperty struct { + column propertySource + mask uint32 + contains func(prop *binaryProperty, c rune, which Property) bool +} + +func uMask[T constraints.Integer](x T) uint32 { + return 1 << x +} + +func defaultContains(prop *binaryProperty, c rune, _ Property) bool { + return (uchar.GetUnicodeProperties(c, int(prop.column)) & prop.mask) != 0 +} + +var binProps = [uCharBinaryLimit]*binaryProperty{ + /* + * column and mask values for binary properties from u_getUnicodeProperties(). + * Must be in order of corresponding UProperty, + * and there must be exactly one entry per binary UProperty. + * + * Properties with mask==0 are handled in code. + * For them, column is the UPropertySource value. + * + * See also https://unicode-org.github.io/icu/userguide/strings/properties.html + */ + {1, uMask(pAlphabetic), defaultContains}, + {1, uMask(pASCIIHexDigit), defaultContains}, + {srcBidi, 0, isBidiControl}, + {srcBidi, 0, isMirrored}, + {1, uMask(pDash), defaultContains}, + {1, uMask(pDefaultIgnorableCodePoint), defaultContains}, + {1, uMask(pDeprecated), defaultContains}, + {1, uMask(pDiacritic), defaultContains}, + {1, uMask(pExtender), defaultContains}, + {srcNfc, 0, hasFullCompositionExclusion}, + {1, uMask(pGraphemeBase), defaultContains}, + {1, uMask(pGraphemeExtend), defaultContains}, + {1, uMask(pGraphemeLink), defaultContains}, + {1, uMask(pHexDigit), defaultContains}, + {1, uMask(pHyphen), defaultContains}, + {1, uMask(pIDContinue), defaultContains}, + {1, uMask(pIDStart), defaultContains}, + {1, uMask(pIdeographic), defaultContains}, + {1, uMask(pIdsBinaryOperator), defaultContains}, + {1, uMask(pIdsTrinaryOperator), defaultContains}, + {srcBidi, 0, isJoinControl}, + {1, uMask(pLogicalOrderException), defaultContains}, + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_LOWERCASE + {1, uMask(pMath), defaultContains}, + {1, uMask(pNoncharacterCodePoint), defaultContains}, + {1, uMask(pQuotationMark), defaultContains}, + {1, uMask(pRadical), defaultContains}, + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_SOFT_DOTTED + {1, uMask(pTerminalPunctuation), defaultContains}, + {1, uMask(pUnifiedIdeograph), defaultContains}, + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_UPPERCASE + {1, uMask(pWhiteSpace), defaultContains}, + {1, uMask(pXidContinue), defaultContains}, + {1, uMask(pXidStart), defaultContains}, + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_CASE_SENSITIVE + {1, uMask(pSTerm), defaultContains}, + {1, uMask(pVariationSelector), defaultContains}, + {srcNfc, 0, isNormInert}, // UCHAR_NFD_INERT + {srcNfkc, 0, isNormInert}, // UCHAR_NFKD_INERT + {srcNfc, 0, isNormInert}, // UCHAR_NFC_INERT + {srcNfkc, 0, isNormInert}, // UCHAR_NFKC_INERT + {srcNfcCanonIter, 0, nil}, // Segment_Starter is currently unsupported + {1, uMask(pPatternSyntax), defaultContains}, + {1, uMask(pPatternWhiteSpace), defaultContains}, + {srcCharAndPropsvec, 0, isPOSIXAlnum}, + {srcChar, 0, isPOSIXBlank}, + {srcChar, 0, isPOSIXGraph}, + {srcChar, 0, isPOSIXPrint}, + {srcChar, 0, isPOSIXXdigit}, + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_CASED + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_CASE_IGNORABLE + {srcCase, 0, caseBinaryPropertyContains}, // 
UCHAR_CHANGES_WHEN_LOWERCASED + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_UPPERCASED + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_TITLECASED + {srcCaseAndNorm, 0, changesWhenCasefolded}, + {srcCase, 0, caseBinaryPropertyContains}, // UCHAR_CHANGES_WHEN_CASEMAPPED + {srcNfkcCf, 0, nil}, // Changes_When_NFKC_Casefolded is currently unsupported + {2, uMask(p2Emoji), defaultContains}, + {2, uMask(p2EmojiPresentation), defaultContains}, + {2, uMask(p2EmojiModifier), defaultContains}, + {2, uMask(p2EmojiModifierBase), defaultContains}, + {2, uMask(p2EmojiComponent), defaultContains}, + {2, 0, isRegionalIndicator}, + {1, uMask(pPrependedConcatenationMark), defaultContains}, + {2, uMask(p2ExtendedPictographic), defaultContains}, +} + +func isBidiControl(_ *binaryProperty, c rune, _ Property) bool { + return ubidi.IsBidiControl(c) +} + +func isMirrored(_ *binaryProperty, c rune, _ Property) bool { + return ubidi.IsMirrored(c) +} + +func isRegionalIndicator(_ *binaryProperty, c rune, _ Property) bool { + return 0x1F1E6 <= c && c <= 0x1F1FF +} + +func changesWhenCasefolded(_ *binaryProperty, c rune, _ Property) bool { + if c < 0 { + return false + } + + nfd := normalizer.Nfc().Decompose(c) + if nfd == nil { + nfd = []rune{c} + } + folded := ucase.FoldRunes(nfd) + return !slices.Equal(nfd, folded) +} + +func isPOSIXXdigit(_ *binaryProperty, c rune, _ Property) bool { + return uchar.IsXDigit(c) +} + +func isPOSIXPrint(_ *binaryProperty, c rune, _ Property) bool { + return uchar.IsPOSIXPrint(c) +} + +func isPOSIXGraph(_ *binaryProperty, c rune, _ Property) bool { + return uchar.IsGraphPOSIX(c) +} + +func isPOSIXBlank(_ *binaryProperty, c rune, _ Property) bool { + return uchar.IsBlank(c) +} + +func isPOSIXAlnum(_ *binaryProperty, c rune, _ Property) bool { + return (uchar.GetUnicodeProperties(c, 1)&uMask(pAlphabetic)) != 0 || uchar.IsDigit(c) +} + +func isJoinControl(_ *binaryProperty, c rune, _ Property) bool { + return ubidi.IsJoinControl(c) +} + +func hasFullCompositionExclusion(_ *binaryProperty, c rune, _ Property) bool { + impl := normalizer.Nfc() + return impl.IsCompNo(c) +} + +func caseBinaryPropertyContains(_ *binaryProperty, c rune, which Property) bool { + return HasBinaryPropertyUcase(c, which) +} + +func HasBinaryPropertyUcase(c rune, which Property) bool { + /* case mapping properties */ + switch which { + case UCharLowercase: + return ucase.Lower == ucase.GetType(c) + case UCharUppercase: + return ucase.Upper == ucase.GetType(c) + case UCharSoftDotted: + return ucase.IsSoftDotted(c) + case UCharCaseSensitive: + return ucase.IsCaseSensitive(c) + case UCharCased: + return ucase.None != ucase.GetType(c) + case UCharCaseIgnorable: + return (ucase.GetTypeOrIgnorable(c) >> 2) != 0 + /* + * Note: The following Changes_When_Xyz are defined as testing whether + * the NFD form of the input changes when Xyz-case-mapped. + * However, this simpler implementation of these properties, + * ignoring NFD, passes the tests. + * The implementation needs to be changed if the tests start failing. + * When that happens, optimizations should be used to work with the + * per-single-code point ucase_toFullXyz() functions unless + * the NFD form has more than one code point, + * and the property starts set needs to be the union of the + * start sets for normalization and case mappings. 
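+	 *
+	 * Example (editorial, for illustration only): 'A' (U+0041) has
+	 * Changes_When_Lowercased=Yes because it maps to 'a', while 'a' itself
+	 * is unchanged by lowercasing and therefore reports false here.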
+ */ + case UCharChangesWhenLowercased: + return ucase.ToFullLower(c) >= 0 + case UCharChangesWhenUppercased: + return ucase.ToFullUpper(c) >= 0 + case UCharChangesWhenTitlecased: + return ucase.ToFullTitle(c) >= 0 + /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */ + case UCharChangesWhenCasemapped: + return ucase.ToFullLower(c) >= 0 || ucase.ToFullUpper(c) >= 0 || ucase.ToFullTitle(c) >= 0 + default: + return false + } +} + +func isNormInert(_ *binaryProperty, c rune, which Property) bool { + mode := normalizer.Mode(int32(which) - int32(UCharNfdInert) + int32(normalizer.NormNfd)) + return normalizer.IsInert(c, mode) +} + +func HasBinaryProperty(c rune, which Property) bool { + if which < UCharBinaryStart || uCharBinaryLimit <= which { + return false + } + prop := binProps[which] + if prop.contains == nil { + return false + } + return prop.contains(prop, c, which) +} diff --git a/go/mysql/icuregex/internal/uprops/uprops_int.go b/go/mysql/icuregex/internal/uprops/uprops_int.go new file mode 100644 index 00000000000..3e62d31184f --- /dev/null +++ b/go/mysql/icuregex/internal/uprops/uprops_int.go @@ -0,0 +1,265 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uprops + +import ( + "vitess.io/vitess/go/mysql/icuregex/internal/normalizer" + "vitess.io/vitess/go/mysql/icuregex/internal/ubidi" + "vitess.io/vitess/go/mysql/icuregex/internal/uchar" + "vitess.io/vitess/go/mysql/icuregex/internal/ulayout" +) + +type intPropertyGetValue func(prop *intProperty, c rune, which Property) int32 + +type intProperty struct { + column propertySource + mask uint32 + shift int32 + getValue intPropertyGetValue +} + +const ( + blockMask = 0x0001ff00 + blockShift = 8 + + eaMask = 0x000e0000 + eaShift = 17 + + lbMask = 0x03f00000 + lbShift = 20 + + sbMask = 0x000f8000 + sbShift = 15 + + wbMask = 0x00007c00 + wbShift = 10 + + gcbMask = 0x000003e0 + gcbShift = 5 + + dtMask = 0x0000001f +) + +type numericType int32 + +/** + * Numeric Type constants. + * + * @see UCHAR_NUMERIC_TYPE + * @stable ICU 2.2 + */ +const ( + /* + * Note: UNumericType constants are parsed by preparseucd.py. + * It matches lines like + * U_NT_ + */ + + ntNone numericType = iota /*[None]*/ + ntDecimal /*[de]*/ + ntDigit /*[di]*/ + ntNumeric /*[nu]*/ + /** + * One more than the highest normal UNumericType value. + * The highest value is available via u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE). + * + * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. + */ + ntCount +) + +/** + * Hangul Syllable Type constants. + * + * @see UCHAR_HANGUL_SYLLABLE_TYPE + * @stable ICU 2.6 + */ + +type hangunSyllableType int32 + +const ( + /* + * Note: UHangulSyllableType constants are parsed by preparseucd.py. 
+ * It matches lines like + * U_HST_ + */ + + hstNotApplicable hangunSyllableType = iota /*[NA]*/ + hstLeadingJamo /*[L]*/ + hstVowelJamo /*[V]*/ + hstTrailingJamo /*[T]*/ + hstLvSyllable /*[LV]*/ + hstLvtSyllable /*[LVT]*/ + /** + * One more than the highest normal UHangulSyllableType value. + * The highest value is available via u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE). + * + * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. + */ + hstCount +) + +var intProps = [uCharIntLimit - UCharIntStart]*intProperty{ + /* + * column, mask and shift values for int-value properties from u_getUnicodeProperties(). + * Must be in order of corresponding UProperty, + * and there must be exactly one entry per int UProperty. + * + * Properties with mask==0 are handled in code. + * For them, column is the UPropertySource value. + */ + {srcBidi, 0, 0, getBiDiClass}, + {0, blockMask, blockShift, defaultGetValue}, + {srcNfc, 0, 0xff, getCombiningClass}, + {2, dtMask, 0, defaultGetValue}, + {0, eaMask, eaShift, defaultGetValue}, + {srcChar, 0, int32(uchar.CharCategoryCount - 1), getGeneralCategory}, + {srcBidi, 0, 0, getJoiningGroup}, + {srcBidi, 0, 0, getJoiningType}, + {2, lbMask, lbShift, defaultGetValue}, + {srcChar, 0, int32(ntCount - 1), getNumericType}, + {srcPropsvec, 0, 0, getScript}, + {srcPropsvec, 0, int32(hstCount - 1), getHangulSyllableType}, + // UCHAR_NFD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes" + {srcNfc, 0, int32(normalizer.Yes), getNormQuickCheck}, + // UCHAR_NFKD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes" + {srcNfkc, 0, int32(normalizer.Yes), getNormQuickCheck}, + // UCHAR_NFC_QUICK_CHECK: max=2=MAYBE + {srcNfc, 0, int32(normalizer.Maybe), getNormQuickCheck}, + // UCHAR_NFKC_QUICK_CHECK: max=2=MAYBE + {srcNfkc, 0, int32(normalizer.Maybe), getNormQuickCheck}, + {srcNfc, 0, 0xff, getLeadCombiningClass}, + {srcNfc, 0, 0xff, getTrailCombiningClass}, + {2, gcbMask, gcbShift, defaultGetValue}, + {2, sbMask, sbShift, defaultGetValue}, + {2, wbMask, wbShift, defaultGetValue}, + {srcBidi, 0, 0, getBiDiPairedBracketType}, + {srcInpc, 0, 0, getInPC}, + {srcInsc, 0, 0, getInSC}, + {srcVo, 0, 0, getVo}, +} + +func getVo(_ *intProperty, c rune, _ Property) int32 { + return int32(ulayout.VoTrie().Get(c)) +} + +func getInSC(_ *intProperty, c rune, _ Property) int32 { + return int32(ulayout.InscTrie().Get(c)) +} + +func getInPC(_ *intProperty, c rune, _ Property) int32 { + return int32(ulayout.InpcTrie().Get(c)) +} + +func getBiDiPairedBracketType(_ *intProperty, c rune, _ Property) int32 { + return int32(ubidi.PairedBracketType(c)) +} + +func getTrailCombiningClass(_ *intProperty, c rune, _ Property) int32 { + return int32(normalizer.Nfc().GetFCD16(c) & 0xff) +} + +func getLeadCombiningClass(_ *intProperty, c rune, _ Property) int32 { + val := int32(normalizer.Nfc().GetFCD16(c) >> 8) + return val +} + +func getNormQuickCheck(_ *intProperty, c rune, which Property) int32 { + return int32(normalizer.QuickCheck(c, normalizer.Mode(int32(which)-int32(UCharNfdQuickCheck)+int32(normalizer.NormNfd)))) +} + +/* + * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. + * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. 
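+ * For example, U+1100 (HANGUL CHOSEONG KIYEOK) has Grapheme_Cluster_Break=L
+ * and maps to hstLeadingJamo below, while U+AC00 (HANGUL SYLLABLE GA) has
+ * Grapheme_Cluster_Break=LV and maps to hstLvSyllable.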
+ */ +var gcbToHst = []hangunSyllableType{ + hstNotApplicable, /* U_GCB_OTHER */ + hstNotApplicable, /* U_GCB_CONTROL */ + hstNotApplicable, /* U_GCB_CR */ + hstNotApplicable, /* U_GCB_EXTEND */ + hstLeadingJamo, /* U_GCB_L */ + hstNotApplicable, /* U_GCB_LF */ + hstLvSyllable, /* U_GCB_LV */ + hstLvtSyllable, /* U_GCB_LVT */ + hstTrailingJamo, /* U_GCB_T */ + hstVowelJamo, /* U_GCB_V */ + /* + * Omit GCB values beyond what we need for hst. + * The code below checks for the array length. + */ +} + +func getHangulSyllableType(_ *intProperty, c rune, _ Property) int32 { + /* see comments on gcbToHst[] above */ + gcb := (int32(uchar.GetUnicodeProperties(c, 2)) & gcbMask) >> gcbShift + + if gcb < int32(len(gcbToHst)) { + return int32(gcbToHst[gcb]) + } + return int32(hstNotApplicable) +} + +func getScript(_ *intProperty, c rune, _ Property) int32 { + return script(c) +} + +func getNumericType(_ *intProperty, c rune, _ Property) int32 { + ntv := uchar.NumericTypeValue(c) + return int32(ntvGetType(ntv)) +} + +func getJoiningType(_ *intProperty, c rune, _ Property) int32 { + return int32(ubidi.JoinType(c)) +} + +func getJoiningGroup(_ *intProperty, c rune, _ Property) int32 { + return int32(ubidi.JoinGroup(c)) +} + +func getGeneralCategory(_ *intProperty, c rune, _ Property) int32 { + return int32(uchar.CharType(c)) +} + +func getCombiningClass(_ *intProperty, c rune, _ Property) int32 { + return int32(normalizer.Nfc().CombiningClass(c)) +} + +func defaultGetValue(prop *intProperty, c rune, _ Property) int32 { + return int32(uchar.GetUnicodeProperties(c, int(prop.column))&prop.mask) >> prop.shift +} + +func getBiDiClass(_ *intProperty, c rune, _ Property) int32 { + return int32(ubidi.Class(c)) +} + +func ntvGetType(ntv uint16) numericType { + switch { + case ntv == uchar.UPropsNtvNone: + return ntNone + case ntv < uchar.UPropsNtvDigitStart: + return ntDecimal + case ntv < uchar.UPropsNtvNumericStart: + return ntDigit + default: + return ntNumeric + } +} diff --git a/go/mysql/icuregex/internal/uprops/uscript.go b/go/mysql/icuregex/internal/uprops/uscript.go new file mode 100644 index 00000000000..8a4423849df --- /dev/null +++ b/go/mysql/icuregex/internal/uprops/uscript.go @@ -0,0 +1,505 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uprops + +import "vitess.io/vitess/go/mysql/icuregex/internal/uchar" + +/** + * Constants for ISO 15924 script codes. + * + * The current set of script code constants supports at least all scripts + * that are encoded in the version of Unicode which ICU currently supports. + * The names of the constants are usually derived from the + * Unicode script property value aliases. 
+ * See UAX #24 Unicode Script Property (http://www.unicode.org/reports/tr24/) + * and http://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt . + * + * In addition, constants for many ISO 15924 script codes + * are included, for use with language tags, CLDR data, and similar. + * Some of those codes are not used in the Unicode Character Database (UCD). + * For example, there are no characters that have a UCD script property value of + * Hans or Hant. All Han ideographs have the Hani script property value in Unicode. + * + * Private-use codes Qaaa..Qabx are not included, except as used in the UCD or in CLDR. + * + * Starting with ICU 55, script codes are only added when their scripts + * have been or will certainly be encoded in Unicode, + * and have been assigned Unicode script property value aliases, + * to ensure that their script names are stable and match the names of the constants. + * Script codes like Latf and Aran that are not subject to separate encoding + * may be added at any time. + * + * @stable ICU 2.2 + */ +type code int32 + +/* + * Note: UScriptCode constants and their ISO script code comments + * are parsed by preparseucd.py. + * It matches lines like + * USCRIPT_ = , / * * / + */ + +const ( + /** @stable ICU 2.2 */ + invalidCode code = -1 + /** @stable ICU 2.2 */ + common code = 0 /* Zyyy */ + /** @stable ICU 2.2 */ + inherited code = 1 /* Zinh */ /* "Code for inherited script", for non-spacing combining marks; also Qaai */ + /** @stable ICU 2.2 */ + arabic code = 2 /* Arab */ + /** @stable ICU 2.2 */ + armenian code = 3 /* Armn */ + /** @stable ICU 2.2 */ + bengali code = 4 /* Beng */ + /** @stable ICU 2.2 */ + bopomofo code = 5 /* Bopo */ + /** @stable ICU 2.2 */ + cherokee code = 6 /* Cher */ + /** @stable ICU 2.2 */ + coptic code = 7 /* Copt */ + /** @stable ICU 2.2 */ + cyrillic code = 8 /* Cyrl */ + /** @stable ICU 2.2 */ + deseret code = 9 /* Dsrt */ + /** @stable ICU 2.2 */ + devanagari code = 10 /* Deva */ + /** @stable ICU 2.2 */ + ethiopic code = 11 /* Ethi */ + /** @stable ICU 2.2 */ + georgian code = 12 /* Geor */ + /** @stable ICU 2.2 */ + gothic code = 13 /* Goth */ + /** @stable ICU 2.2 */ + greek code = 14 /* Grek */ + /** @stable ICU 2.2 */ + gujarati code = 15 /* Gujr */ + /** @stable ICU 2.2 */ + gurmukhi code = 16 /* Guru */ + /** @stable ICU 2.2 */ + han code = 17 /* Hani */ + /** @stable ICU 2.2 */ + hangul code = 18 /* Hang */ + /** @stable ICU 2.2 */ + hebrew code = 19 /* Hebr */ + /** @stable ICU 2.2 */ + hiragana code = 20 /* Hira */ + /** @stable ICU 2.2 */ + kannada code = 21 /* Knda */ + /** @stable ICU 2.2 */ + katakana code = 22 /* Kana */ + /** @stable ICU 2.2 */ + khmer code = 23 /* Khmr */ + /** @stable ICU 2.2 */ + lao code = 24 /* Laoo */ + /** @stable ICU 2.2 */ + latin code = 25 /* Latn */ + /** @stable ICU 2.2 */ + malayalam code = 26 /* Mlym */ + /** @stable ICU 2.2 */ + mongolian code = 27 /* Mong */ + /** @stable ICU 2.2 */ + myanmar code = 28 /* Mymr */ + /** @stable ICU 2.2 */ + ogham code = 29 /* Ogam */ + /** @stable ICU 2.2 */ + oldItalic code = 30 /* Ital */ + /** @stable ICU 2.2 */ + oriya code = 31 /* Orya */ + /** @stable ICU 2.2 */ + runic code = 32 /* Runr */ + /** @stable ICU 2.2 */ + sinhala code = 33 /* Sinh */ + /** @stable ICU 2.2 */ + syriac code = 34 /* Syrc */ + /** @stable ICU 2.2 */ + tamil code = 35 /* Taml */ + /** @stable ICU 2.2 */ + telugu code = 36 /* Telu */ + /** @stable ICU 2.2 */ + thaana code = 37 /* Thaa */ + /** @stable ICU 2.2 */ + thai code = 38 /* Thai */ + /** @stable ICU 2.2 
*/ + tibetan code = 39 /* Tibt */ + /** Canadian_Aboriginal script. @stable ICU 2.6 */ + canadianAboriginal code = 40 /* Cans */ + /** Canadian_Aboriginal script (alias). @stable ICU 2.2 */ + ucas code = canadianAboriginal + /** @stable ICU 2.2 */ + yi code = 41 /* Yiii */ + /* New scripts in Unicode 3.2 */ + /** @stable ICU 2.2 */ + tagalog code = 42 /* Tglg */ + /** @stable ICU 2.2 */ + hanunoo code = 43 /* Hano */ + /** @stable ICU 2.2 */ + buhid code = 44 /* Buhd */ + /** @stable ICU 2.2 */ + tagbanwa code = 45 /* Tagb */ + + /* New scripts in Unicode 4 */ + /** @stable ICU 2.6 */ + braille code = 46 /* Brai */ + /** @stable ICU 2.6 */ + cypriot code = 47 /* Cprt */ + /** @stable ICU 2.6 */ + limbu code = 48 /* Limb */ + /** @stable ICU 2.6 */ + linearB code = 49 /* Linb */ + /** @stable ICU 2.6 */ + osmanya code = 50 /* Osma */ + /** @stable ICU 2.6 */ + shavian code = 51 /* Shaw */ + /** @stable ICU 2.6 */ + taiLe code = 52 /* Tale */ + /** @stable ICU 2.6 */ + ugaratic code = 53 /* Ugar */ + + /** New script code in Unicode 4.0.1 @stable ICU 3.0 */ + katakanaOrHiragana = 54 /*Hrkt */ + + /* New scripts in Unicode 4.1 */ + /** @stable ICU 3.4 */ + buginese code = 55 /* Bugi */ + /** @stable ICU 3.4 */ + glagolitic code = 56 /* Glag */ + /** @stable ICU 3.4 */ + kharoshthi code = 57 /* Khar */ + /** @stable ICU 3.4 */ + sylotiNagri code = 58 /* Sylo */ + /** @stable ICU 3.4 */ + newTaiLue code = 59 /* Talu */ + /** @stable ICU 3.4 */ + tifinagh code = 60 /* Tfng */ + /** @stable ICU 3.4 */ + oldPersian code = 61 /* Xpeo */ + + /* New script codes from Unicode and ISO 15924 */ + /** @stable ICU 3.6 */ + balinese code = 62 /* Bali */ + /** @stable ICU 3.6 */ + batak code = 63 /* Batk */ + /** @stable ICU 3.6 */ + blissymbols code = 64 /* Blis */ + /** @stable ICU 3.6 */ + brahmi code = 65 /* Brah */ + /** @stable ICU 3.6 */ + cham code = 66 /* Cham */ + /** @stable ICU 3.6 */ + cirth code = 67 /* Cirt */ + /** @stable ICU 3.6 */ + oldChurchSlavonicCyrillic code = 68 /* Cyrs */ + /** @stable ICU 3.6 */ + demoticEgyptian code = 69 /* Egyd */ + /** @stable ICU 3.6 */ + hieraticEgyptian code = 70 /* Egyh */ + /** @stable ICU 3.6 */ + egyptianHieroglyphs code = 71 /* Egyp */ + /** @stable ICU 3.6 */ + khutsuri code = 72 /* Geok */ + /** @stable ICU 3.6 */ + simplfiedHan code = 73 /* Hans */ + /** @stable ICU 3.6 */ + traditionalHan code = 74 /* Hant */ + /** @stable ICU 3.6 */ + pahawhHmong code = 75 /* Hmng */ + /** @stable ICU 3.6 */ + oldHungarian code = 76 /* Hung */ + /** @stable ICU 3.6 */ + harappanIndus code = 77 /* Inds */ + /** @stable ICU 3.6 */ + javanese code = 78 /* Java */ + /** @stable ICU 3.6 */ + kayahLi code = 79 /* Kali */ + /** @stable ICU 3.6 */ + latinFraktur code = 80 /* Latf */ + /** @stable ICU 3.6 */ + latinGaelic code = 81 /* Latg */ + /** @stable ICU 3.6 */ + lepcha code = 82 /* Lepc */ + /** @stable ICU 3.6 */ + linearA code = 83 /* Lina */ + /** @stable ICU 4.6 */ + mandaic code = 84 /* Mand */ + /** @stable ICU 3.6 */ + mandaean code = mandaic + /** @stable ICU 3.6 */ + mayanHieroglyphs code = 85 /* Maya */ + /** @stable ICU 4.6 */ + meroiticHieroglyphs code = 86 /* Mero */ + /** @stable ICU 3.6 */ + meroitic code = meroiticHieroglyphs + /** @stable ICU 3.6 */ + nko code = 87 /* Nkoo */ + /** @stable ICU 3.6 */ + orkhon code = 88 /* Orkh */ + /** @stable ICU 3.6 */ + oldPermic code = 89 /* Perm */ + /** @stable ICU 3.6 */ + phagsPa code = 90 /* Phag */ + /** @stable ICU 3.6 */ + phoenician code = 91 /* Phnx */ + /** @stable ICU 52 */ + miao code = 92 /* Plrd 
*/ + /** @stable ICU 3.6 */ + phoneticPollard code = miao + /** @stable ICU 3.6 */ + rongoRongo code = 93 /* Roro */ + /** @stable ICU 3.6 */ + sarati code = 94 /* Sara */ + /** @stable ICU 3.6 */ + extrangeloSyriac code = 95 /* Syre */ + /** @stable ICU 3.6 */ + westernSyriac code = 96 /* Syrj */ + /** @stable ICU 3.6 */ + easternSyriac code = 97 /* Syrn */ + /** @stable ICU 3.6 */ + tengwar code = 98 /* Teng */ + /** @stable ICU 3.6 */ + vai code = 99 /* Vaii */ + /** @stable ICU 3.6 */ + visibleSpeech code = 100 /* Visp */ + /** @stable ICU 3.6 */ + cuneiform code = 101 /* Xsux */ + /** @stable ICU 3.6 */ + unwrittenLanguages code = 102 /* Zxxx */ + /** @stable ICU 3.6 */ + unknown code = 103 /* Zzzz */ /* Unknown="Code for uncoded script", for unassigned code points */ + + /** @stable ICU 3.8 */ + carian code = 104 /* Cari */ + /** @stable ICU 3.8 */ + japanese code = 105 /* Jpan */ + /** @stable ICU 3.8 */ + lanna code = 106 /* Lana */ + /** @stable ICU 3.8 */ + lycian code = 107 /* Lyci */ + /** @stable ICU 3.8 */ + lydian code = 108 /* Lydi */ + /** @stable ICU 3.8 */ + olChiki code = 109 /* Olck */ + /** @stable ICU 3.8 */ + rejang code = 110 /* Rjng */ + /** @stable ICU 3.8 */ + saurashtra code = 111 /* Saur */ + /** Sutton SignWriting @stable ICU 3.8 */ + signWriting code = 112 /* Sgnw */ + /** @stable ICU 3.8 */ + sundanese code = 113 /* Sund */ + /** @stable ICU 3.8 */ + moon code = 114 /* Moon */ + /** @stable ICU 3.8 */ + meiteiMayek code = 115 /* Mtei */ + + /** @stable ICU 4.0 */ + imperialAramaic code = 116 /* Armi */ + /** @stable ICU 4.0 */ + avestan code = 117 /* Avst */ + /** @stable ICU 4.0 */ + chakma code = 118 /* Cakm */ + /** @stable ICU 4.0 */ + korean code = 119 /* Kore */ + /** @stable ICU 4.0 */ + kaithi code = 120 /* Kthi */ + /** @stable ICU 4.0 */ + manichaean code = 121 /* Mani */ + /** @stable ICU 4.0 */ + inscriptionalPahlavi code = 122 /* Phli */ + /** @stable ICU 4.0 */ + psalterPahlavi code = 123 /* Phlp */ + /** @stable ICU 4.0 */ + bookPahlavi code = 124 /* Phlv */ + /** @stable ICU 4.0 */ + inscriptionalParthian code = 125 /* Prti */ + /** @stable ICU 4.0 */ + samaritan code = 126 /* Samr */ + /** @stable ICU 4.0 */ + taiViet code = 127 /* Tavt */ + /** @stable ICU 4.0 */ + mathematicalNotation code = 128 /* Zmth */ + /** @stable ICU 4.0 */ + symbols code = 129 /* Zsym */ + + /** @stable ICU 4.4 */ + bamum code = 130 /* Bamu */ + /** @stable ICU 4.4 */ + lisu code = 131 /* Lisu */ + /** @stable ICU 4.4 */ + nakhiGeba code = 132 /* Nkgb */ + /** @stable ICU 4.4 */ + oldSouthArabian code = 133 /* Sarb */ + + /** @stable ICU 4.6 */ + bassaVah code = 134 /* Bass */ + /** @stable ICU 54 */ + duployan code = 135 /* Dupl */ + /** @stable ICU 4.6 */ + elbasan code = 136 /* Elba */ + /** @stable ICU 4.6 */ + grantha code = 137 /* Gran */ + /** @stable ICU 4.6 */ + kpelle code = 138 /* Kpel */ + /** @stable ICU 4.6 */ + loma code = 139 /* Loma */ + /** Mende Kikakui @stable ICU 4.6 */ + mende code = 140 /* Mend */ + /** @stable ICU 4.6 */ + meroiticCursive code = 141 /* Merc */ + /** @stable ICU 4.6 */ + oldNorthArabian code = 142 /* Narb */ + /** @stable ICU 4.6 */ + nabataean code = 143 /* Nbat */ + /** @stable ICU 4.6 */ + palmyrene code = 144 /* Palm */ + /** @stable ICU 54 */ + khudawadi code = 145 /* Sind */ + /** @stable ICU 4.6 */ + sindhi code = khudawadi + /** @stable ICU 4.6 */ + warangCiti code = 146 /* Wara */ + + /** @stable ICU 4.8 */ + afaka code = 147 /* Afak */ + /** @stable ICU 4.8 */ + jurchen code = 148 /* Jurc */ + /** @stable ICU 4.8 
*/ + mro code = 149 /* Mroo */ + /** @stable ICU 4.8 */ + nushu code = 150 /* Nshu */ + /** @stable ICU 4.8 */ + sharada code = 151 /* Shrd */ + /** @stable ICU 4.8 */ + soraSompeng code = 152 /* Sora */ + /** @stable ICU 4.8 */ + takri code = 153 /* Takr */ + /** @stable ICU 4.8 */ + tangut code = 154 /* Tang */ + /** @stable ICU 4.8 */ + woleai code = 155 /* Wole */ + + /** @stable ICU 49 */ + anatolianHieroglyphs code = 156 /* Hluw */ + /** @stable ICU 49 */ + khojki code = 157 /* Khoj */ + /** @stable ICU 49 */ + tirhuta code = 158 /* Tirh */ + + /** @stable ICU 52 */ + caucasianAlbanian code = 159 /* Aghb */ + /** @stable ICU 52 */ + mahajani code = 160 /* Mahj */ + + /** @stable ICU 54 */ + ahom code = 161 /* Ahom */ + /** @stable ICU 54 */ + hatran code = 162 /* Hatr */ + /** @stable ICU 54 */ + modi code = 163 /* Modi */ + /** @stable ICU 54 */ + multani code = 164 /* Mult */ + /** @stable ICU 54 */ + pauCinHau code = 165 /* Pauc */ + /** @stable ICU 54 */ + siddham code = 166 /* Sidd */ + + /** @stable ICU 58 */ + adlam code = 167 /* Adlm */ + /** @stable ICU 58 */ + bhaiksuki code = 168 /* Bhks */ + /** @stable ICU 58 */ + marchen code = 169 /* Marc */ + /** @stable ICU 58 */ + newa code = 170 /* Newa */ + /** @stable ICU 58 */ + osage code = 171 /* Osge */ + + /** @stable ICU 58 */ + hanWithBopomofo code = 172 /* Hanb */ + /** @stable ICU 58 */ + jamo code = 173 /* Jamo */ + /** @stable ICU 58 */ + symbolsEmoji code = 174 /* Zsye */ + + /** @stable ICU 60 */ + masaramGondi code = 175 /* Gonm */ + /** @stable ICU 60 */ + soyombo code = 176 /* Soyo */ + /** @stable ICU 60 */ + zanabazarSquare code = 177 /* Zanb */ + + /** @stable ICU 62 */ + dogra code = 178 /* Dogr */ + /** @stable ICU 62 */ + gunjalaGondi code = 179 /* Gong */ + /** @stable ICU 62 */ + makasar code = 180 /* Maka */ + /** @stable ICU 62 */ + medefaidrin code = 181 /* Medf */ + /** @stable ICU 62 */ + hanifiRohingya code = 182 /* Rohg */ + /** @stable ICU 62 */ + sogdian code = 183 /* Sogd */ + /** @stable ICU 62 */ + oldSogdian code = 184 /* Sogo */ + + /** @stable ICU 64 */ + elymaic code = 185 /* Elym */ + /** @stable ICU 64 */ + nyiakengPuachueHmong code = 186 /* Hmnp */ + /** @stable ICU 64 */ + nandinagari code = 187 /* Nand */ + /** @stable ICU 64 */ + wancho code = 188 /* Wcho */ + + /** @stable ICU 66 */ + chorasmian code = 189 /* Chrs */ + /** @stable ICU 66 */ + divesAkuru code = 190 /* Diak */ + /** @stable ICU 66 */ + khitanSmallScript code = 191 /* Kits */ + /** @stable ICU 66 */ + yezedi code = 192 /* Yezi */ +) + +func uscriptHasScript(c rune, sc code) bool { + scriptX := uchar.GetUnicodeProperties(c, 0) & scriptXMask + codeOrIndex := mergeScriptCodeOrIndex(scriptX) + if scriptX < scriptXWithCommon { + return sc == code(codeOrIndex) + } + + scx := uchar.ScriptExtensions(codeOrIndex) + if scriptX >= scriptXWithOther { + scx = uchar.ScriptExtensions(uint32(scx[1])) + } + sc32 := uint32(sc) + if sc32 > 0x7fff { + /* Guard against bogus input that would make us go past the Script_Extensions terminator. */ + return false + } + for sc32 > uint32(scx[0]) { + scx = scx[1:] + } + return sc32 == uint32(scx[0]&0x7fff) +} diff --git a/go/mysql/icuregex/internal/uset/close.go b/go/mysql/icuregex/internal/uset/close.go new file mode 100644 index 00000000000..bd3f9f0f7e3 --- /dev/null +++ b/go/mysql/icuregex/internal/uset/close.go @@ -0,0 +1,96 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. 
+ +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uset + +import "vitess.io/vitess/go/mysql/icuregex/internal/ucase" + +type USet uint32 + +const ( + /** + * Ignore white space within patterns unless quoted or escaped. + * @stable ICU 2.4 + */ + IgnoreSpace USet = 1 + + /** + * Enable case insensitive matching. E.g., "[ab]" with this flag + * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will + * match all except 'a', 'A', 'b', and 'B'. This performs a full + * closure over case mappings, e.g. U+017F for s. + * + * The resulting set is a superset of the input for the code points but + * not for the strings. + * It performs a case mapping closure of the code points and adds + * full case folding strings for the code points, and reduces strings of + * the original set to their full case folding equivalents. + * + * This is designed for case-insensitive matches, for example + * in regular expressions. The full code point case closure allows checking of + * an input character directly against the closure set. + * Strings are matched by comparing the case-folded form from the closure + * set with an incremental case folding of the string in question. + * + * The closure set will also contain single code points if the original + * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). + * This is not necessary (that is, redundant) for the above matching method + * but results in the same closure sets regardless of whether the original + * set contained the code point or a string. + * + * @stable ICU 2.4 + */ + CaseInsensitive USet = 2 + + /** + * Enable case insensitive matching. E.g., "[ab]" with this flag + * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will + * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, + * title-, and uppercase mappings as well as the case folding + * of each existing element in the set. + * @stable ICU 3.2 + */ + AddCaseMappings USet = 4 +) + +func (u *UnicodeSet) CloseOver(attribute USet) { + if attribute&AddCaseMappings != 0 { + panic("USET_ADD_CASE_MAPPINGS is unsupported") + } + if (attribute & CaseInsensitive) == 0 { + return + } + + foldSet := u.Clone() + n := u.RangeCount() + + for i := 0; i < n; i++ { + start := u.RangeStart(i) + end := u.RangeEnd(i) + + // full case closure + for cp := start; cp <= end; cp++ { + ucase.AddCaseClosure(cp, foldSet) + } + } + + *u = *foldSet +} diff --git a/go/mysql/icuregex/internal/uset/frozen.go b/go/mysql/icuregex/internal/uset/frozen.go new file mode 100644 index 00000000000..2703a4f6975 --- /dev/null +++ b/go/mysql/icuregex/internal/uset/frozen.go @@ -0,0 +1,339 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. 
+License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uset + +type frozen struct { + // One byte 0 or 1 per Latin-1 character. + latin1Contains [0x100]byte + + // true if contains(U+FFFD) + containsFFFD bool + + /* + * One bit per code point from U+0000..U+07FF. + * The bits are organized vertically; consecutive code points + * correspond to the same bit positions in consecutive table words. + * With code point parts + * lead=c{10..6} + * trail=c{5..0} + * it is set.contains(c)==(table7FF[trail] bit lead) + * + * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) + * for faster validity checking at runtime. + */ + table7FF [64]uint32 + + /* + * One bit per 64 BMP code points. + * The bits are organized vertically; consecutive 64-code point blocks + * correspond to the same bit position in consecutive table words. + * With code point parts + * lead=c{15..12} + * t1=c{11..6} + * test bits (lead+16) and lead in bmpBlockBits[t1]. + * If the upper bit is 0, then the lower bit indicates if contains(c) + * for all code points in the 64-block. + * If the upper bit is 1, then the block is mixed and set.contains(c) + * must be called. + * + * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to + * the result of contains(FFFD) for faster validity checking at runtime. + */ + bmpBlockBits [64]uint32 + + /* + * Inversion list indexes for restricted binary searches in + * findCodePoint(), from + * findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000). + * U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are + * always looked up in the bit tables. + * The last pair of indexes is for finding supplementary code points. + */ + list4kStarts [18]int32 +} + +func freeze(list []rune) *frozen { + f := &frozen{} + + listEnd := int32(len(list) - 1) + + f.list4kStarts[0] = f.findCodePoint(list, 0x800, 0, listEnd) + for i := 1; i <= 0x10; i++ { + f.list4kStarts[i] = f.findCodePoint(list, rune(i)<<12, f.list4kStarts[i-1], listEnd) + } + f.list4kStarts[0x11] = listEnd + f.containsFFFD = f.containsSlow(list, 0xfffd, f.list4kStarts[0xf], f.list4kStarts[0x10]) + + f.initBits(list) + f.overrideIllegal() + + return f +} + +func (f *frozen) containsSlow(list []rune, c rune, lo, hi int32) bool { + return (f.findCodePoint(list, c, lo, hi) & 1) != 0 +} + +func (f *frozen) findCodePoint(list []rune, c rune, lo, hi int32) int32 { + /* Examples: + findCodePoint(c) + set list[] c=0 1 3 4 7 8 + === ============== =========== + [] [110000] 0 0 0 0 0 0 + [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 + [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 + [:Any:] [0, 110000] 1 1 1 1 1 1 + */ + + // Return the smallest i such that c < list[i]. Assume + // list[len - 1] == HIGH and that c is legal (0..HIGH-1). + if c < list[lo] { + return lo + } + // High runner test. c is often after the last range, so an + // initial check for this condition pays off. 
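+	// Illustrative example (editorial): with list = {0x41, 0x5B, 0x110000},
+	// i.e. the set [A-Z], this search returns 1 for 'Z' (an odd index, so the
+	// code point is inside a range) and 2 for 'a' (even, so outside), given
+	// lo=0 and hi pointing at the final 0x110000 sentinel.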
+ if lo >= hi || c >= list[hi-1] { + return hi + } + // invariant: c >= list[lo] + // invariant: c < list[hi] + for { + i := (lo + hi) >> 1 + if i == lo { + break // Found! + } else if c < list[i] { + hi = i + } else { + lo = i + } + } + return hi +} + +func (f *frozen) set32x64bits(table *[64]uint32, start, limit int32) { + lead := start >> 6 // Named for UTF-8 2-byte lead byte with upper 5 bits. + trail := start & 0x3f // Named for UTF-8 2-byte trail byte with lower 6 bits. + + // Set one bit indicating an all-one block. + bits := uint32(1) << lead + if (start + 1) == limit { // Single-character shortcut. + table[trail] |= bits + return + } + + limitLead := limit >> 6 + limitTrail := limit & 0x3f + + if lead == limitLead { + // Partial vertical bit column. + for trail < limitTrail { + table[trail] |= bits + trail++ + } + } else { + // Partial vertical bit column, + // followed by a bit rectangle, + // followed by another partial vertical bit column. + if trail > 0 { + for { + table[trail] |= bits + trail++ + if trail >= 64 { + break + } + } + lead++ + } + if lead < limitLead { + bits = ^((uint32(1) << lead) - 1) + if limitLead < 0x20 { + bits &= (uint32(1) << limitLead) - 1 + } + for trail = 0; trail < 64; trail++ { + table[trail] |= bits + } + } + // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. + // In that case, bits=1<= 0x100 { + break + } + for { + f.latin1Contains[start] = 1 + start++ + if start >= limit || start >= 0x100 { + break + } + } + if limit > 0x100 { + break + } + } + + // Find the first range overlapping with (or after) 80..FF again, + // to include them in table7FF as well. + listIndex = 0 + for { + start = list[listIndex] + listIndex++ + if listIndex < len(list) { + limit = list[listIndex] + listIndex++ + } else { + limit = 0x110000 + } + if limit > 0x80 { + if start < 0x80 { + start = 0x80 + } + break + } + } + + // Set table7FF[]. + for start < 0x800 { + var end rune + if limit <= 0x800 { + end = limit + } else { + end = 0x800 + } + f.set32x64bits(&f.table7FF, start, end) + if limit > 0x800 { + start = 0x800 + break + } + + start = list[listIndex] + listIndex++ + if listIndex < len(list) { + limit = list[listIndex] + listIndex++ + } else { + limit = 0x110000 + } + } + + // Set bmpBlockBits[]. + minStart := rune(0x800) + for start < 0x10000 { + if limit > 0x10000 { + limit = 0x10000 + } + + if start < minStart { + start = minStart + } + if start < limit { // Else: Another range entirely in a known mixed-value block. + if (start & 0x3f) != 0 { + // Mixed-value block of 64 code points. + start >>= 6 + f.bmpBlockBits[start&0x3f] |= 0x10001 << (start >> 6) + start = (start + 1) << 6 // Round up to the next block boundary. + minStart = start // Ignore further ranges in this block. + } + if start < limit { + if start < (limit &^ 0x3f) { + // Multiple all-ones blocks of 64 code points each. + f.set32x64bits(&f.bmpBlockBits, start>>6, limit>>6) + } + + if (limit & 0x3f) != 0 { + // Mixed-value block of 64 code points. + limit >>= 6 + f.bmpBlockBits[limit&0x3f] |= 0x10001 << (limit >> 6) + limit = (limit + 1) << 6 // Round up to the next block boundary. + minStart = limit // Ignore further ranges in this block. 
+ } + } + } + + if limit == 0x10000 { + break + } + + start = list[listIndex] + listIndex++ + if listIndex < len(list) { + limit = list[listIndex] + listIndex++ + } else { + limit = 0x110000 + } + } +} diff --git a/go/mysql/icuregex/internal/uset/pattern.go b/go/mysql/icuregex/internal/uset/pattern.go new file mode 100644 index 00000000000..20b44da9c6d --- /dev/null +++ b/go/mysql/icuregex/internal/uset/pattern.go @@ -0,0 +1,107 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uset + +import ( + "strings" + + "vitess.io/vitess/go/mysql/icuregex/internal/pattern" +) + +func (u *UnicodeSet) String() string { + var buf strings.Builder + u.ToPattern(&buf, true) + return buf.String() +} + +func (u *UnicodeSet) ToPattern(w *strings.Builder, escapeUnprintable bool) { + w.WriteByte('[') + + // // Check against the predefined categories. We implicitly build + // // up ALL category sets the first time toPattern() is called. + // for (int8_t cat=0; cat 1 && u.RangeStart(0) == MinValue && u.RangeEnd(count-1) == MaxValue { + + // Emit the inverse + w.WriteByte('^') + + for i := 1; i < count; i++ { + start := u.RangeEnd(i-1) + 1 + end := u.RangeStart(i) - 1 + u.appendToPattern(w, start, escapeUnprintable) + if start != end { + if (start + 1) != end { + w.WriteByte('-') + } + u.appendToPattern(w, end, escapeUnprintable) + } + } + } else { + // Default; emit the ranges as pairs + for i := 0; i < count; i++ { + start := u.RangeStart(i) + end := u.RangeEnd(i) + u.appendToPattern(w, start, escapeUnprintable) + if start != end { + if (start + 1) != end { + w.WriteByte('-') + } + u.appendToPattern(w, end, escapeUnprintable) + } + } + } + + w.WriteByte(']') +} + +func (u *UnicodeSet) appendToPattern(w *strings.Builder, c rune, escapeUnprintable bool) { + if escapeUnprintable && pattern.IsUnprintable(c) { + // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything + // unprintable + pattern.EscapeUnprintable(w, c) + return + } + + // Okay to let ':' pass through + switch c { + case '[', ']', '-', '^', '&', '\\', '{', '}', ':', '$': + w.WriteByte('\\') + default: + // Escape whitespace + if pattern.IsWhitespace(c) { + w.WriteByte('\\') + } + } + w.WriteRune(c) +} diff --git a/go/mysql/icuregex/internal/uset/unicode_set.go b/go/mysql/icuregex/internal/uset/unicode_set.go new file mode 100644 index 00000000000..3dba317eab2 --- /dev/null +++ b/go/mysql/icuregex/internal/uset/unicode_set.go @@ -0,0 +1,694 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. 
+License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uset + +import ( + "fmt" + + "golang.org/x/exp/slices" +) + +// HIGH_VALUE > all valid values. 110000 for codepoints +const unicodeSetHigh = 0x0110000 + +// LOW <= all valid values. ZERO for codepoints +const unicodeSetLow = 0x000000 + +const ( + /** + * Minimum value that can be stored in a UnicodeSet. + * @stable ICU 2.4 + */ + MinValue = 0 + + /** + * Maximum value that can be stored in a UnicodeSet. + * @stable ICU 2.4 + */ + MaxValue = 0x10ffff +) + +type UnicodeSet struct { + list []rune + buffer []rune + frozen *frozen +} + +func New() *UnicodeSet { + buf := make([]rune, 1, 25) + buf[0] = unicodeSetHigh + return &UnicodeSet{list: buf} +} + +func FromRunes(list []rune) *UnicodeSet { + return &UnicodeSet{list: list} +} + +func (u *UnicodeSet) ensureBufferCapacity(c int) { + if cap(u.buffer) < c { + u.buffer = make([]rune, c) + return + } + u.buffer = u.buffer[:cap(u.buffer)] +} + +func (u *UnicodeSet) addbuffer(other []rune, polarity int8) { + if u.frozen != nil { + panic("UnicodeSet is frozen") + } + u.ensureBufferCapacity(len(u.list) + len(other)) + + i := 1 + j := 1 + k := 0 + + a := u.list[0] + b := other[0] + + for { + switch polarity { + case 0: + if a < b { + if k > 0 && a <= u.buffer[k-1] { + k-- + a = max(u.list[i], u.buffer[k]) + } else { + u.buffer[k] = a + k++ + a = u.list[i] + } + i++ + polarity ^= 1 + } else if b < a { + if k > 0 && b <= u.buffer[k-1] { + k-- + b = max(other[j], u.buffer[k]) + } else { + u.buffer[k] = b + k++ + b = other[j] + } + j++ + polarity ^= 2 + } else { + if a == unicodeSetHigh { + goto loopEnd + } + if k > 0 && a <= u.buffer[k-1] { + k-- + a = max(u.list[i], u.buffer[k]) + } else { + u.buffer[k] = a + k++ + a = u.list[i] + } + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + } + case 3: + if b <= a { + if a == unicodeSetHigh { + goto loopEnd + } + u.buffer[k] = a + k++ + } else { + if b == unicodeSetHigh { + goto loopEnd + } + u.buffer[k] = b + k++ + } + a = u.list[i] + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + case 1: + if a < b { + u.buffer[k] = a + k++ + a = u.list[i] + i++ + polarity ^= 1 + } else if b < a { + b = other[j] + j++ + polarity ^= 2 + } else { + if a == unicodeSetHigh { + goto loopEnd + } + a = u.list[i] + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + } + case 2: + if b < a { + u.buffer[k] = b + k++ + b = other[j] + j++ + polarity ^= 2 + } else if a < b { + a = u.list[i] + i++ + polarity ^= 1 + } else { + if a == unicodeSetHigh { + goto loopEnd + } + a = u.list[i] + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + } + } + } + +loopEnd: + u.buffer[k] = unicodeSetHigh + k++ + + u.list, u.buffer = u.buffer[:k], u.list +} + +func max(a, b rune) rune { + if a > b { + return a + } + return b +} + +func pinCodePoint(c *rune) rune { + if *c < unicodeSetLow { + *c = unicodeSetLow + } else if *c > (unicodeSetHigh - 1) { + *c = unicodeSetHigh - 1 + } + return *c +} + 
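+// Editorial sketch (not part of the ICU-derived sources): the set is stored
+// as an inversion list — ascending [start, limit) boundaries terminated by
+// unicodeSetHigh — so the set {'a'..'c', 'x'} is represented as
+// []rune{'a', 'd', 'x', 'y', unicodeSetHigh}. A minimal usage example,
+// assuming only the methods defined in this file:
+//
+//	s := New()
+//	s.AddRuneRange('a', 'c')
+//	s.AddRune('x')
+//	_ = s.ContainsRune('b') // true
+//	_ = s.ContainsRune('d') // false
+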
+func (u *UnicodeSet) AddRune(c rune) { + if u.frozen != nil { + panic("UnicodeSet is frozen") + } + + // find smallest i such that c < list[i] + // if odd, then it is IN the set + // if even, then it is OUT of the set + i := u.findCodePoint(pinCodePoint(&c)) + + // already in set? + if (i & 1) != 0 { + return + } + + // HIGH is 0x110000 + // assert(list[len-1] == HIGH); + + // empty = [HIGH] + // [start_0, limit_0, start_1, limit_1, HIGH] + + // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] + // ^ + // list[i] + + // i == 0 means c is before the first range + if c == u.list[i]-1 { + // c is before start of next range + u.list[i] = c + // if we touched the HIGH mark, then add a new one + if c == (unicodeSetHigh - 1) { + u.list = append(u.list, unicodeSetHigh) + } + if i > 0 && c == u.list[i-1] { + // collapse adjacent ranges + + // [..., start_k-1, c, c, limit_k, ..., HIGH] + // ^ + // list[i] + for k := i - 1; k < len(u.list)-2; k++ { + u.list[k] = u.list[k+2] + } + u.list = u.list[:len(u.list)-2] + } + } else if i > 0 && c == u.list[i-1] { + // c is after end of prior range + u.list[i-1]++ + // no need to check for collapse here + } else { + // At this point we know the new char is not adjacent to + // any existing ranges, and it is not 10FFFF. + + // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] + // ^ + // list[i] + + // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] + // ^ + // list[i] + u.list = slices.Insert(u.list, i, c, c+1) + } +} + +func (u *UnicodeSet) AddRuneRange(start, end rune) { + if pinCodePoint(&start) < pinCodePoint(&end) { + limit := end + 1 + // Fast path for adding a new range after the last one. + // Odd list length: [..., lastStart, lastLimit, HIGH] + if (len(u.list) & 1) != 0 { + // If the list is empty, set lastLimit low enough to not be adjacent to 0. + var lastLimit rune + if len(u.list) == 1 { + lastLimit = -2 + } else { + lastLimit = u.list[len(u.list)-2] + } + if lastLimit <= start { + if lastLimit == start { + // Extend the last range. + u.list[len(u.list)-2] = limit + if limit == unicodeSetHigh { + u.list = u.list[:len(u.list)-1] + } + } else { + u.list[len(u.list)-1] = start + if limit < unicodeSetHigh { + u.list = append(u.list, limit) + u.list = append(u.list, unicodeSetHigh) + } else { // limit == UNICODESET_HIGH + u.list = append(u.list, unicodeSetHigh) + } + } + return + } + } + // This is slow. Could be much faster using findCodePoint(start) + // and modifying the list, dealing with adjacent & overlapping ranges. 
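+		// For example (editorial note), adding U+0080..U+00FF to a set that
+		// already contains U+0100..U+01FF cannot use the append fast path
+		// above, so it falls through to the general merge in addbuffer, which
+		// coalesces the two into the single range U+0080..U+01FF.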
+ addRange := [3]rune{start, limit, unicodeSetHigh} + u.addbuffer(addRange[:], 0) + } else if start == end { + u.AddRune(start) + } +} + +func (u *UnicodeSet) AddAll(u2 *UnicodeSet) { + if len(u2.list) > 0 { + u.addbuffer(u2.list, 0) + } +} + +func (u *UnicodeSet) Complement() { + if u.frozen != nil { + panic("UnicodeSet is frozen") + } + if u.list[0] == unicodeSetLow { + copy(u.list, u.list[1:]) + u.list = u.list[:len(u.list)-1] + } else { + u.list = slices.Insert(u.list, 0, unicodeSetLow) + } +} + +func (u *UnicodeSet) RemoveRuneRange(start, end rune) { + if pinCodePoint(&start) < pinCodePoint(&end) { + r := [3]rune{start, end + 1, unicodeSetHigh} + u.retain(r[:], 2) + } +} + +func (u *UnicodeSet) RemoveAll(c *UnicodeSet) { + u.retain(c.list, 2) +} + +func (u *UnicodeSet) RetainAll(c *UnicodeSet) { + u.retain(c.list, 0) +} + +func (u *UnicodeSet) retain(other []rune, polarity int8) { + if u.frozen != nil { + panic("UnicodeSet is frozen") + } + + u.ensureBufferCapacity(len(u.list) + len(other)) + + i := 1 + j := 1 + k := 0 + + a := u.list[0] + b := other[0] + + // change from xor is that we have to check overlapping pairs + // polarity bit 1 means a is second, bit 2 means b is. + for { + switch polarity { + case 0: // both first; drop the smaller + if a < b { // drop a + a = u.list[i] + i++ + polarity ^= 1 + } else if b < a { // drop b + b = other[j] + j++ + polarity ^= 2 + } else { // a == b, take one, drop other + if a == unicodeSetHigh { + goto loop_end + } + u.buffer[k] = a + k++ + a = u.list[i] + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + } + case 3: // both second; take lower if unequal + if a < b { // take a + u.buffer[k] = a + k++ + a = u.list[i] + i++ + polarity ^= 1 + } else if b < a { // take b + u.buffer[k] = b + k++ + b = other[j] + j++ + polarity ^= 2 + } else { // a == b, take one, drop other + if a == unicodeSetHigh { + goto loop_end + } + u.buffer[k] = a + k++ + a = u.list[i] + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + } + case 1: // a second, b first; + if a < b { // NO OVERLAP, drop a + a = u.list[i] + i++ + polarity ^= 1 + } else if b < a { // OVERLAP, take b + u.buffer[k] = b + k++ + b = other[j] + j++ + polarity ^= 2 + } else { // a == b, drop both! + if a == unicodeSetHigh { + goto loop_end + } + a = u.list[i] + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + } + case 2: // a first, b second; if a < b, overlap + if b < a { // no overlap, drop b + b = other[j] + j++ + polarity ^= 2 + } else if a < b { // OVERLAP, take a + u.buffer[k] = a + k++ + a = u.list[i] + i++ + polarity ^= 1 + } else { // a == b, drop both! 
+ if a == unicodeSetHigh { + goto loop_end + } + a = u.list[i] + i++ + polarity ^= 1 + b = other[j] + j++ + polarity ^= 2 + } + } + } + +loop_end: + u.buffer[k] = unicodeSetHigh // terminate + k++ + u.list, u.buffer = u.buffer[:k], u.list +} + +func (u *UnicodeSet) Clear() { + if u.frozen != nil { + panic("UnicodeSet is frozen") + } + u.list = u.list[:1] + u.list[0] = unicodeSetHigh +} + +func (u *UnicodeSet) Len() (n int) { + count := u.RangeCount() + for i := 0; i < count; i++ { + n += int(u.RangeEnd(i)) - int(u.RangeStart(i)) + 1 + } + return +} + +func (u *UnicodeSet) RangeCount() int { + return len(u.list) / 2 +} + +func (u *UnicodeSet) RangeStart(idx int) rune { + return u.list[idx*2] +} + +func (u *UnicodeSet) RangeEnd(idx int) rune { + return u.list[idx*2+1] - 1 +} + +func (u *UnicodeSet) RuneAt(idx int) rune { + if idx >= 0 { + // len2 is the largest even integer <= len, that is, it is len + // for even values and len-1 for odd values. With odd values + // the last entry is UNICODESET_HIGH. + len2 := len(u.list) + if (len2 & 0x1) != 0 { + len2-- + } + + var i int + for i < len2 { + start := u.list[i] + count := int(u.list[i+1] - start) + i += 2 + if idx < count { + return start + rune(idx) + } + idx -= count + } + } + return -1 +} + +func (u *UnicodeSet) ContainsRune(c rune) bool { + if f := u.frozen; f != nil { + if c < 0 { + return false + } else if c <= 0xff { + return f.latin1Contains[c] != 0 + } else if c <= 0x7ff { + return (f.table7FF[c&0x3f] & (uint32(1) << (c >> 6))) != 0 + } else if c < 0xd800 || (c >= 0xe000 && c <= 0xffff) { + lead := c >> 12 + twoBits := (f.bmpBlockBits[(c>>6)&0x3f] >> lead) & 0x10001 + if twoBits <= 1 { + // All 64 code points with the same bits 15..6 + // are either in the set or not. + return twoBits != 0 + } + // Look up the code point in its 4k block of code points. + return f.containsSlow(u.list, c, f.list4kStarts[lead], f.list4kStarts[lead+1]) + } else if c <= 0x10ffff { + // surrogate or supplementary code point + return f.containsSlow(u.list, c, f.list4kStarts[0xd], f.list4kStarts[0x11]) + } + // Out-of-range code points get FALSE, consistent with long-standing + // behavior of UnicodeSet::contains(c). + return false + } + + if c >= unicodeSetHigh { + return false + } + i := u.findCodePoint(c) + return (i & 1) != 0 +} + +func (u *UnicodeSet) ContainsRuneRange(from, to rune) bool { + i := u.findCodePoint(from) + return (i&1) != 0 && to < u.list[i] +} + +func (u *UnicodeSet) findCodePoint(c rune) int { + /* Examples: + findCodePoint(c) + set list[] c=0 1 3 4 7 8 + === ============== =========== + [] [110000] 0 0 0 0 0 0 + [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 + [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 + [:Any:] [0, 110000] 1 1 1 1 1 1 + */ + + // Return the smallest i such that c < list[i]. Assume + // list[len - 1] == HIGH and that c is legal (0..HIGH-1). + if c < u.list[0] { + return 0 + } + + // High runner test. c is often after the last range, so an + // initial check for this condition pays off. + lo := 0 + hi := len(u.list) - 1 + if lo >= hi || c >= u.list[hi-1] { + return hi + } + + // invariant: c >= list[lo] + // invariant: c < list[hi] + for { + i := (lo + hi) >> 1 + if i == lo { + break // Found! 
+ } else if c < u.list[i] { + hi = i + } else { + lo = i + } + } + return hi +} + +func (u *UnicodeSet) AddString(chars string) { + for _, c := range chars { + u.AddRune(c) + } +} + +type Filter func(ch rune) bool + +func (u *UnicodeSet) ApplyFilter(inclusions *UnicodeSet, filter Filter) { + // Logically, walk through all Unicode characters, noting the start + // and end of each range for which filter.contain(c) is + // true. Add each range to a set. + // + // To improve performance, use an inclusions set which + // encodes information about character ranges that are known + // to have identical properties. + // inclusions contains the first characters of + // same-value ranges for the given property. + + u.Clear() + + startHasProperty := rune(-1) + limitRange := inclusions.RangeCount() + + for j := 0; j < limitRange; j++ { + // get current range + start := inclusions.RangeStart(j) + end := inclusions.RangeEnd(j) + + // for all the code points in the range, process + for ch := start; ch <= end; ch++ { + // only add to this UnicodeSet on inflection points -- + // where the hasProperty value changes to false + if filter(ch) { + if startHasProperty < 0 { + startHasProperty = ch + } + } else if startHasProperty >= 0 { + u.AddRuneRange(startHasProperty, ch-1) + startHasProperty = -1 + } + } + } + if startHasProperty >= 0 { + u.AddRuneRange(startHasProperty, 0x10FFFF) + } +} + +func (u *UnicodeSet) Clone() *UnicodeSet { + return &UnicodeSet{list: slices.Clone(u.list)} +} + +func (u *UnicodeSet) IsEmpty() bool { + return len(u.list) == 1 +} + +func (u *UnicodeSet) CopyFrom(set *UnicodeSet) { + if u.frozen != nil { + panic("UnicodeSet is frozen") + } + u.list = slices.Clone(set.list) +} + +func (u *UnicodeSet) Equals(other *UnicodeSet) bool { + return slices.Equal(u.list, other.list) +} + +func (u *UnicodeSet) Freeze() *UnicodeSet { + u.frozen = freeze(u.list) + return u +} + +func (u *UnicodeSet) FreezeCheck_() error { + if u == nil { + return nil + } + if u.frozen == nil { + return fmt.Errorf("UnicodeSet is not frozen") + } + for r := rune(0); r <= 0x10ffff; r++ { + want := (u.findCodePoint(r) & 1) != 0 + got := u.ContainsRune(r) + if want != got { + return fmt.Errorf("rune '%c' (U+%04X) did not freeze", r, r) + } + } + return nil +} diff --git a/go/mysql/icuregex/internal/uset/unicode_set_test.go b/go/mysql/icuregex/internal/uset/unicode_set_test.go new file mode 100644 index 00000000000..908abd8889d --- /dev/null +++ b/go/mysql/icuregex/internal/uset/unicode_set_test.go @@ -0,0 +1,43 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package uset + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestSimpleBelong(t *testing.T) { + ss1 := New() + ss1.AddString("*?+[(){}^$|\\.") + ss2 := New() + ss2.AddString("*?+[(){}^$|\\.") + ss2.Complement() + ss3 := New() + ss3.AddRune('*') + ss3.AddRune('?') + + assert.True(t, ss1.ContainsRune('(')) + assert.False(t, ss2.ContainsRune('(')) + assert.True(t, ss3.ContainsRune('*')) +} diff --git a/go/mysql/icuregex/internal/utf16/helpers.go b/go/mysql/icuregex/internal/utf16/helpers.go new file mode 100644 index 00000000000..bdf53ae731c --- /dev/null +++ b/go/mysql/icuregex/internal/utf16/helpers.go @@ -0,0 +1,65 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utf16 + +import "unicode/utf16" + +func IsLead(c rune) bool { + return (uint32(c) & 0xfffffc00) == 0xd800 +} + +func IsTrail(c rune) bool { + return (uint32(c) & 0xfffffc00) == 0xdc00 +} + +/** + * Is this code point a surrogate (U+d800..U+dfff)? + * @param c 32-bit code point + * @return true or false + * @stable ICU 2.4 + */ +func IsSurrogate(c rune) bool { + return (uint32(c) & 0xfffff800) == 0xd800 +} + +/** + * Assuming c is a surrogate code point (U_IS_SURROGATE(c)), + * is it a lead surrogate? + * @param c 32-bit code point + * @return true or false + * @stable ICU 2.4 + */ +func IsSurrogateLead(c rune) bool { + return (uint32(c) & 0x400) == 0 +} + +func DecodeRune(a, b rune) rune { + return utf16.DecodeRune(a, b) +} + +func NextUnsafe(s []uint16) (rune, []uint16) { + c := rune(s[0]) + if !IsLead(c) { + return c, s[1:] + } + return DecodeRune(c, rune(s[1])), s[2:] +} diff --git a/go/mysql/icuregex/internal/utrie/ucptrie.go b/go/mysql/icuregex/internal/utrie/ucptrie.go new file mode 100644 index 00000000000..74e4eb9b2fa --- /dev/null +++ b/go/mysql/icuregex/internal/utrie/ucptrie.go @@ -0,0 +1,708 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package utrie + +import ( + "errors" + "fmt" + + "vitess.io/vitess/go/mysql/icuregex/internal/udata" +) + +type UcpTrie struct { + index []uint16 + data8 []uint8 + data16 []uint16 + data32 []uint32 + + indexLength, dataLength int32 + /** Start of the last range which ends at U+10FFFF. @internal */ + highStart rune + shifted12HighStart uint16 + + typ ucpTrieType + valueWidth ucpTrieValueWidth + + /** + * Internal index-3 null block offset. + * Set to an impossibly high value (e.g., 0xffff) if there is no dedicated index-3 null block. + * @internal + */ + index3NullOffset uint16 + /** + * Internal data null block offset, not shifted. + * Set to an impossibly high value (e.g., 0xfffff) if there is no dedicated data null block. + * @internal + */ + dataNullOffset int32 + + nullValue uint32 +} + +/** + * Selectors for the type of a UCPTrie. + * Different trade-offs for size vs. speed. + * + * @see umutablecptrie_buildImmutable + * @see ucptrie_openFromBinary + * @see ucptrie_getType + * @stable ICU 63 + */ +type ucpTrieType int8 + +const ( + /** + * For ucptrie_openFromBinary() to accept any type. + * ucptrie_getType() will return the actual type. + * @stable ICU 63 + */ + typeAny ucpTrieType = iota - 1 + /** + * Fast/simple/larger BMP data structure. Use functions and "fast" macros. + * @stable ICU 63 + */ + typeFast + /** + * Small/slower BMP data structure. Use functions and "small" macros. + * @stable ICU 63 + */ + typeSmall +) + +/** + * Selectors for the number of bits in a UCPTrie data value. + * + * @see umutablecptrie_buildImmutable + * @see ucptrie_openFromBinary + * @see ucptrie_getValueWidth + * @stable ICU 63 + */ +type ucpTrieValueWidth int8 + +const ( + /** + * For ucptrie_openFromBinary() to accept any data value width. + * ucptrie_getValueWidth() will return the actual data value width. + * @stable ICU 63 + */ + valueBitsAny ucpTrieValueWidth = iota - 1 + /** + * The trie stores 16 bits per data value. + * It returns them as unsigned values 0..0xffff=65535. + * @stable ICU 63 + */ + valueBits16 + /** + * The trie stores 32 bits per data value. + * @stable ICU 63 + */ + valueBits32 + /** + * The trie stores 8 bits per data value. + * It returns them as unsigned values 0..0xff=255. + * @stable ICU 63 + */ + valueBits8 +) + +const ucpTrieSig = 0x54726933 +const ucpTrieOESig = 0x33697254 + +/** + * Constants for use with UCPTrieHeader.options. + * @internal + */ +const ( + optionsDataLengthMask = 0xf000 + optionsDataNullOffsetMask = 0xf00 + optionsReservedMask = 0x38 + optionsValueBitsMask = 7 +) + +const ( + /** @internal */ + fastShift = 6 + + /** Number of entries in a data block for code points below the fast limit. 64=0x40 @internal */ + fastDataBlockLength = 1 << fastShift + + /** Mask for getting the lower bits for the in-fast-data-block offset. @internal */ + fastDataMask = fastDataBlockLength - 1 + + /** @internal */ + smallMax = 0xfff + + /** + * Offset from dataLength (to be subtracted) for fetching the + * value returned for out-of-range code points and ill-formed UTF-8/16. + * @internal + */ + errorValueNegDataOffset = 1 + /** + * Offset from dataLength (to be subtracted) for fetching the + * value returned for code points highStart..U+10FFFF. + * @internal + */ + highValueNegDataOffset = 2 +) + +// Internal constants. +const ( + /** The length of the BMP index table. 1024=0x400 */ + bmpIndexLength = 0x10000 >> fastShift + + smallLimit = 0x1000 + smallIndexLength = smallLimit >> fastShift + + /** Shift size for getting the index-3 table offset. 
*/ + ucpShift3 = 4 + + /** Shift size for getting the index-2 table offset. */ + ucpShift2 = 5 + ucpShift3 + + /** Shift size for getting the index-1 table offset. */ + ucpShift1 = 5 + ucpShift2 + + /** + * Difference between two shift sizes, + * for getting an index-2 offset from an index-3 offset. 5=9-4 + */ + ucpShift2Min3 = ucpShift2 - ucpShift3 + + /** + * Difference between two shift sizes, + * for getting an index-1 offset from an index-2 offset. 5=14-9 + */ + ucpShift1Min2 = ucpShift1 - ucpShift2 + + /** + * Number of index-1 entries for the BMP. (4) + * This part of the index-1 table is omitted from the serialized form. + */ + ucpOmittedBmpIndex1Length = 0x10000 >> ucpShift1 + + /** Number of entries in an index-2 block. 32=0x20 */ + ucpIndex2BlockLength = 1 << ucpShift1Min2 + + /** Mask for getting the lower bits for the in-index-2-block offset. */ + ucpIndex2Mask = ucpIndex2BlockLength - 1 + + /** Number of code points per index-2 table entry. 512=0x200 */ + ucpCpPerIndex2Entry = 1 << ucpShift2 + + /** Number of entries in an index-3 block. 32=0x20 */ + ucpIndex3BlockLength = 1 << ucpShift2Min3 + + /** Mask for getting the lower bits for the in-index-3-block offset. */ + ucpIndex3Mask = ucpIndex3BlockLength - 1 + + /** Number of entries in a small data block. 16=0x10 */ + ucpSmallDataBlockLength = 1 << ucpShift3 + + /** Mask for getting the lower bits for the in-small-data-block offset. */ + ucpSmallDataMask = ucpSmallDataBlockLength - 1 +) + +func UcpTrieFromBytes(bytes *udata.Bytes) (*UcpTrie, error) { + type ucpHeader struct { + /** "Tri3" in big-endian US-ASCII (0x54726933) */ + signature uint32 + + /** + * Options bit field: + * Bits 15..12: Data length bits 19..16. + * Bits 11..8: Data null block offset bits 19..16. + * Bits 7..6: UCPTrieType + * Bits 5..3: Reserved (0). + * Bits 2..0: UCPTrieValueWidth + */ + options uint16 + + /** Total length of the index tables. */ + indexLength uint16 + + /** Data length bits 15..0. */ + dataLength uint16 + + /** Index-3 null block offset, 0x7fff or 0xffff if none. */ + index3NullOffset uint16 + + /** Data null block offset bits 15..0, 0xfffff if none. */ + dataNullOffset uint16 + + /** + * First code point of the single-value range ending with U+10ffff, + * rounded up and then shifted right by UCPTRIE_SHIFT_2. 
+ */ + shiftedHighStart uint16 + } + + var header ucpHeader + header.signature = bytes.Uint32() + + switch header.signature { + case ucpTrieSig: + case ucpTrieOESig: + return nil, errors.New("unsupported: BigEndian encoding") + default: + return nil, fmt.Errorf("invalid signature for UcpTrie: 0x%08x", header.signature) + } + + header.options = bytes.Uint16() + header.indexLength = bytes.Uint16() + header.dataLength = bytes.Uint16() + header.index3NullOffset = bytes.Uint16() + header.dataNullOffset = bytes.Uint16() + header.shiftedHighStart = bytes.Uint16() + + typeInt := (header.options >> 6) & 3 + valueWidthInt := header.options & optionsValueBitsMask + if typeInt > uint16(typeSmall) || valueWidthInt > uint16(valueBits8) || + (header.options&optionsReservedMask) != 0 { + return nil, errors.New("invalid options for serialized UcpTrie") + } + actualType := ucpTrieType(typeInt) + actualValueWidth := ucpTrieValueWidth(valueWidthInt) + + trie := &UcpTrie{ + indexLength: int32(header.indexLength), + dataLength: int32(((header.options & optionsDataLengthMask) << 4) | header.dataLength), + index3NullOffset: header.index3NullOffset, + dataNullOffset: int32(((header.options & optionsDataNullOffsetMask) << 8) | header.dataNullOffset), + highStart: rune(header.shiftedHighStart) << ucpShift2, + typ: actualType, + valueWidth: actualValueWidth, + } + nullValueOffset := trie.dataNullOffset + if nullValueOffset >= trie.dataLength { + nullValueOffset = trie.dataLength - highValueNegDataOffset + } + + trie.shifted12HighStart = uint16((trie.highStart + 0xfff) >> 12) + trie.index = bytes.Uint16Slice(int32(header.indexLength)) + switch actualValueWidth { + case valueBits16: + trie.data16 = bytes.Uint16Slice(trie.dataLength) + trie.nullValue = uint32(trie.data16[nullValueOffset]) + case valueBits32: + trie.data32 = bytes.Uint32Slice(trie.dataLength) + trie.nullValue = trie.data32[nullValueOffset] + case valueBits8: + trie.data8 = bytes.Uint8Slice(trie.dataLength) + trie.nullValue = uint32(trie.data8[nullValueOffset]) + } + + return trie, nil +} + +func (t *UcpTrie) Get(c rune) uint32 { + var dataIndex int32 + if c <= 0x7f { + // linear ASCII + dataIndex = c + } else { + var fastMax rune + if t.typ == typeFast { + fastMax = 0xffff + } else { + fastMax = smallMax + } + dataIndex = t.cpIndex(fastMax, c) + } + return t.getValue(dataIndex) +} + +func (t *UcpTrie) getValue(dataIndex int32) uint32 { + switch t.valueWidth { + case valueBits16: + return uint32(t.data16[dataIndex]) + case valueBits32: + return t.data32[dataIndex] + case valueBits8: + return uint32(t.data8[dataIndex]) + default: + // Unreachable if the trie is properly initialized. + return 0xffffffff + } +} + +/** Internal trie getter for a code point below the fast limit. Returns the data index. @internal */ +func (t *UcpTrie) fastIndex(c rune) int32 { + return int32(t.index[c>>fastShift]) + (c & fastDataMask) +} + +/** Internal trie getter for a code point at or above the fast limit. Returns the data index. 
@internal */ +func (t *UcpTrie) smallIndex(c rune) int32 { + if c >= t.highStart { + return t.dataLength - highValueNegDataOffset + } + return t.internalSmallIndex(c) +} + +func (t *UcpTrie) internalSmallIndex(c rune) int32 { + i1 := c >> ucpShift1 + if t.typ == typeFast { + i1 += bmpIndexLength - ucpOmittedBmpIndex1Length + } else { + i1 += smallIndexLength + } + i3Block := int32(t.index[int32(t.index[i1])+((c>>ucpShift2)&ucpIndex2Mask)]) + i3 := (c >> ucpShift3) & ucpIndex3Mask + var dataBlock int32 + if (i3Block & 0x8000) == 0 { + // 16-bit indexes + dataBlock = int32(t.index[i3Block+i3]) + } else { + // 18-bit indexes stored in groups of 9 entries per 8 indexes. + i3Block = (i3Block & 0x7fff) + (i3 & ^7) + (i3 >> 3) + i3 &= 7 + dataBlock = int32(t.index[i3Block]) << (2 + (2 * i3)) & 0x30000 + i3Block++ + dataBlock |= int32(t.index[i3Block+i3]) + } + return dataBlock + (c & ucpSmallDataMask) +} + +/** + * Internal trie getter for a code point, with checking that c is in U+0000..10FFFF. + * Returns the data index. + * @internal + */ +func (t *UcpTrie) cpIndex(fastMax, c rune) int32 { + if c <= fastMax { + return t.fastIndex(c) + } + if c <= 0x10ffff { + return t.smallIndex(c) + } + return t.dataLength - errorValueNegDataOffset +} + +/** + * Selectors for how ucpmap_getRange() etc. should report value ranges overlapping with surrogates. + * Most users should use UCPMAP_RANGE_NORMAL. + * + * @see ucpmap_getRange + * @see ucptrie_getRange + * @see umutablecptrie_getRange + * @stable ICU 63 + */ +type UcpMapRangeOption int8 + +const ( + /** + * ucpmap_getRange() enumerates all same-value ranges as stored in the map. + * Most users should use this option. + * @stable ICU 63 + */ + UcpMapRangeNormal UcpMapRangeOption = iota + /** + * ucpmap_getRange() enumerates all same-value ranges as stored in the map, + * except that lead surrogates (U+D800..U+DBFF) are treated as having the + * surrogateValue, which is passed to getRange() as a separate parameter. + * The surrogateValue is not transformed via filter(). + * See U_IS_LEAD(c). + * + * Most users should use UCPMAP_RANGE_NORMAL instead. + * + * This option is useful for maps that map surrogate code *units* to + * special values optimized for UTF-16 string processing + * or for special error behavior for unpaired surrogates, + * but those values are not to be associated with the lead surrogate code *points*. + * @stable ICU 63 + */ + UcpMapRangeFixedLeadSurrogates + /** + * ucpmap_getRange() enumerates all same-value ranges as stored in the map, + * except that all surrogates (U+D800..U+DFFF) are treated as having the + * surrogateValue, which is passed to getRange() as a separate parameter. + * The surrogateValue is not transformed via filter(). + * See U_IS_SURROGATE(c). + * + * Most users should use UCPMAP_RANGE_NORMAL instead. + * + * This option is useful for maps that map surrogate code *units* to + * special values optimized for UTF-16 string processing + * or for special error behavior for unpaired surrogates, + * but those values are not to be associated with the lead surrogate code *points*. + * @stable ICU 63 + */ + UcpMapRangeFixedAllSurrogates +) + +/** + * Callback function type: Modifies a map value. + * Optionally called by ucpmap_getRange()/ucptrie_getRange()/umutablecptrie_getRange(). + * The modified value will be returned by the getRange function. + * + * Can be used to ignore some of the value bits, + * make a filter for one of several values, + * return a value index computed from the map value, etc. 
+ * + * @param context an opaque pointer, as passed into the getRange function + * @param value a value from the map + * @return the modified value + * @stable ICU 63 + */ +type UcpMapValueFilter func(value uint32) uint32 + +/** + * GetRange returns the last code point such that all those from start to there have the same value. + * Can be used to efficiently iterate over all same-value ranges in a trie. + * (This is normally faster than iterating over code points and get()ting each value, + * but much slower than a data structure that stores ranges directly.) + * + * If the UCPMapValueFilter function pointer is not NULL, then + * the value to be delivered is passed through that function, and the return value is the end + * of the range where all values are modified to the same actual value. + * The value is unchanged if that function pointer is NULL. + * + * Example: + * \code + * UChar32 start = 0, end; + * uint32_t value; + * while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0, + * NULL, NULL, &value)) >= 0) { + * // Work with the range start..end and its value. + * start = end + 1; + * } + * \endcode + * + * @param trie the trie + * @param start range start + * @param option defines whether surrogates are treated normally, + * or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL + * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL + * @param filter a pointer to a function that may modify the trie data value, + * or NULL if the values from the trie are to be used unmodified + * @param context an opaque pointer that is passed on to the filter function + * @param pValue if not NULL, receives the value that every code point start..end has; + * may have been modified by filter(context, trie value) + * if that function pointer is not NULL + * @return the range end code point, or -1 if start is not a valid code point + * @stable ICU 63 + */ +func (t *UcpTrie) GetRange(start rune, option UcpMapRangeOption, surrogateValue uint32, filter UcpMapValueFilter) (rune, uint32) { + if option == UcpMapRangeNormal { + return t.getRange(start, filter) + } + + var surrEnd rune + if option == UcpMapRangeFixedAllSurrogates { + surrEnd = 0xdfff + } else { + surrEnd = 0xdbff + } + end, value := t.getRange(start, filter) + if end < 0xd7ff || start > surrEnd { + return end, value + } + if value == surrogateValue { + if end >= surrEnd { + // Surrogates followed by a non-surrogateValue range, + // or surrogates are part of a larger surrogateValue range. + return end, value + } + } else { + if start <= 0xd7ff { + return 0xd7ff, value // Non-surrogateValue range ends before surrogateValue surrogates. + } + // Start is a surrogate with a non-surrogateValue code *unit* value. + // Return a surrogateValue code *point* range. + value = surrogateValue + if end > surrEnd { + return surrEnd, value // Surrogate range ends before non-surrogateValue rest of range. + } + } + // See if the surrogateValue surrogate range can be merged with + // an immediately following range. 
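The \code sample in the comment above is C-flavoured; in this Go port, GetRange reports the range end and its value as return values rather than through out-parameters. A minimal sketch of the same iteration, assuming it lives in this package and that `trie` is a *UcpTrie obtained elsewhere (the helper name is illustrative only):

```go
// rangesOf collects every same-value range of a UcpTrie, mirroring the
// C \code sample above in Go form.
func rangesOf(trie *UcpTrie) (ranges [][2]rune, values []uint32) {
	start := rune(0)
	for {
		end, value := trie.GetRange(start, UcpMapRangeNormal, 0, nil)
		if end < 0 {
			break // start was past U+10FFFF: enumeration is complete
		}
		ranges = append(ranges, [2]rune{start, end})
		values = append(values, value)
		start = end + 1
	}
	return ranges, values
}
```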
+ end2, value2 := t.getRange(surrEnd+1, filter) + if value2 == surrogateValue { + return end2, value + } + return surrEnd, value +} + +const maxUnicode = 0x10ffff + +func (t *UcpTrie) getRange(start rune, filter UcpMapValueFilter) (rune, uint32) { + if start > maxUnicode { + return -1, 0 + } + + if start >= t.highStart { + di := t.dataLength - highValueNegDataOffset + value := t.getValue(di) + if filter != nil { + value = filter(value) + } + return maxUnicode, value + } + + nullValue := t.nullValue + if filter != nil { + nullValue = filter(nullValue) + } + index := t.index + + prevI3Block := int32(-1) + prevBlock := int32(-1) + c := start + var trieValue uint32 + value := nullValue + haveValue := false + for { + var i3Block, i3, i3BlockLength, dataBlockLength int32 + if c <= 0xffff && (t.typ == typeFast || c <= smallMax) { + i3Block = 0 + i3 = c >> fastShift + if t.typ == typeFast { + i3BlockLength = bmpIndexLength + } else { + i3BlockLength = smallIndexLength + } + dataBlockLength = fastDataBlockLength + } else { + // Use the multi-stage index. + i1 := c >> ucpShift1 + if t.typ == typeFast { + i1 += bmpIndexLength - ucpOmittedBmpIndex1Length + } else { + i1 += smallIndexLength + } + shft := c >> ucpShift2 + idx := int32(t.index[i1]) + (shft & ucpIndex2Mask) + i3Block = int32(t.index[idx]) + if i3Block == prevI3Block && (c-start) >= ucpCpPerIndex2Entry { + // The index-3 block is the same as the previous one, and filled with value. + c += ucpCpPerIndex2Entry + continue + } + prevI3Block = i3Block + if i3Block == int32(t.index3NullOffset) { + // This is the index-3 null block. + if haveValue { + if nullValue != value { + return c - 1, value + } + } else { + trieValue = t.nullValue + value = nullValue + haveValue = true + } + prevBlock = t.dataNullOffset + c = (c + ucpCpPerIndex2Entry) & ^(ucpCpPerIndex2Entry - 1) + continue + } + i3 = (c >> ucpShift3) & ucpIndex3Mask + i3BlockLength = ucpIndex3BlockLength + dataBlockLength = ucpSmallDataBlockLength + } + + // Enumerate data blocks for one index-3 block. + for { + var block int32 + if (i3Block & 0x8000) == 0 { + block = int32(index[i3Block+i3]) + } else { + // 18-bit indexes stored in groups of 9 entries per 8 indexes. + group := (i3Block & 0x7fff) + (i3 & ^7) + (i3 >> 3) + gi := i3 & 7 + block = (int32(index[group]) << (2 + (2 * gi))) & 0x30000 + group++ + block |= int32(index[group+gi]) + } + if block == prevBlock && (c-start) >= dataBlockLength { + // The block is the same as the previous one, and filled with value. + c += dataBlockLength + } else { + dataMask := dataBlockLength - 1 + prevBlock = block + if block == t.dataNullOffset { + // This is the data null block. 
+ if haveValue { + if nullValue != value { + return c - 1, value + } + } else { + trieValue = t.nullValue + value = nullValue + haveValue = true + } + c = (c + dataBlockLength) & ^dataMask + } else { + di := block + (c & dataMask) + trieValue2 := t.getValue(di) + if haveValue { + if trieValue2 != trieValue { + if filter == nil || maybeFilterValue(trieValue2, t.nullValue, nullValue, filter) != value { + return c - 1, value + } + trieValue = trieValue2 // may or may not help + } + } else { + trieValue = trieValue2 + value = maybeFilterValue(trieValue2, t.nullValue, nullValue, filter) + haveValue = true + } + for { + c++ + if c&dataMask == 0 { + break + } + di++ + trieValue2 = t.getValue(di) + if trieValue2 != trieValue { + if filter == nil || maybeFilterValue(trieValue2, t.nullValue, nullValue, filter) != value { + return c - 1, value + } + trieValue = trieValue2 // may or may not help + } + } + } + } + i3++ + if i3 >= i3BlockLength { + break + } + } + if c >= t.highStart { + break + } + } + + di := t.dataLength - highValueNegDataOffset + highValue := t.getValue(di) + if maybeFilterValue(highValue, t.nullValue, nullValue, filter) != value { + return c - 1, value + } + return maxUnicode, value +} + +func maybeFilterValue(value uint32, trieNullValue uint32, nullValue uint32, filter UcpMapValueFilter) uint32 { + if value == trieNullValue { + value = nullValue + } else if filter != nil { + value = filter(value) + } + return value +} diff --git a/go/mysql/icuregex/internal/utrie/utrie2.go b/go/mysql/icuregex/internal/utrie/utrie2.go new file mode 100644 index 00000000000..a2c80cf1c50 --- /dev/null +++ b/go/mysql/icuregex/internal/utrie/utrie2.go @@ -0,0 +1,440 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package utrie + +import ( + "errors" + "fmt" + + "vitess.io/vitess/go/mysql/icuregex/internal/udata" + "vitess.io/vitess/go/mysql/icuregex/internal/utf16" +) + +type UTrie2 struct { + index []uint16 + data16 []uint16 + data32 []uint32 + + indexLength, dataLength int + index2NullOffset uint16 + dataNullOffset uint16 + InitialValue uint32 + ErrorValue uint32 + + HighStart rune + HighValueIndex int +} + +func (t *UTrie2) SerializedLength() int32 { + return 16 + int32(t.indexLength+t.dataLength)*2 +} + +func (t *UTrie2) getIndex(asciiOffset int, c rune) uint16 { + return t.index[t.indexFromCp(asciiOffset, c)] +} + +func (t *UTrie2) Get16(c rune) uint16 { + return t.getIndex(t.indexLength, c) +} + +func (t *UTrie2) indexFromCp(asciiOffset int, c rune) int { + switch { + case c < 0xd800: + return indexRaw(0, t.index, c) + case c <= 0xffff: + var offset int32 + if c <= 0xdbff { + offset = lscpIndex2Offset - (0xd800 >> shift2) + } + return indexRaw(offset, t.index, c) + case c > 0x10ffff: + return asciiOffset + badUtf8DataOffset + case c >= t.HighStart: + return t.HighValueIndex + default: + return indexFromSupp(t.index, c) + } +} + +type EnumRange func(start, end rune, value uint32) bool +type EnumValue func(value uint32) uint32 + +func (t *UTrie2) Enum(enumValue EnumValue, enumRange EnumRange) { + t.enumEitherTrie(0, 0x110000, enumValue, enumRange) +} + +func enumSameValue(value uint32) uint32 { + return value +} + +func min(a, b rune) rune { + if a < b { + return a + } + return b +} + +func (t *UTrie2) enumEitherTrie(start, limit rune, enumValue EnumValue, enumRange EnumRange) { + if enumRange == nil { + return + } + if enumValue == nil { + enumValue = enumSameValue + } + + /* frozen trie */ + var ( + idx = t.index + data32 = t.data32 + index2NullOffset = int(t.index2NullOffset) + nullBlock = int(t.dataNullOffset) + + c rune + prev = start + highStart = t.HighStart + + /* get the enumeration value that corresponds to an initial-value trie data entry */ + initialValue = enumValue(t.InitialValue) + + /* set variables for previous range */ + i2Block int + block int + prevI2Block = -1 + prevBlock = -1 + prevValue = uint32(0) + ) + + /* enumerate index-2 blocks */ + for c = start; c < limit && c < highStart; { + /* Code point limit for iterating inside this i2Block. */ + tempLimit := c + cpPerIndex1Entry + if limit < tempLimit { + tempLimit = limit + } + if c <= 0xffff { + if !utf16.IsSurrogate(c) { + i2Block = int(c >> shift2) + } else if utf16.IsSurrogateLead(c) { + /* + * Enumerate values for lead surrogate code points, not code units: + * This special block has half the normal length. + */ + i2Block = lscpIndex2Offset + tempLimit = min(0xdc00, limit) + } else { + /* + * Switch back to the normal part of the index-2 table. + * Enumerate the second half of the surrogates block. + */ + i2Block = 0xd800 >> shift2 + tempLimit = min(0xe000, limit) + } + } else { + /* supplementary code points */ + i2Block = int(idx[(index1Offset-omittedBmpIndex1Length)+(c>>shift1)]) + if i2Block == prevI2Block && (c-prev) >= cpPerIndex1Entry { + /* + * The index-2 block is the same as the previous one, and filled with prevValue. + * Only possible for supplementary code points because the linear-BMP index-2 + * table creates unique i2Block values. 
+				 */
+				c += cpPerIndex1Entry
+				continue
+			}
+		}
+		prevI2Block = i2Block
+		if i2Block == index2NullOffset {
+			/* this is the null index-2 block */
+			if prevValue != initialValue {
+				if prev < c && !enumRange(prev, c-1, prevValue) {
+					return
+				}
+				prevBlock = nullBlock
+				prev = c
+				prevValue = initialValue
+			}
+			c += cpPerIndex1Entry
+		} else {
+			/* enumerate data blocks for one index-2 block */
+			var i2Limit int
+			if (c >> shift1) == (tempLimit >> shift1) {
+				i2Limit = int(tempLimit>>shift2) & index2Mask
+			} else {
+				i2Limit = index2BlockLength
+			}
+			for i2 := int(c>>shift2) & index2Mask; i2 < i2Limit; i2++ {
+				block = int(idx[i2Block+i2] << indexShift)
+				if block == prevBlock && (c-prev) >= dataBlockLength {
+					/* the block is the same as the previous one, and filled with prevValue */
+					c += dataBlockLength
+					continue
+				}
+				prevBlock = block
+				if block == nullBlock {
+					/* this is the null data block */
+					if prevValue != initialValue {
+						if prev < c && !enumRange(prev, c-1, prevValue) {
+							return
+						}
+						prev = c
+						prevValue = initialValue
+					}
+					c += dataBlockLength
+				} else {
+					for j := 0; j < dataBlockLength; j++ {
+						var value uint32
+						if data32 != nil {
+							value = data32[block+j]
+						} else {
+							value = uint32(idx[block+j])
+						}
+						value = enumValue(value)
+						if value != prevValue {
+							if prev < c && !enumRange(prev, c-1, prevValue) {
+								return
+							}
+							prev = c
+							prevValue = value
+						}
+						c++
+					}
+				}
+			}
+		}
+	}
+
+	if c > limit {
+		c = limit /* could be higher if in the index2NullOffset */
+	} else if c < limit {
+		/* c==highStart<limit */
+		var highValue uint32
+		if data32 != nil {
+			highValue = data32[t.HighValueIndex]
+		} else {
+			highValue = uint32(idx[t.HighValueIndex])
+		}
+		value := enumValue(highValue)
+		if value != prevValue {
+			if prev < c && !enumRange(prev, c-1, prevValue) {
+				return
+			}
+			prev = c
+			prevValue = value
+		}
+		c = limit
+	}
+
+	/* deliver last range */
+	enumRange(prev, c-1, prevValue)
+}
+
+func indexFromSupp(index []uint16, c rune) int {
+	i1 := int(index[(index1Offset-omittedBmpIndex1Length)+(c>>shift1)])
+	return (int(index[i1+int((c>>shift2)&index2Mask)]) << indexShift) + int(c&dataMask)
+}
+
+func indexRaw(offset int32, index []uint16, c rune) int {
+	return int(index[offset+(c>>shift2)]<<indexShift) + int(c&dataMask)
+}
+
+const (
+	/** Shift size for getting the index-1 table offset. */
+	shift1 = 6 + 5
+
+	/** Shift size for getting the index-2 table offset. */
+	shift2 = 5
+
+	/**
+	 * Difference between the two shift sizes,
+	 * for getting an index-1 offset from an index-2 offset. 6=11-5
+	 */
+	shift1min2 = shift1 - shift2
+
+	/**
+	 * Number of index-1 entries for the BMP. 32=0x20
+	 * This part of the index-1 table is omitted from the serialized form.
+	 */
+	omittedBmpIndex1Length = 0x10000 >> shift1
+
+	/** Number of code points per index-1 table entry. 2048=0x800 */
+	cpPerIndex1Entry = 1 << shift1
+
+	/** Number of entries in an index-2 block. 64=0x40 */
+	index2BlockLength = 1 << shift1min2
+
+	/** Mask for getting the lower bits for the in-index-2-block offset. */
+	index2Mask = index2BlockLength - 1
+
+	/** Number of entries in a data block. 32=0x20 */
+	dataBlockLength = 1 << shift2
+
+	/** Mask for getting the lower bits for the in-data-block offset. */
+	dataMask = dataBlockLength - 1
+
+	/**
+	 * Shift size for shifting left the index array values.
+	 * Increases possible data size with 16-bit index values at the cost
+	 * of compactability.
+	 * This requires data blocks to be aligned by UTRIE2_DATA_GRANULARITY.
+	 */
+	indexShift = 2
+
+	/** The alignment size of a data block. Also the granularity for compaction. */
+	dataGranularity = 1 << indexShift
+
+	/* Fixed layout of the first part of the index array. ------------------- */
+
+	/**
+	 * The part of the index-2 table for U+D800..U+DBFF stores values for
+	 * lead surrogate code _units_ not code _points_.
+	 * Values for lead surrogate code _points_ are indexed with this portion of the table.
+	 * Length=32=0x20=0x400>>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.)
+	 */
+	lscpIndex2Offset = 0x10000 >> shift2
+	lscpIndex2Length = 0x400 >> shift2
+
+	/** Count the lengths of both BMP pieces. 2080=0x820 */
+	index2BmpLength = lscpIndex2Offset + lscpIndex2Length
+
+	/**
+	 * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820.
+	 * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2.
+ */ + utf82BIndex2Offset = index2BmpLength + utf82BIndex2Length = 0x800 >> 6 /* U+0800 is the first code point after 2-byte UTF-8 */ + + /** + * The index-1 table, only used for supplementary code points, at offset 2112=0x840. + * Variable length, for code points up to highStart, where the last single-value range starts. + * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1. + * (For 0x100000 supplementary code points U+10000..U+10ffff.) + * + * The part of the index-2 table for supplementary code points starts + * after this index-1 table. + * + * Both the index-1 table and the following part of the index-2 table + * are omitted completely if there is only BMP data. + */ + index1Offset = utf82BIndex2Offset + utf82BIndex2Length + maxIndex1Length = 0x100000 >> shift1 + + /* + * Fixed layout of the first part of the data array. ----------------------- + * Starts with 4 blocks (128=0x80 entries) for ASCII. + */ + + /** + * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80. + * Used with linear access for single bytes 0..0xbf for simple error handling. + * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH. + */ + badUtf8DataOffset = 0x80 +) + +func UTrie2FromBytes(bytes *udata.Bytes) (*UTrie2, error) { + type utrie2Header struct { + /** "Tri2" in big-endian US-ASCII (0x54726932) */ + signature uint32 + + /** + * options bit field: + * 15.. 4 reserved (0) + * 3.. 0 UTrie2ValueBits valueBits + */ + options uint16 + + /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */ + indexLength uint16 + + /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */ + shiftedDataLength uint16 + + /** Null index and data blocks, not shifted. */ + index2NullOffset, dataNullOffset uint16 + + /** + * First code point of the single-value range ending with U+10ffff, + * rounded up and then shifted right by UTRIE2_SHIFT_1. 
+ */ + shiftedHighStart uint16 + } + + var header utrie2Header + header.signature = bytes.Uint32() + + switch header.signature { + case 0x54726932: + case 0x32697254: + return nil, errors.New("unsupported: BigEndian encoding") + default: + return nil, fmt.Errorf("invalid signature for Trie2: 0x%08x", header.signature) + } + + header.options = bytes.Uint16() + header.indexLength = bytes.Uint16() + header.shiftedDataLength = bytes.Uint16() + header.index2NullOffset = bytes.Uint16() + header.dataNullOffset = bytes.Uint16() + header.shiftedHighStart = bytes.Uint16() + + var width int + switch header.options & 0xf { + case 0: + width = 16 + case 1: + width = 32 + default: + return nil, errors.New("invalid width for serialized UTrie2") + } + + trie := &UTrie2{ + indexLength: int(header.indexLength), + dataLength: int(header.shiftedDataLength) << indexShift, + index2NullOffset: header.index2NullOffset, + dataNullOffset: header.dataNullOffset, + HighStart: rune(header.shiftedHighStart) << shift1, + } + + trie.HighValueIndex = trie.dataLength - dataGranularity + if width == 16 { + trie.HighValueIndex += trie.indexLength + } + + indexArraySize := trie.indexLength + if width == 16 { + indexArraySize += trie.dataLength + } + + trie.index = bytes.Uint16Slice(int32(indexArraySize)) + + if width == 16 { + trie.data16 = trie.index[trie.indexLength:] + trie.InitialValue = uint32(trie.index[trie.dataNullOffset]) + trie.ErrorValue = uint32(trie.index[trie.indexLength+badUtf8DataOffset]) + } else { + trie.data32 = bytes.Uint32Slice(int32(trie.dataLength)) + trie.InitialValue = trie.data32[trie.dataNullOffset] + trie.ErrorValue = trie.data32[badUtf8DataOffset] + } + + return trie, nil +} diff --git a/go/mysql/icuregex/matcher.go b/go/mysql/icuregex/matcher.go new file mode 100644 index 00000000000..11fbc152d73 --- /dev/null +++ b/go/mysql/icuregex/matcher.go @@ -0,0 +1,1655 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex + +import ( + "fmt" + "io" + + "vitess.io/vitess/go/mysql/icuregex/internal/ucase" + "vitess.io/vitess/go/mysql/icuregex/internal/uchar" + "vitess.io/vitess/go/mysql/icuregex/internal/uprops" +) + +const timerInitialValue = 10000 +const defaultTimeout = 3 +const defaultStackLimit = 0 + +type Matcher struct { + pattern *Pattern + + input []rune + + regionStart int // Start of the input region, default = 0. + regionLimit int // End of input region, default to input.length. + + anchorStart int // Region bounds for anchoring operations (^ or $). + anchorLimit int // See useAnchoringBounds + + lookStart int // Region bounds for look-ahead/behind and + lookLimit int // and other boundary tests. 
See + // useTransparentBounds + + activeStart int // Currently active bounds for matching. + activeLimit int // Usually is the same as region, but + // is changed to fLookStart/Limit when + // entering look around regions. + + match bool // True if the last attempted match was successful. + matchStart int // Position of the start of the most recent match + matchEnd int // First position after the end of the most recent match + // Zero if no previous match, even when a region + // is active. + lastMatchEnd int // First position after the end of the previous match, + // or -1 if there was no previous match. + appendPosition int // First position after the end of the previous + // appendReplacement(). As described by the + // JavaDoc for Java Matcher, where it is called + // "append position" + hitEnd bool // True if the last match touched the end of input. + requireEnd bool // True if the last match required end-of-input + // (matched $ or Z) + + stack stack + frame stackFrame // After finding a match, the last active stack frame, + // which will contain the capture group results. + // NOT valid while match engine is running. + + data []int // Data area for use by the compiled pattern. + + timeLimit int32 // Max time (in arbitrary steps) to let the + // match engine run. Zero for unlimited. + + time int32 // Match time, accumulates while matching. + tickCounter int32 // Low bits counter for time. Counts down StateSaves. + // Kept separately from fTime to keep as much + // code as possible out of the inline + // StateSave function. + + dumper io.Writer +} + +func NewMatcher(pat *Pattern) *Matcher { + m := &Matcher{ + pattern: pat, + data: make([]int, pat.dataSize), + stack: stack{ + frameSize: pat.frameSize, + stackLimit: defaultStackLimit, + }, + timeLimit: defaultTimeout, + } + m.reset() + return m +} + +func (m *Matcher) MatchAt(startIdx int, toEnd bool) error { + //-------------------------------------------------------------------------------- + // + // MatchAt This is the actual matching engine. + // + // startIdx: begin matching a this index. + // toEnd: if true, match must extend to end of the input region + // + //-------------------------------------------------------------------------------- + var err error + var isMatch bool // True if the we have a match. + + if m.dumper != nil { + fmt.Fprintf(m.dumper, "MatchAt(startIdx=%d)\n", startIdx) + fmt.Fprintf(m.dumper, "Original Pattern: \"%s\"\n", m.pattern.pattern) + fmt.Fprintf(m.dumper, "Input String: \"%s\"\n\n", string(m.input)) + } + + pat := m.pattern.compiledPat + inputText := m.input + litText := m.pattern.literalText + sets := m.pattern.sets + + fp := m.resetStack() + *fp.inputIdx() = startIdx + *fp.patIdx() = 0 + for i := 0; i < len(m.data); i++ { + m.data[i] = 0 + } + + for { + op := pat[*fp.patIdx()] + + if m.dumper != nil { + fmt.Fprintf(m.dumper, "inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", *fp.inputIdx(), + charAt(inputText, *fp.inputIdx()), m.stack.sp(), m.activeLimit) + m.pattern.dumpOp(m.dumper, *fp.patIdx()) + } + + *fp.patIdx()++ + + switch op.typ() { + case urxNop: + // Nothing to do. + case urxBacktrack: + // Force a backtrack. In some circumstances, the pattern compiler + // will notice that the pattern can't possibly match anything, and will + // emit one of these at that point. 
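Most of the opcodes below lean on one discipline: before committing to an alternative, the engine saves the current input and pattern positions (stateSave), and every failing path pops back to the most recent save point (popFrame). A stripped-down, standalone sketch of that idea; the engine's real frames live in a flat int stack and also carry capture-group slots:

```go
// Simplified illustration of the save/backtrack discipline used by the
// opcode loop; not the engine's actual frame layout.
type savePoint struct {
	inputIdx int // where to resume reading input
	patIdx   int // which opcode to resume at
}

type backtrackStack struct {
	frames []savePoint
}

// save records a decision point that a later failure can return to.
func (s *backtrackStack) save(inputIdx, patIdx int) {
	s.frames = append(s.frames, savePoint{inputIdx, patIdx})
}

// fail unwinds to the most recent save point; ok is false when no
// alternatives remain, i.e. the overall match attempt has failed.
func (s *backtrackStack) fail() (sp savePoint, ok bool) {
	if len(s.frames) == 0 {
		return savePoint{}, false
	}
	sp = s.frames[len(s.frames)-1]
	s.frames = s.frames[:len(s.frames)-1]
	return sp, true
}
```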
+ fp = m.stack.popFrame() + case urxOnechar: + if *fp.inputIdx() < m.activeLimit { + c := charAt(inputText, *fp.inputIdx()) + *fp.inputIdx()++ + if c == rune(op.value()) { + break + } + } else { + m.hitEnd = true + } + fp = m.stack.popFrame() + case urxString: + // Test input against a literal string. + // Strings require two slots in the compiled pattern, one for the + // offset to the string text, and one for the length. + stringStartIdx := op.value() + nextOp := pat[*fp.patIdx()] // Fetch the second operand + *fp.patIdx()++ + stringLen := nextOp.value() + + patternString := litText[stringStartIdx:] + var patternStringIndex int + success := true + for patternStringIndex < stringLen { + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + success = false + break + } + if charAt(patternString, patternStringIndex) != charAt(inputText, *fp.inputIdx()) { + success = false + break + } + patternStringIndex++ + *fp.inputIdx()++ + } + + if !success { + fp = m.stack.popFrame() + } + case urxStateSave: + fp, err = m.stateSave(*fp.inputIdx(), op.value()) + if err != nil { + return err + } + case urxEnd: + // The match loop will exit via this path on a successful match, + // when we reach the end of the pattern. + if toEnd && *fp.inputIdx() != m.activeLimit { + // The pattern matched, but not to the end of input. Try some more. + fp = m.stack.popFrame() + break + } + isMatch = true + goto breakFromLoop + + // Start and End Capture stack frame variables are laid out out like this: + // fp->fExtra[opValue] - The start of a completed capture group + // opValue+1 - The end of a completed capture group + // opValue+2 - the start of a capture group whose end + // has not yet been reached (and might not ever be). + case urxStartCapture: + *fp.extra(op.value() + 2) = *fp.inputIdx() + case urxEndCapture: + *fp.extra(op.value()) = *fp.extra(op.value() + 2) // Tentative start becomes real. + *fp.extra(op.value() + 1) = *fp.inputIdx() // End position + + case urxDollar: // $, test for End of line + if *fp.inputIdx() < m.anchorLimit-2 { + fp = m.stack.popFrame() + break + } + // or for position before new line at end of input + if *fp.inputIdx() >= m.anchorLimit { + // We really are at the end of input. Success. + m.hitEnd = true + m.requireEnd = true + break + } + + if *fp.inputIdx() == m.anchorLimit-1 { + c := m.input[*fp.inputIdx()] + if isLineTerminator(c) { + if !(c == 0x0a && *fp.inputIdx() > m.anchorStart && m.input[*fp.inputIdx()-1] == 0x0d) { + // At new-line at end of input. Success + m.hitEnd = true + m.requireEnd = true + break + } + } + } else if *fp.inputIdx() == m.anchorLimit-2 && m.input[*fp.inputIdx()] == 0x0d && m.input[*fp.inputIdx()+1] == 0x0a { + m.hitEnd = true + m.requireEnd = true + break // At CR/LF at end of input. Success + } + fp = m.stack.popFrame() + + case urxDollarD: // $, test for End of Line, in UNIX_LINES mode. + if *fp.inputIdx() >= m.anchorLimit { + // Off the end of input. Success. + m.hitEnd = true + m.requireEnd = true + break + } + c := charAt(inputText, *fp.inputIdx()) + *fp.inputIdx()++ + // Either at the last character of input, or off the end. + if c == 0x0a && *fp.inputIdx() == m.anchorLimit { + m.hitEnd = true + m.requireEnd = true + break + } + + // Not at end of input. Back-track out. + fp = m.stack.popFrame() + case urxDollarM: // $, test for End of line in multi-line mode + if *fp.inputIdx() >= m.anchorLimit { + // We really are at the end of input. Success. 
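The plain `$` case above boils down to one rule: the position must be at the very end of the input, or immediately before a line terminator that itself ends the input, where a CR LF pair counts as a single terminator and the position between CR and LF does not qualify. A standalone sketch of that rule with a simplified terminator set (the engine's isLineTerminator also accepts additional Unicode terminators):

```go
// atEndOfLineAnchor reports whether '$' would succeed at pos, using only
// LF and CR/LF as terminators for brevity.
func atEndOfLineAnchor(input []rune, pos int) bool {
	switch len(input) - pos {
	case 0:
		return true // at the very end of input
	case 1:
		c := input[pos]
		if c == '\n' && pos > 0 && input[pos-1] == '\r' {
			return false // between CR and LF; '$' matches before the CR instead
		}
		return c == '\n' || c == '\r' // before a final terminator
	case 2:
		return input[pos] == '\r' && input[pos+1] == '\n' // before a final CR/LF pair
	default:
		return false
	}
}
```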
+ m.hitEnd = true + m.requireEnd = true + break + } + // If we are positioned just before a new-line, succeed. + // It makes no difference where the new-line is within the input. + c := charAt(inputText, *fp.inputIdx()) + if isLineTerminator(c) { + // At a line end, except for the odd chance of being in the middle of a CR/LF sequence + // In multi-line mode, hitting a new-line just before the end of input does not + // set the hitEnd or requireEnd flags + if !(c == 0x0a && *fp.inputIdx() > m.anchorStart && charAt(inputText, *fp.inputIdx()-1) == 0x0d) { + break + } + } + // not at a new line. Fail. + fp = m.stack.popFrame() + case urxDollarMd: // $, test for End of line in multi-line and UNIX_LINES mode + if *fp.inputIdx() >= m.anchorLimit { + // We really are at the end of input. Success. + m.hitEnd = true + m.requireEnd = true // Java set requireEnd in this case, even though + break // adding a new-line would not lose the match. + } + // If we are not positioned just before a new-line, the test fails; backtrack out. + // It makes no difference where the new-line is within the input. + if charAt(inputText, *fp.inputIdx()) != 0x0a { + fp = m.stack.popFrame() + } + case urxCaret: // ^, test for start of line + if *fp.inputIdx() != m.anchorStart { + fp = m.stack.popFrame() + } + case urxCaretM: // ^, test for start of line in mulit-line mode + if *fp.inputIdx() == m.anchorStart { + // We are at the start input. Success. + break + } + // Check whether character just before the current pos is a new-line + // unless we are at the end of input + c := charAt(inputText, *fp.inputIdx()-1) + if (*fp.inputIdx() < m.anchorLimit) && isLineTerminator(c) { + // It's a new-line. ^ is true. Success. + // TODO: what should be done with positions between a CR and LF? + break + } + // Not at the start of a line. Fail. + fp = m.stack.popFrame() + case urxCaretMUnix: // ^, test for start of line in mulit-line + Unix-line mode + if *fp.inputIdx() <= m.anchorStart { + // We are at the start input. Success. + break + } + + c := charAt(inputText, *fp.inputIdx()-1) + if c != 0x0a { + // Not at the start of a line. Back-track out. + fp = m.stack.popFrame() + } + case urxBackslashB: // Test for word boundaries + success := m.isWordBoundary(*fp.inputIdx()) + success = success != (op.value() != 0) // flip sense for \B + if !success { + fp = m.stack.popFrame() + } + case urxBackslashBu: // Test for word boundaries, Unicode-style + success := m.isUWordBoundary(*fp.inputIdx()) + success = success != (op.value() != 0) // flip sense for \B + if !success { + fp = m.stack.popFrame() + } + case urxBackslashD: // Test for decimal digit + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + c := charAt(inputText, *fp.inputIdx()) + + success := m.isDecimalDigit(c) + success = success != (op.value() != 0) // flip sense for \D + if success { + *fp.inputIdx()++ + } else { + fp = m.stack.popFrame() + } + + case urxBackslashG: // Test for position at end of previous match + if !((m.match && *fp.inputIdx() == m.matchEnd) || (!m.match && *fp.inputIdx() == m.activeStart)) { + fp = m.stack.popFrame() + } + + case urxBackslashH: // Test for \h, horizontal white space. 
+ if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + c := charAt(inputText, *fp.inputIdx()) + success := m.isHorizWS(c) || c == 9 + success = success != (op.value() != 0) // flip sense for \H + if success { + *fp.inputIdx()++ + } else { + fp = m.stack.popFrame() + } + + case urxBackslashR: // Test for \R, any line break sequence. + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + c := charAt(inputText, *fp.inputIdx()) + if isLineTerminator(c) { + if c == 0x0d && charAt(inputText, *fp.inputIdx()+1) == 0x0a { + *fp.inputIdx()++ + } + *fp.inputIdx()++ + } else { + fp = m.stack.popFrame() + } + + case urxBackslashV: // \v, any single line ending character. + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + c := charAt(inputText, *fp.inputIdx()) + success := isLineTerminator(c) + success = success != (op.value() != 0) // flip sense for \V + if success { + *fp.inputIdx()++ + } else { + fp = m.stack.popFrame() + } + + case urxBackslashX: + // Match a Grapheme, as defined by Unicode UAX 29. + + // Fail if at end of input + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + *fp.inputIdx() = m.followingGCBoundary(*fp.inputIdx()) + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + *fp.inputIdx() = m.activeLimit + } + + case urxBackslashZ: // Test for end of Input + if *fp.inputIdx() < m.anchorLimit { + fp = m.stack.popFrame() + } else { + m.hitEnd = true + m.requireEnd = true + } + case urxStaticSetref: + // Test input character against one of the predefined sets + // (Word Characters, for example) + // The high bit of the op value is a flag for the match polarity. + // 0: success if input char is in set. + // 1: success if input char is not in set. + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + success := (op.value() & urxNegSet) == urxNegSet + negOp := op.value() & ^urxNegSet + + c := charAt(inputText, *fp.inputIdx()) + s := staticPropertySets[negOp] + if s.ContainsRune(c) { + success = !success + } + + if success { + *fp.inputIdx()++ + } else { + // the character wasn't in the set. + fp = m.stack.popFrame() + } + case urxStatSetrefN: + // Test input character for NOT being a member of one of + // the predefined sets (Word Characters, for example) + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + c := charAt(inputText, *fp.inputIdx()) + s := staticPropertySets[op.value()] + if !s.ContainsRune(c) { + *fp.inputIdx()++ + break + } + // the character wasn't in the set. + fp = m.stack.popFrame() + + case urxSetref: + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + // There is input left. Pick up one char and test it for set membership. + c := charAt(inputText, *fp.inputIdx()) + + s := sets[op.value()] + if s.ContainsRune(c) { + *fp.inputIdx()++ + break + } + + // the character wasn't in the set. + fp = m.stack.popFrame() + + case urxDotany: + // . matches anything, but stops at end-of-line. + if *fp.inputIdx() >= m.activeLimit { + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + c := charAt(inputText, *fp.inputIdx()) + if isLineTerminator(c) { + // End of line in normal mode. . does not match. 
+ fp = m.stack.popFrame() + break + } + *fp.inputIdx()++ + + case urxDotanyAll: + // ., in dot-matches-all (including new lines) mode + if *fp.inputIdx() >= m.activeLimit { + // At end of input. Match failed. Backtrack out. + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + c := charAt(inputText, *fp.inputIdx()) + *fp.inputIdx()++ + if c == 0x0d && *fp.inputIdx() < m.activeLimit { + // In the case of a CR/LF, we need to advance over both. + nextc := charAt(inputText, *fp.inputIdx()) + if nextc == 0x0a { + *fp.inputIdx()++ + } + } + + case urxDotanyUnix: + // '.' operator, matches all, but stops at end-of-line. + // UNIX_LINES mode, so 0x0a is the only recognized line ending. + if *fp.inputIdx() >= m.activeLimit { + // At end of input. Match failed. Backtrack out. + m.hitEnd = true + fp = m.stack.popFrame() + break + } + + // There is input left. Advance over one char, unless we've hit end-of-line + c := charAt(inputText, *fp.inputIdx()) + if c == 0x0a { + // End of line in normal mode. '.' does not match the \n + fp = m.stack.popFrame() + } else { + *fp.inputIdx()++ + } + case urxJmp: + *fp.patIdx() = op.value() + + case urxFail: + isMatch = false + goto breakFromLoop + + case urxJmpSav: + fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()) // State save to loc following current + if err != nil { + return err + } + *fp.patIdx() = op.value() // Then JMP. + + case urxJmpSavX: + // This opcode is used with (x)+, when x can match a zero length string. + // Same as JMP_SAV, except conditional on the match having made forward progress. + // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the + // data address of the input position at the start of the loop. + stoOp := pat[op.value()-1] + frameLoc := stoOp.value() + + prevInputIdx := *fp.extra(frameLoc) + if prevInputIdx < *fp.inputIdx() { + // The match did make progress. Repeat the loop. + fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()) // State save to loc following current + if err != nil { + return err + } + *fp.patIdx() = op.value() // Then JMP. + *fp.extra(frameLoc) = *fp.inputIdx() + } + // If the input position did not advance, we do nothing here, + // execution will fall out of the loop. + + case urxCtrInit: + *fp.extra(op.value()) = 0 // Set the loop counter variable to zero + + // Pick up the three extra operands that CTR_INIT has, and + // skip the pattern location counter past + instOperandLoc := *fp.patIdx() + *fp.patIdx() += 3 // Skip over the three operands that CTR_INIT has. + + loopLoc := pat[instOperandLoc].value() + minCount := int(pat[instOperandLoc+1]) + maxCount := int(pat[instOperandLoc+2]) + + if minCount == 0 { + fp, err = m.stateSave(*fp.inputIdx(), loopLoc+1) + if err != nil { + return err + } + } + if maxCount == -1 { + *fp.extra(op.value() + 1) = *fp.inputIdx() // For loop breaking. + } else if maxCount == 0 { + fp = m.stack.popFrame() + } + + case utxCtrLoop: + initOp := pat[op.value()] + opValue := initOp.value() + pCounter := fp.extra(opValue) + minCount := int(pat[op.value()+2]) + maxCount := int(pat[op.value()+3]) + *pCounter++ + if *pCounter >= maxCount && maxCount != -1 { + break + } + + if *pCounter >= minCount { + if maxCount == -1 { + // Loop has no hard upper bound. + // Check that it is progressing through the input, break if it is not. 
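This progress check exists because an unbounded loop whose body can match the empty string, for example a pattern like (a*)*, would otherwise iterate forever: the body keeps succeeding without consuming any input. A standalone sketch of the guard:

```go
// runUnboundedLoop keeps applying matchBody only while each pass consumes
// input; a pass that succeeds without advancing ends the loop, mirroring
// the saved-input-index check in the loop opcode.
func runUnboundedLoop(matchBody func(pos int) (newPos int, ok bool), pos int) int {
	last := -1
	for {
		if pos == last {
			break // no forward progress on the previous pass
		}
		last = pos
		next, ok := matchBody(pos)
		if !ok {
			break
		}
		pos = next
	}
	return pos
}
```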
+ pLastIntputIdx := fp.extra(opValue + 1) + if *pLastIntputIdx == *fp.inputIdx() { + break + } + *pLastIntputIdx = *fp.inputIdx() + } + fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()) + if err != nil { + return err + } + } else { + // Increment time-out counter. (StateSave() does it if count >= minCount) + m.tickCounter-- + if m.tickCounter <= 0 { + if err = m.incrementTime(*fp.inputIdx()); err != nil { + return err + } // Re-initializes fTickCounter + } + } + + *fp.patIdx() = op.value() + 4 // Loop back. + + case urxCtrInitNg: + *fp.extra(op.value()) = 0 // Set the loop counter variable to zero + + // Pick up the three extra operands that CTR_INIT_NG has, and + // skip the pattern location counter past + instrOperandLoc := *fp.patIdx() + *fp.patIdx() += 3 + loopLoc := pat[instrOperandLoc].value() + minCount := pat[instrOperandLoc+1].value() + maxCount := pat[instrOperandLoc+2].value() + + if maxCount == -1 { + *fp.extra(op.value() + 1) = *fp.inputIdx() // Save initial input index for loop breaking. + } + + if minCount == 0 { + if maxCount != 0 { + fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()) + if err != nil { + return err + } + } + *fp.patIdx() = loopLoc + 1 + } + + case urxCtrLoopNg: + initOp := pat[op.value()] + pCounter := fp.extra(initOp.value()) + minCount := int(pat[op.value()+2]) + maxCount := int(pat[op.value()+3]) + *pCounter++ + if *pCounter >= maxCount && maxCount != -1 { + // The loop has matched the maximum permitted number of times. + // Break out of here with no action. Matching will + // continue with the following pattern. + break + } + + if *pCounter < minCount { + // We haven't met the minimum number of matches yet. + // Loop back for another one. + *fp.patIdx() = op.value() + 4 // Loop back. + // Increment time-out counter. (StateSave() does it if count >= minCount) + m.tickCounter-- + if m.tickCounter <= 0 { + if err = m.incrementTime(*fp.inputIdx()); err != nil { + return err + } // Re-initializes fTickCounter + } + } else { + // We do have the minimum number of matches. + + // If there is no upper bound on the loop iterations, check that the input index + // is progressing, and stop the loop if it is not. + if maxCount == -1 { + lastInputIdx := fp.extra(initOp.value() + 1) + if *fp.inputIdx() == *lastInputIdx { + break + } + *lastInputIdx = *fp.inputIdx() + } + } + + // Loop Continuation: we will fall into the pattern following the loop + // (non-greedy, don't execute loop body first), but first do + // a state save to the top of the loop, so that a match failure + // in the following pattern will try another iteration of the loop. + fp, err = m.stateSave(*fp.inputIdx(), op.value()+4) + if err != nil { + return err + } + + case urxStoSp: + m.data[op.value()] = m.stack.len() + + case urxLdSp: + newStackSize := m.data[op.value()] + newFp := m.stack.offset(newStackSize) + if newFp.equals(fp) { + break + } + copy(newFp, fp) + fp = newFp + + m.stack.setSize(newStackSize) + case urxBackref: + groupStartIdx := *fp.extra(op.value()) + groupEndIdx := *fp.extra(op.value() + 1) + + if groupStartIdx < 0 { + // This capture group has not participated in the match thus far, + fp = m.stack.popFrame() // FAIL, no match. 
+ break
+ }
+
+ success := true
+ for {
+ if groupStartIdx >= groupEndIdx {
+ success = true
+ break
+ }
+
+ if *fp.inputIdx() >= m.activeLimit {
+ success = false
+ m.hitEnd = true
+ break
+ }
+
+ captureGroupChar := charAt(inputText, groupStartIdx)
+ inputChar := charAt(inputText, *fp.inputIdx())
+ groupStartIdx++
+ *fp.inputIdx()++
+ if inputChar != captureGroupChar {
+ success = false
+ break
+ }
+ }
+
+ if !success {
+ fp = m.stack.popFrame()
+ }
+ case urxBackrefI:
+ groupStartIdx := *fp.extra(op.value())
+ groupEndIdx := *fp.extra(op.value() + 1)
+
+ if groupStartIdx < 0 {
+ // This capture group has not participated in the match thus far,
+ fp = m.stack.popFrame() // FAIL, no match.
+ break
+ }
+
+ captureGroupItr := newCaseFoldIterator(m.input, groupStartIdx, groupEndIdx)
+ inputItr := newCaseFoldIterator(m.input, *fp.inputIdx(), m.activeLimit)
+ success := true
+
+ for {
+ captureGroupChar := captureGroupItr.next()
+ if captureGroupChar == -1 {
+ success = true
+ break
+ }
+ inputChar := inputItr.next()
+ if inputChar == -1 {
+ success = false
+ m.hitEnd = true
+ break
+ }
+ if inputChar != captureGroupChar {
+ success = false
+ break
+ }
+ }
+
+ if success && inputItr.inExpansion() {
+ // We obtained a match by consuming part of a string obtained from
+ // case-folding a single code point of the input text.
+ // This does not count as an overall match.
+ success = false
+ }
+
+ if success {
+ *fp.inputIdx() = inputItr.index
+ } else {
+ fp = m.stack.popFrame()
+ }
+
+ case urxStoInpLoc:
+ *fp.extra(op.value()) = *fp.inputIdx()
+
+ case urxJmpx:
+ instrOperandLoc := *fp.patIdx()
+ *fp.patIdx()++
+ dataLoc := pat[instrOperandLoc].value()
+
+ saveInputIdx := *fp.extra(dataLoc)
+
+ if saveInputIdx < *fp.inputIdx() {
+ *fp.patIdx() = op.value() // JMP
+ } else {
+ fp = m.stack.popFrame() // FAIL, no progress in loop.
+ }
+
+ case urxLaStart:
+ m.data[op.value()] = m.stack.len()
+ m.data[op.value()+1] = *fp.inputIdx()
+ m.data[op.value()+2] = m.activeStart
+ m.data[op.value()+3] = m.activeLimit
+ m.activeStart = m.lookStart // Set the match region change for
+ m.activeLimit = m.lookLimit // transparent bounds.
+
+ case urxLaEnd:
+ stackSize := m.stack.len()
+ newStackSize := m.data[op.value()]
+ if stackSize > newStackSize {
+ // Copy the current top frame back to the new (cut back) top frame.
+ // This makes the capture groups from within the look-ahead
+ // expression available.
+ newFp := m.stack.offset(newStackSize)
+ copy(newFp, fp)
+ fp = newFp
+ m.stack.setSize(newStackSize)
+ }
+
+ *fp.inputIdx() = m.data[op.value()+1]
+
+ m.activeStart = m.data[op.value()+2]
+ m.activeLimit = m.data[op.value()+3]
+
+ case urcOnecharI:
+ // Case insensitive one char. The char from the pattern is already case folded.
+ // Input text is not, but case folding the input cannot reduce two or more code
+ // points to one.
+ if *fp.inputIdx() < m.activeLimit {
+ c := charAt(inputText, *fp.inputIdx())
+ if ucase.Fold(c) == op.value32() {
+ *fp.inputIdx()++
+ break
+ }
+ } else {
+ m.hitEnd = true
+ }
+
+ fp = m.stack.popFrame()
+
+ case urxStringI:
+ // Case-insensitive test input against a literal string.
+ // Strings require two slots in the compiled pattern, one for the
+ // offset to the string text, and one for the length.
+ // The compiled string has already been case folded.
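+ // Concretely, the operand of this op is an index into litText and the next
+ // instruction word carries the string length, so the folded pattern text is
+ // roughly litText[op.value() : op.value()+length]; the input is then walked
+ // through a case-folding iterator and compared against it, as below.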
+ patternString := litText[op.value():]
+ var patternStringIdx int
+ nextOp := pat[*fp.patIdx()]
+ *fp.patIdx()++
+ patternStringLen := nextOp.value()
+
+ success := true
+
+ it := newCaseFoldIterator(inputText, *fp.inputIdx(), m.activeLimit)
+ for patternStringIdx < patternStringLen {
+ cText := it.next()
+ cPattern := patternString[patternStringIdx]
+ patternStringIdx++
+
+ if cText != cPattern {
+ success = false
+ if cText == -1 {
+ m.hitEnd = true
+ }
+ break
+ }
+ }
+ if it.inExpansion() {
+ success = false
+ }
+
+ if success {
+ *fp.inputIdx() = it.index
+ } else {
+ fp = m.stack.popFrame()
+ }
+
+ case urxLbStart:
+ // Entering a look-behind block.
+ // Save Stack Ptr, Input Pos and active input region.
+ // TODO: implement transparent bounds. Ticket #6067
+ m.data[op.value()] = m.stack.len()
+ m.data[op.value()+1] = *fp.inputIdx()
+ // Save the active input region, then reset its end to pin any matches
+ // to end at the current position.
+ m.data[op.value()+2] = m.activeStart
+ m.data[op.value()+3] = m.activeLimit
+ m.activeStart = m.regionStart
+ m.activeLimit = *fp.inputIdx()
+ // Init the variable containing the start index for attempted matches.
+ m.data[op.value()+4] = -1
+ case urxLbCont:
+ // Positive Look-Behind, at top of loop checking for matches of LB expression
+ // at all possible input starting positions.
+
+ // Fetch the min and max possible match lengths. They are the operands
+ // of this op in the pattern.
+ minML := pat[*fp.patIdx()]
+ *fp.patIdx()++
+ maxML := pat[*fp.patIdx()]
+ *fp.patIdx()++
+
+ lbStartIdx := &m.data[op.value()+4]
+ if *lbStartIdx < 0 {
+ // First time through loop.
+ *lbStartIdx = *fp.inputIdx() - int(minML)
+ if *lbStartIdx > 0 {
+ *lbStartIdx = *fp.inputIdx()
+ }
+ } else {
+ // 2nd through nth time through the loop.
+ // Back up start position for match by one.
+ *lbStartIdx--
+ }
+
+ if *lbStartIdx < 0 || *lbStartIdx < *fp.inputIdx()-int(maxML) {
+ // We have tried all potential match starting points without
+ // getting a match. Backtrack out, and out of the
+ // Look Behind altogether.
+ fp = m.stack.popFrame()
+ m.activeStart = m.data[op.value()+2]
+ m.activeLimit = m.data[op.value()+3]
+ break
+ }
+
+ // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
+ // (successful match will fall off the end of the loop.)
+ fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()-3)
+ if err != nil {
+ return err
+ }
+ *fp.inputIdx() = *lbStartIdx
+
+ case urxLbEnd:
+ // End of a look-behind block, after a successful match.
+ if *fp.inputIdx() != m.activeLimit {
+ // The look-behind expression matched, but the match did not
+ // extend all the way to the point that we are looking behind from.
+ // FAIL out of here, which will take us back to the LB_CONT, which
+ // will retry the match starting at another position or fail
+ // the look-behind altogether, whichever is appropriate.
+ fp = m.stack.popFrame()
+ break
+ }
+
+ // Look-behind match is good. Restore the original input string region,
+ // which had been truncated to pin the end of the lookbehind match to the
+ // position being looked-behind.
+ m.activeStart = m.data[op.value()+2]
+ m.activeLimit = m.data[op.value()+3]
+ case urxLbnCount:
+ // Negative Look-Behind, at top of loop checking for matches of LB expression
+ // at all possible input starting positions.
+
+ // Fetch the extra parameters of this op.
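+ // (minML and maxML below are the minimum and maximum possible lengths of the
+ // look-behind expression, and continueLoc is the pattern location to jump to
+ // once every candidate start position has failed, i.e. when the negative
+ // look-behind as a whole has succeeded.)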
+ minML := pat[*fp.patIdx()]
+ *fp.patIdx()++
+ maxML := pat[*fp.patIdx()]
+ *fp.patIdx()++
+
+ continueLoc := pat[*fp.patIdx()].value()
+ *fp.patIdx()++
+
+ lbStartIdx := &m.data[op.value()+4]
+
+ if *lbStartIdx < 0 {
+ // First time through loop.
+ *lbStartIdx = *fp.inputIdx() - int(minML)
+ if *lbStartIdx > 0 {
+ // move index to a code point boundary, if it's not on one already.
+ *lbStartIdx = *fp.inputIdx()
+ }
+ } else {
+ // 2nd through nth time through the loop.
+ // Back up start position for match by one.
+ *lbStartIdx--
+ }
+
+ if *lbStartIdx < 0 || *lbStartIdx < *fp.inputIdx()-int(maxML) {
+ // We have tried all potential match starting points without
+ // getting a match, which means that the negative lookbehind as
+ // a whole has succeeded. Jump forward to the continue location.
+ m.activeStart = m.data[op.value()+2]
+ m.activeLimit = m.data[op.value()+3]
+ *fp.patIdx() = continueLoc
+ break
+ }
+
+ // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
+ // (successful match will cause a FAIL out of the loop altogether.)
+ fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()-4)
+ if err != nil {
+ return err
+ }
+ *fp.inputIdx() = *lbStartIdx
+ case urxLbnEnd:
+ // End of a negative look-behind block, after a successful match.
+
+ if *fp.inputIdx() != m.activeLimit {
+ // The look-behind expression matched, but the match did not
+ // extend all the way to the point that we are looking behind from.
+ // FAIL out of here, which will take us back to the LB_CONT, which
+ // will retry the match starting at another position or succeed
+ // the look-behind altogether, whichever is appropriate.
+ fp = m.stack.popFrame()
+ break
+ }
+
+ // Look-behind expression matched, which means the look-behind test as
+ // a whole fails.
+
+ // Restore the original input region, which had been truncated
+ // in order to pin the end of the lookbehind match
+ // to the position being looked-behind.
+ m.activeStart = m.data[op.value()+2]
+ m.activeLimit = m.data[op.value()+3]
+
+ // Restore original stack position, discarding any state saved
+ // by the successful pattern match.
+ newStackSize := m.data[op.value()]
+ m.stack.setSize(newStackSize)
+
+ // FAIL, which will take control back to someplace
+ // prior to entering the look-behind test.
+ fp = m.stack.popFrame()
+ case urxLoopSrI:
+ // Loop Initialization for the optimized implementation of
+ // [some character set]*
+ // This op scans through all matching input.
+ // The following LOOP_C op emulates stack unwinding if the following pattern fails.
+ s := sets[op.value()]
+
+ // Loop through input, until either the input is exhausted or
+ // we reach a character that is not a member of the set.
+ ix := *fp.inputIdx()
+
+ for {
+ if ix >= m.activeLimit {
+ m.hitEnd = true
+ break
+ }
+ c := charAt(inputText, ix)
+ if !s.ContainsRune(c) {
+ break
+ }
+ ix++
+ }
+
+ // If there were no matching characters, skip over the loop altogether.
+ // The loop doesn't run at all, a * op always succeeds.
+ if ix == *fp.inputIdx() {
+ *fp.patIdx()++ // skip the URX_LOOP_C op.
+ break
+ }
+
+ // Peek ahead in the compiled pattern, to the URX_LOOP_C that
+ // must follow. Its operand is the stack location
+ // that holds the starting input index for the match of this [set]*
+ loopcOp := pat[*fp.patIdx()]
+ stackLoc := loopcOp.value()
+ *fp.extra(stackLoc) = *fp.inputIdx()
+ *fp.inputIdx() = ix
+
+ // Save State to the URX_LOOP_C op that follows this one,
+ // so that match failures in the following code will return to there.
+ // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
+ fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx())
+ if err != nil {
+ return err
+ }
+ *fp.patIdx()++
+ case urxLoopDotI:
+ // Loop Initialization for the optimized implementation of .*
+ // This op scans through all remaining input.
+ // The following LOOP_C op emulates stack unwinding if the following pattern fails.
+
+ // Loop through input until the input is exhausted or we reach an end-of-line.
+ // In DOTALL mode, we can just go straight to the end of the input.
+ var ix int
+ if (op.value() & 1) == 1 {
+ // Dot-matches-All mode. Jump straight to the end of the string.
+ ix = m.activeLimit
+ m.hitEnd = true
+ } else {
+ // NOT DOT ALL mode. Line endings do not match '.'
+ // Scan forward until a line ending or end of input.
+ ix = *fp.inputIdx()
+ for {
+ if ix >= m.activeLimit {
+ m.hitEnd = true
+ break
+ }
+ c := charAt(inputText, ix)
+ if (c & 0x7f) <= 0x29 { // Fast filter of non-newline chars
+ if (c == 0x0a) || // 0x0a is newline in both modes.
+ (((op.value() & 2) == 0) && // IF not UNIX_LINES mode
+ isLineTerminator(c)) {
+ // char is a line ending. Exit the scanning loop.
+ break
+ }
+ }
+ ix++
+ }
+ }
+
+ // If there were no matching characters, skip over the loop altogether.
+ // The loop doesn't run at all, a * op always succeeds.
+ if ix == *fp.inputIdx() {
+ *fp.patIdx()++ // skip the URX_LOOP_C op.
+ break
+ }
+
+ // Peek ahead in the compiled pattern, to the URX_LOOP_C that
+ // must follow. Its operand is the stack location
+ // that holds the starting input index for the match of this .*
+ loopcOp := pat[*fp.patIdx()]
+ stackLoc := loopcOp.value()
+ *fp.extra(stackLoc) = *fp.inputIdx()
+ *fp.inputIdx() = ix
+
+ // Save State to the URX_LOOP_C op that follows this one,
+ // so that match failures in the following code will return to there.
+ // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
+ fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx())
+ if err != nil {
+ return err
+ }
+ *fp.patIdx()++
+
+ case urxLoopC:
+ backSearchIndex := *fp.extra(op.value())
+
+ if backSearchIndex == *fp.inputIdx() {
+ // We've backed up the input idx to the point that the loop started.
+ // The loop is done. Leave here without saving state.
+ // Subsequent failures won't come back here.
+ break
+ }
+ // Set up for the next iteration of the loop, with input index
+ // backed up by one from the last time through,
+ // and a state save to this instruction in case the following code fails again.
+ // (We're going backwards because this loop emulates stack unwinding, not
+ // the initial scan forward.)
+
+ prevC := charAt(inputText, *fp.inputIdx()-1)
+ *fp.inputIdx()--
+ twoPrevC := charAt(inputText, *fp.inputIdx()-1)
+
+ if prevC == 0x0a &&
+ *fp.inputIdx() > backSearchIndex &&
+ twoPrevC == 0x0d {
+ prevOp := pat[*fp.patIdx()-2]
+ if prevOp.typ() == urxLoopDotI {
+ // .*, stepping back over CRLF pair.
+ *fp.inputIdx()--
+ }
+ }
+
+ fp, err = m.stateSave(*fp.inputIdx(), *fp.patIdx()-1)
+ if err != nil {
+ return err
+ }
+ default:
+ // Trouble. The compiled pattern contains an entry with an
+ // unrecognized type tag.
+ panic("unreachable")
+ }
+ }
+
+breakFromLoop:
+ m.match = isMatch
+ if isMatch {
+ m.lastMatchEnd = m.matchEnd
+ m.matchStart = startIdx
+ m.matchEnd = *fp.inputIdx()
+ }
+
+ if m.dumper != nil {
+ if isMatch {
+ fmt.Fprintf(m.dumper, "Match. 
start=%d end=%d\n\n", m.matchStart, m.matchEnd) + } else { + fmt.Fprintf(m.dumper, "No match\n\n") + } + } + + m.frame = fp // The active stack frame when the engine stopped. + // Contains the capture group results that we need to + // access later. + return nil +} + +func charAt(str []rune, idx int) rune { + if idx >= 0 && idx < len(str) { + return str[idx] + } + return -1 +} + +func (m *Matcher) isWordBoundary(pos int) bool { + cIsWord := false + + if pos >= m.lookLimit { + m.hitEnd = true + } else { + c := charAt(m.input, pos) + if uprops.HasBinaryProperty(c, uprops.UCharGraphemeExtend) || uchar.CharType(c) == uchar.FormatChar { + return false + } + cIsWord = staticPropertySets[urxIswordSet].ContainsRune(c) + } + + prevCIsWord := false + for { + if pos <= m.lookStart { + break + } + prevChar := charAt(m.input, pos-1) + pos-- + if !(uprops.HasBinaryProperty(prevChar, uprops.UCharGraphemeExtend) || uchar.CharType(prevChar) == uchar.FormatChar) { + prevCIsWord = staticPropertySets[urxIswordSet].ContainsRune(prevChar) + break + } + } + return cIsWord != prevCIsWord +} + +func (m *Matcher) isUWordBoundary(pos int) bool { + // TODO: implement + /* + UBool returnVal = FALSE; + + #if UCONFIG_NO_BREAK_ITERATION==0 + // Note: this point will never be reached if break iteration is configured out. + // Regex patterns that would require this function will fail to compile. + + // If we haven't yet created a break iterator for this matcher, do it now. + if (fWordBreakItr == nullptr) { + fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status); + if (U_FAILURE(status)) { + return FALSE; + } + fWordBreakItr->setText(fInputText, status); + } + + // Note: zero width boundary tests like \b see through transparent region bounds, + // which is why fLookLimit is used here, rather than fActiveLimit. + if (pos >= fLookLimit) { + fHitEnd = TRUE; + returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" + // words are not boundaries. All non-word chars stand by themselves, + // with word boundaries on both sides. + } else { + returnVal = fWordBreakItr->isBoundary((int32_t)pos); + } + #endif + return returnVal; + */ + return false +} + +func (m *Matcher) resetStack() stackFrame { + m.stack.reset() + frame, _ := m.stack.newFrame(0, nil, "") + frame.clearExtra() + return frame +} + +func (m *Matcher) stateSave(inputIdx, savePatIdx int) (stackFrame, error) { + // push storage for a new frame. + newFP, err := m.stack.newFrame(inputIdx, m.input, m.pattern.pattern) + if err != nil { + return nil, err + } + fp := m.stack.prevFromTop() + + // New stack frame = copy of old top frame. + copy(newFP, fp) + + m.tickCounter-- + if m.tickCounter <= 0 { + if err := m.incrementTime(*fp.inputIdx()); err != nil { + return nil, err + } + } + *fp.patIdx() = savePatIdx + return newFP, nil +} + +func (m *Matcher) incrementTime(inputIdx int) error { + m.tickCounter = timerInitialValue + m.time++ + if m.timeLimit > 0 && m.time >= m.timeLimit { + return &MatchError{ + Code: TimeOut, + Pattern: m.pattern.pattern, + Position: inputIdx, + Input: m.input, + } + } + return nil +} + +func (m *Matcher) isDecimalDigit(c rune) bool { + return uchar.IsDigit(c) +} + +func (m *Matcher) isHorizWS(c rune) bool { + return uchar.CharType(c) == uchar.SpaceSeparator || c == 9 +} + +func (m *Matcher) followingGCBoundary(pos int) int { + // TODO: implement + return pos + /* + // Note: this point will never be reached if break iteration is configured out. 
+ // Regex patterns that would require this function will fail to compile. + + // If we haven't yet created a break iterator for this matcher, do it now. + if (m.gcBreakItr == nil) { + m.gcBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); + if (U_FAILURE(status)) { + return pos; + } + fGCBreakItr->setText(fInputText, status); + } + result = fGCBreakItr->following(pos); + if (result == BreakIterator::DONE) { + result = pos; + } + */ +} + +func (m *Matcher) ResetString(input string) { + m.Reset([]rune(input)) +} + +func (m *Matcher) Reset(input []rune) { + m.input = input + m.reset() +} + +func (m *Matcher) Matches() (bool, error) { + err := m.MatchAt(m.activeStart, true) + return m.match, err +} + +func (m *Matcher) LookingAt() (bool, error) { + err := m.MatchAt(m.activeStart, false) + return m.match, err +} + +func (m *Matcher) Find() (bool, error) { + startPos := m.matchEnd + if startPos == 0 { + startPos = m.activeStart + } + + if m.match { + // Save the position of any previous successful match. + m.lastMatchEnd = m.matchEnd + if m.matchStart == m.matchEnd { + // Previous match had zero length. Move start position up one position + // to avoid sending find() into a loop on zero-length matches. + if startPos >= m.activeLimit { + m.match = false + m.hitEnd = true + return false, nil + } + startPos++ + } + } else { + if m.lastMatchEnd >= 0 { + // A previous find() failed to match. Don't try again. + // (without this test, a pattern with a zero-length match + // could match again at the end of an input string.) + m.hitEnd = true + return false, nil + } + } + + testStartLimit := m.activeLimit - int(m.pattern.minMatchLen) + if startPos > testStartLimit { + m.match = false + m.hitEnd = true + return false, nil + } + + switch m.pattern.startType { + case startNoInfo: + // No optimization was found. + // Try a match at each input position. + for { + err := m.MatchAt(startPos, false) + if err != nil { + return false, err + } + if m.match { + return true, nil + } + if startPos >= testStartLimit { + m.hitEnd = true + return false, nil + } + startPos++ + } + case startSet: + // Match may start on any char from a pre-computed set. + for { + pos := startPos + c := charAt(m.input, startPos) + startPos++ + // c will be -1 (U_SENTINEL) at end of text, in which case we + // skip this next block (so we don't have a negative array index) + // and handle end of text in the following block. 
+ if c >= 0 && m.pattern.initialChars.ContainsRune(c) { + err := m.MatchAt(pos, false) + if err != nil { + return false, err + } + if m.match { + return true, nil + } + } + + if startPos > testStartLimit { + m.match = false + m.hitEnd = true + return false, nil + } + } + case startStart: + // Matches are only possible at the start of the input string + // (pattern begins with ^ or \A) + if startPos > m.activeStart { + m.match = false + return false, nil + } + err := m.MatchAt(startPos, false) + return m.match, err + case startLine: + var ch rune + if startPos == m.anchorStart { + err := m.MatchAt(startPos, false) + if err != nil { + return false, err + } + if m.match { + return true, nil + } + ch = charAt(m.input, startPos) + startPos++ + } else { + ch = charAt(m.input, startPos-1) + } + + if m.pattern.flags&UnixLines != 0 { + for { + if ch == 0x0a { + err := m.MatchAt(startPos, false) + if err != nil { + return false, err + } + if m.match { + return true, nil + } + } + if startPos >= testStartLimit { + m.match = false + m.hitEnd = true + return false, nil + } + ch = charAt(m.input, startPos) + startPos++ + } + } else { + for { + if isLineTerminator(ch) { + if ch == 0x0d && startPos < m.activeLimit && charAt(m.input, startPos) == 0x0a { + startPos++ + } + err := m.MatchAt(startPos, false) + if err != nil { + return false, err + } + if m.match { + return true, nil + } + } + if startPos >= testStartLimit { + m.match = false + m.hitEnd = true + return false, nil + } + ch = charAt(m.input, startPos) + startPos++ + } + } + case startChar, startString: + // Match starts on exactly one char. + theChar := m.pattern.initialChar + for { + pos := startPos + c := charAt(m.input, startPos) + startPos++ + if c == theChar { + err := m.MatchAt(pos, false) + if err != nil { + return false, err + } + if m.match { + return true, nil + } + } + if startPos > testStartLimit { + m.match = false + m.hitEnd = true + return false, nil + } + } + default: + panic("unreachable") + } +} + +func (m *Matcher) Start() int { + if !m.match { + return -1 + } + + return m.matchStart +} + +func (m *Matcher) reset() { + m.regionStart = 0 + m.regionLimit = len(m.input) + m.activeStart = 0 + m.activeLimit = len(m.input) + m.anchorStart = 0 + m.anchorLimit = len(m.input) + m.lookStart = 0 + m.lookLimit = len(m.input) + m.resetPreserveRegion() +} + +func (m *Matcher) resetPreserveRegion() { + m.matchStart = 0 + m.matchEnd = 0 + m.lastMatchEnd = -1 + m.appendPosition = 0 + m.match = false + m.hitEnd = false + m.requireEnd = false + m.time = 0 + m.tickCounter = timerInitialValue +} + +func (m *Matcher) GroupCount() int { + return len(m.pattern.groupMap) +} + +func (m *Matcher) StartForGroup(group int) int { + if !m.match { + return -1 + } + if group < 0 || group > len(m.pattern.groupMap) { + return -1 + } + if group == 0 { + return m.matchStart + } + groupOffset := int(m.pattern.groupMap[group-1]) + return *m.frame.extra(groupOffset) +} + +func (m *Matcher) EndForGroup(group int) int { + if !m.match { + return -1 + } + if group < 0 || group > len(m.pattern.groupMap) { + return -1 + } + if group == 0 { + return m.matchEnd + } + groupOffset := int(m.pattern.groupMap[group-1]) + return *m.frame.extra(groupOffset + 1) +} + +func (m *Matcher) HitEnd() bool { + return m.hitEnd +} + +func (m *Matcher) RequireEnd() bool { + return m.requireEnd +} + +func (m *Matcher) Group(i int) (string, bool) { + start := m.StartForGroup(i) + end := m.EndForGroup(i) + if start == -1 || end == -1 { + return "", false + } + return string(m.input[start:end]), 
true +} + +func (m *Matcher) End() int { + if !m.match { + return -1 + } + + return m.matchEnd +} + +func (m *Matcher) Dumper(out io.Writer) { + m.dumper = out +} + +// Test for any of the Unicode line terminating characters. +func isLineTerminator(c rune) bool { + if (c & ^(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) != 0 { + return false + } + return (c <= 0x0d && c >= 0x0a) || c == 0x85 || c == 0x2028 || c == 0x2029 +} diff --git a/go/mysql/icuregex/ops.go b/go/mysql/icuregex/ops.go new file mode 100644 index 00000000000..dbb83ee3d24 --- /dev/null +++ b/go/mysql/icuregex/ops.go @@ -0,0 +1,414 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex + +import ( + "golang.org/x/exp/slices" + + "vitess.io/vitess/go/mysql/icuregex/internal/ucase" + "vitess.io/vitess/go/mysql/icuregex/internal/utf16" +) + +type opcode uint8 + +const ( + urxReservedOp opcode = iota // For multi-operand ops, most non-first words. + urxBacktrack // Force a backtrack, as if a match test had failed. + urxEnd + urxOnechar // Value field is the 21 bit unicode char to match + urxString // Value field is index of string start + urxStringLen // Value field is string length (code units) + urxStateSave // Value field is pattern position to push + urxNop + urxStartCapture // Value field is capture group number. + urxEndCapture // Value field is capture group number + urxStaticSetref // Value field is index of set in array of sets. + urxSetref // Value field is index of set in array of sets. + urxDotany + urxJmp // Value field is destination position in the pattern. + urxFail // Stop match operation, No match. + + urxJmpSav // Operand: JMP destination location + urxBackslashB // Value field: 0: \b 1: \B + urxBackslashG + urxJmpSavX // Conditional JMP_SAV, + // Used in (x)+, breaks loop on zero length match. + // Operand: Jmp destination. + urxBackslashX + urxBackslashZ // \z Unconditional end of line. + + urxDotanyAll // ., in the . matches any mode. + urxBackslashD // Value field: 0: \d 1: \D + urxCaret // Value field: 1: multi-line mode. + urxDollar // Also for \Z + + urxCtrInit // Counter Inits for {Interval} loops. + urxCtrInitNg // 2 kinds, normal and non-greedy. + // These are 4 word opcodes. See description. + // First Operand: Data loc of counter variable + // 2nd Operand: Pat loc of the URX_CTR_LOOPx + // at the end of the loop. + // 3rd Operand: Minimum count. + // 4th Operand: Max count, -1 for unbounded. + + urxDotanyUnix // '.' operator in UNIX_LINES mode, only \n marks end of line. + + utxCtrLoop // Loop Ops for {interval} loops. + urxCtrLoopNg // Also in three flavors. + // Operand is loc of corresponding CTR_INIT. 
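+ // As a rough illustration (see the urxCtrInit and utxCtrLoop cases in
+ // matcher.go), an interval quantifier such as x{2,5} might compile to:
+ //
+ //   CTR_INIT   <frame slot for the loop counter>
+ //              <pattern location of the CTR_LOOP below>
+ //              2             (minimum count)
+ //              5             (maximum count, -1 when unbounded)
+ //   ...body of the loop (the ops for x)...
+ //   CTR_LOOP   <pattern location of the CTR_INIT above>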
+ + urxCaretMUnix // '^' operator, test for start of line in multi-line + // plus UNIX_LINES mode. + + urxRelocOprnd // Operand value in multi-operand ops that refers + // back into compiled pattern code, and thus must + // be relocated when inserting/deleting ops in code. + + urxStoSp // Store the stack ptr. Operand is location within + // matcher data (not stack data) to store it. + urxLdSp // Load the stack pointer. Operand is location + // to load from. + urxBackref // Back Reference. Parameter is the index of the + // capture group variables in the state stack frame. + urxStoInpLoc // Store the input location. Operand is location + // within the matcher stack frame. + urxJmpx // Conditional JMP. + // First Operand: JMP target location. + // Second Operand: Data location containing an + // input position. If current input position == + // saved input position, FAIL rather than taking + // the JMP + urxLaStart // Starting a LookAround expression. + // Save InputPos, SP and active region in static data. + // Operand: Static data offset for the save + urxLaEnd // Ending a Lookaround expression. + // Restore InputPos and Stack to saved values. + // Operand: Static data offset for saved data. + urcOnecharI // Test for case-insensitive match of a literal character. + // Operand: the literal char. + urxStringI // Case insensitive string compare. + // First Operand: Index of start of string in string literals + // Second Operand (next word in compiled code): + // the length of the string. + urxBackrefI // Case insensitive back reference. + // Parameter is the index of the + // capture group variables in the state stack frame. + urxDollarM // $ in multi-line mode. + urxCaretM // ^ in multi-line mode. + urxLbStart // LookBehind Start. + // Parameter is data location + urxLbCont // LookBehind Continue. + // Param 0: the data location + // Param 1: The minimum length of the look-behind match + // Param 2: The max length of the look-behind match + urxLbEnd // LookBehind End. + // Parameter is the data location. + // Check that match ended at the right spot, + // Restore original input string len. + urxLbnCount // Negative LookBehind Continue + // Param 0: the data location + // Param 1: The minimum length of the look-behind match + // Param 2: The max length of the look-behind match + // Param 3: The pattern loc following the look-behind block. + urxLbnEnd // Negative LookBehind end + // Parameter is the data location. + // Check that the match ended at the right spot. + urxStatSetrefN // Reference to a prebuilt set (e.g. \w), negated + // Operand is index of set in array of sets. + urxLoopSrI // Init a [set]* loop. + // Operand is the sets index in array of user sets. + urxLoopC // Continue a [set]* or OneChar* loop. + // Operand is a matcher static data location. + // Must always immediately follow LOOP_x_I instruction. + urxLoopDotI // .*, initialization of the optimized loop. + // Operand value: + // bit 0: + // 0: Normal (. doesn't match new-line) mode. + // 1: . matches new-line mode. + // bit 1: controls what new-lines are recognized by this operation. + // 0: All Unicode New-lines + // 1: UNIX_LINES, \u000a only. + urxBackslashBu // \b or \B in UREGEX_UWORD mode, using Unicode style + // word boundaries. + urxDollarD // $ end of input test, in UNIX_LINES mode. + urxDollarMd // $ end of input test, in MULTI_LINE and UNIX_LINES mode. + urxBackslashH // Value field: 0: \h 1: \H + urxBackslashR // Any line break sequence. 
+ urxBackslashV // Value field: 0: \v 1: \V + + urxReservedOpN opcode = 255 // For multi-operand ops, negative operand values. +) + +// Keep this list of opcode names in sync with the above enum +// +// Used for debug printing only. +var urxOpcodeNames = []string{ + " ", + "BACKTRACK", + "END", + "ONECHAR", + "STRING", + "STRING_LEN", + "STATE_SAVE", + "NOP", + "START_CAPTURE", + "END_CAPTURE", + "URX_STATIC_SETREF", + "SETREF", + "DOTANY", + "JMP", + "FAIL", + "JMP_SAV", + "BACKSLASH_B", + "BACKSLASH_G", + "JMP_SAV_X", + "BACKSLASH_X", + "BACKSLASH_Z", + "DOTANY_ALL", + "BACKSLASH_D", + "CARET", + "DOLLAR", + "CTR_INIT", + "CTR_INIT_NG", + "DOTANY_UNIX", + "CTR_LOOP", + "CTR_LOOP_NG", + "URX_CARET_M_UNIX", + "RELOC_OPRND", + "STO_SP", + "LD_SP", + "BACKREF", + "STO_INP_LOC", + "JMPX", + "LA_START", + "LA_END", + "ONECHAR_I", + "STRING_I", + "BACKREF_I", + "DOLLAR_M", + "CARET_M", + "LB_START", + "LB_CONT", + "LB_END", + "LBN_CONT", + "LBN_END", + "STAT_SETREF_N", + "LOOP_SR_I", + "LOOP_C", + "LOOP_DOT_I", + "BACKSLASH_BU", + "DOLLAR_D", + "DOLLAR_MD", + "URX_BACKSLASH_H", + "URX_BACKSLASH_R", + "URX_BACKSLASH_V", +} + +type instruction int32 + +func (ins instruction) typ() opcode { + return opcode(uint32(ins) >> 24) +} + +func (ins instruction) value32() int32 { + return int32(ins) & 0xffffff +} + +func (ins instruction) value() int { + return int(ins.value32()) +} + +// Access to Unicode Sets composite character properties +// +// The sets are accessed by the match engine for things like \w (word boundary) +const ( + urxIswordSet = 1 + urxIsalnumSet = 2 + urxIsalphaSet = 3 + urxIsspaceSet = 4 + + urxGcNormal = iota + 1 // Sets for finding grapheme cluster boundaries. + urxGcExtend + urxGcControl + urxGcL + urxGcLv + urxGcLvt + urxGcV + urxGcT + + urxNegSet = 0x800000 // Flag bit to reverse sense of set + // membership test. +) + +type stack struct { + ary []int + frameSize int + stackLimit int +} + +type stackFrame []int + +func (f stackFrame) inputIdx() *int { + return &f[0] +} + +func (f stackFrame) patIdx() *int { + return &f[1] +} + +func (f stackFrame) extra(n int) *int { + return &f[2+n] +} + +func (f stackFrame) equals(f2 stackFrame) bool { + return &f[0] == &f2[0] +} + +func (s *stack) len() int { + return len(s.ary) +} + +func (s *stack) sp() int { + return len(s.ary) - s.frameSize +} + +func (s *stack) newFrame(inputIdx int, input []rune, pattern string) (stackFrame, error) { + if s.stackLimit != 0 && len(s.ary)+s.frameSize > s.stackLimit { + return nil, &MatchError{ + Code: StackOverflow, + Pattern: pattern, + Position: inputIdx, + Input: input, + } + } + s.ary = slices.Grow(s.ary, s.frameSize) + + f := s.ary[len(s.ary) : len(s.ary)+s.frameSize] + s.ary = s.ary[:len(s.ary)+s.frameSize] + return f, nil +} + +func (s *stack) prevFromTop() stackFrame { + return s.ary[len(s.ary)-2*s.frameSize:] +} + +func (s *stack) popFrame() stackFrame { + s.ary = s.ary[:len(s.ary)-s.frameSize] + return s.ary[len(s.ary)-s.frameSize:] +} + +func (s *stack) reset() { + s.ary = s.ary[:0] +} + +func (s *stack) offset(size int) stackFrame { + return s.ary[size-s.frameSize : size] +} + +func (s *stack) setSize(size int) { + s.ary = s.ary[:size] +} + +func (f stackFrame) clearExtra() { + for i := 2; i < len(f); i++ { + f[i] = -1 + } +} + +// number of UVector elements in the header +const restackframeHdrCount = 2 + +// Start-Of-Match type. Used by find() to quickly scan to positions where a +// +// match might start before firing up the full match engine. 
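+//
+// For example, a pattern beginning with a literal (say "abc.*") is classified as
+// startChar/startString with initialChar 'a', so Find() only attempts a full match
+// at input positions holding that character, while a pattern anchored with ^ or \A
+// is classified as startStart and is attempted only at the start of the region
+// (see the switch on startType in Find()).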
+type startOfMatch int8 + +const ( + startNoInfo startOfMatch = iota // No hint available. + startChar // Match starts with a literal code point. + startSet // Match starts with something matching a set. + startStart // Match starts at start of buffer only (^ or \A) + startLine // Match starts with ^ in multi-line mode. + startString // Match starts with a literal string. +) + +func (som startOfMatch) String() string { + switch som { + case startNoInfo: + return "START_NO_INFO" + case startChar: + return "START_CHAR" + case startSet: + return "START_SET" + case startStart: + return "START_START" + case startLine: + return "START_LINE" + case startString: + return "START_STRING" + default: + panic("unknown StartOfMatch") + } +} + +type caseFoldIterator struct { + chars []rune + index int + limit int + + foldChars []uint16 +} + +func (it *caseFoldIterator) next() rune { + if len(it.foldChars) == 0 { + // We are not in a string folding of an earlier character. + // Start handling the next char from the input UText. + if it.index >= it.limit { + return -1 + } + + originalC := it.chars[it.index] + it.index++ + + originalC, it.foldChars = ucase.FullFolding(originalC) + if len(it.foldChars) == 0 { + // input code point folds to a single code point, possibly itself. + return originalC + } + } + + var res rune + res, it.foldChars = utf16.NextUnsafe(it.foldChars) + return res +} + +func (it *caseFoldIterator) inExpansion() bool { + return len(it.foldChars) > 0 +} + +func newCaseFoldIterator(chars []rune, start, limit int) caseFoldIterator { + return caseFoldIterator{ + chars: chars, + index: start, + limit: limit, + } +} diff --git a/go/mysql/icuregex/pattern.go b/go/mysql/icuregex/pattern.go new file mode 100644 index 00000000000..f0823a213d4 --- /dev/null +++ b/go/mysql/icuregex/pattern.go @@ -0,0 +1,149 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex + +import ( + "vitess.io/vitess/go/mysql/icuregex/internal/uset" +) + +type Pattern struct { + pattern string + flags RegexpFlag + + compiledPat []instruction + literalText []rune + + sets []*uset.UnicodeSet + + minMatchLen int32 + frameSize int + dataSize int + + groupMap []int32 + + startType startOfMatch + initialStringIdx int + initialStringLen int + initialChars *uset.UnicodeSet + initialChar rune + needsAltInput bool + + namedCaptureMap map[string]int +} + +func NewPattern(flags RegexpFlag) *Pattern { + return &Pattern{ + flags: flags, + initialChars: uset.New(), + // Slot zero of the vector of sets is reserved. Fill it here. 
+ sets: []*uset.UnicodeSet{nil}, + } +} + +func MustCompileString(in string, flags RegexpFlag) *Pattern { + pat, err := CompileString(in, flags) + if err != nil { + panic(err) + } + return pat +} + +func Compile(in []rune, flags RegexpFlag) (*Pattern, error) { + pat := NewPattern(flags) + cmp := newCompiler(pat) + if err := cmp.compile(in); err != nil { + return nil, err + } + return pat, nil +} + +func CompileString(in string, flags RegexpFlag) (*Pattern, error) { + pat := NewPattern(flags) + cmp := newCompiler(pat) + if err := cmp.compile([]rune(in)); err != nil { + return nil, err + } + return pat, nil +} + +func (p *Pattern) Match(input string) *Matcher { + m := NewMatcher(p) + m.ResetString(input) + return m +} + +type RegexpFlag int32 + +const ( + /** Enable case insensitive matching. @stable ICU 2.4 */ + CaseInsensitive RegexpFlag = 2 + + /** Allow white space and comments within patterns @stable ICU 2.4 */ + Comments RegexpFlag = 4 + + /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. + * @stable ICU 2.4 */ + DotAll RegexpFlag = 32 + + /** If set, treat the entire pattern as a literal string. + * Metacharacters or escape sequences in the input sequence will be given + * no special meaning. + * + * The flag UREGEX_CASE_INSENSITIVE retains its impact + * on matching when used in conjunction with this flag. + * The other flags become superfluous. + * + * @stable ICU 4.0 + */ + Literal RegexpFlag = 16 + + /** Control behavior of "$" and "^" + * If set, recognize line terminators within string, + * otherwise, match only at start and end of input string. + * @stable ICU 2.4 */ + Multiline RegexpFlag = 8 + + /** Unix-only line endings. + * When this mode is enabled, only \\u000a is recognized as a line ending + * in the behavior of ., ^, and $. + * @stable ICU 4.0 + */ + UnixLines RegexpFlag = 1 + + /** Unicode word boundaries. + * If set, \b uses the Unicode TR 29 definition of word boundaries. + * Warning: Unicode word boundaries are quite different from + * traditional regular expression word boundaries. See + * http://unicode.org/reports/tr29/#Word_Boundaries + * @stable ICU 2.8 + */ + UWord RegexpFlag = 256 + + /** Error on Unrecognized backslash escapes. + * If set, fail with an error on patterns that contain + * backslash-escaped ASCII letters without a known special + * meaning. If this flag is not set, these + * escaped letters represent themselves. + * @stable ICU 4.0 + */ + ErrorOnUnknownEscapes RegexpFlag = 512 +) diff --git a/go/mysql/icuregex/perl_test.go b/go/mysql/icuregex/perl_test.go new file mode 100644 index 00000000000..0e7beda9fbd --- /dev/null +++ b/go/mysql/icuregex/perl_test.go @@ -0,0 +1,216 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex + +import ( + "bufio" + "os" + "strconv" + "strings" + "testing" +) + +func TestPerl(t *testing.T) { + f, err := os.Open("testdata/re_tests.txt") + if err != nil { + t.Fatalf("failed to open test data: %v", err) + } + defer f.Close() + + flagPat := MustCompileString(`('?)(.*)\1(.*)`, 0) + flagMat := NewMatcher(flagPat) + + groupsPat := MustCompileString(`\$([+\-])\[(\d+)\]`, 0) + groupsMat := NewMatcher(groupsPat) + + cgPat := MustCompileString(`\$(\d+)`, 0) + cgMat := NewMatcher(cgPat) + + group := func(m *Matcher, idx int) string { + g, _ := m.Group(idx) + return g + } + + lookingAt := func(m *Matcher) bool { + ok, err := m.LookingAt() + if err != nil { + t.Fatalf("failed to match with LookingAt(): %v", err) + } + return ok + } + + replacer := strings.NewReplacer( + `${bang}`, "!", + `${nulnul}`, "\x00\x00", + `${ffff}`, "\uffff", + ) + + scanner := bufio.NewScanner(f) + var lineno int + + for scanner.Scan() { + lineno++ + fields := strings.Split(scanner.Text(), "\t") + + flagMat.ResetString(fields[0]) + ok, _ := flagMat.Matches() + if !ok { + t.Fatalf("could not match pattern+flags (line %d)", lineno) + } + + pattern, _ := flagMat.Group(2) + pattern = replacer.Replace(pattern) + + flagStr, _ := flagMat.Group(3) + var flags RegexpFlag + if strings.IndexByte(flagStr, 'i') >= 0 { + flags |= CaseInsensitive + } + if strings.IndexByte(flagStr, 'm') >= 0 { + flags |= Multiline + } + if strings.IndexByte(flagStr, 'x') >= 0 { + flags |= Comments + } + + testPat, err := CompileString(pattern, flags) + if err != nil { + if cerr, ok := err.(*CompileError); ok && cerr.Code == Unimplemented { + continue + } + if strings.IndexByte(fields[2], 'c') == -1 && strings.IndexByte(fields[2], 'i') == -1 { + t.Errorf("line %d: ICU error %q", lineno, err) + } + continue + } + + if strings.IndexByte(fields[2], 'i') >= 0 { + continue + } + if strings.IndexByte(fields[2], 'c') >= 0 { + t.Errorf("line %d: expected error", lineno) + continue + } + + matchString := fields[1] + matchString = replacer.Replace(matchString) + matchString = strings.ReplaceAll(matchString, `\n`, "\n") + + testMat := testPat.Match(matchString) + found, _ := testMat.Find() + expected := strings.IndexByte(fields[2], 'y') >= 0 + + if expected != found { + t.Errorf("line %d: expected %v, found %v", lineno, expected, found) + continue + } + + if !found { + continue + } + + var result []byte + var perlExpr = fields[3] + + for len(perlExpr) > 0 { + groupsMat.ResetString(perlExpr) + cgMat.ResetString(perlExpr) + + switch { + case strings.HasPrefix(perlExpr, "$&"): + result = append(result, group(testMat, 0)...) + perlExpr = perlExpr[2:] + + case lookingAt(groupsMat): + groupNum, err := strconv.ParseInt(group(groupsMat, 2), 10, 32) + if err != nil { + t.Fatalf("failed to parse Perl pattern: %v", err) + } + + var matchPosition int + if group(groupsMat, 1) == "+" { + matchPosition = testMat.EndForGroup(int(groupNum)) + } else { + matchPosition = testMat.StartForGroup(int(groupNum)) + } + if matchPosition != -1 { + result = strconv.AppendInt(result, int64(matchPosition), 10) + } + + perlExpr = perlExpr[groupsMat.EndForGroup(0):] + + case lookingAt(cgMat): + groupNum, err := strconv.ParseInt(group(cgMat, 1), 10, 32) + if err != nil { + t.Fatalf("failed to parse Perl pattern: %v", err) + } + result = append(result, group(testMat, int(groupNum))...) 
+ perlExpr = perlExpr[cgMat.EndForGroup(0):] + + case strings.HasPrefix(perlExpr, "@-"): + for i := 0; i <= testMat.GroupCount(); i++ { + if i > 0 { + result = append(result, ' ') + } + result = strconv.AppendInt(result, int64(testMat.StartForGroup(i)), 10) + } + perlExpr = perlExpr[2:] + + case strings.HasPrefix(perlExpr, "@+"): + for i := 0; i <= testMat.GroupCount(); i++ { + if i > 0 { + result = append(result, ' ') + } + result = strconv.AppendInt(result, int64(testMat.EndForGroup(i)), 10) + } + perlExpr = perlExpr[2:] + + case strings.HasPrefix(perlExpr, "\\"): + if len(perlExpr) > 1 { + perlExpr = perlExpr[1:] + } + c := perlExpr[0] + switch c { + case 'n': + c = '\n' + } + result = append(result, c) + perlExpr = perlExpr[1:] + + default: + result = append(result, perlExpr[0]) + perlExpr = perlExpr[1:] + } + } + + var expectedS string + if len(fields) > 4 { + expectedS = fields[4] + expectedS = replacer.Replace(expectedS) + expectedS = strings.ReplaceAll(expectedS, `\n`, "\n") + } + + if expectedS != string(result) { + t.Errorf("line %d: Incorrect Perl expression results for %s\nwant: %q\ngot: %q", lineno, pattern, expectedS, result) + } + } +} diff --git a/go/mysql/icuregex/sets.go b/go/mysql/icuregex/sets.go new file mode 100644 index 00000000000..0f745b3374d --- /dev/null +++ b/go/mysql/icuregex/sets.go @@ -0,0 +1,104 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package icuregex + +import ( + "vitess.io/vitess/go/mysql/icuregex/internal/uprops" + "vitess.io/vitess/go/mysql/icuregex/internal/uset" +) + +var staticPropertySets [13]*uset.UnicodeSet + +func init() { + staticPropertySets[urxIswordSet] = func() *uset.UnicodeSet { + s := uset.New() + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`\p{Alphabetic}`, 0)) + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`\p{M}`, 0)) + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`\p{Nd}`, 0)) + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`\p{Pc}`, 0)) + s.AddRune(0x200c) + s.AddRune(0x200d) + return s.Freeze() + }() + + staticPropertySets[urxIsspaceSet] = uprops.MustNewUnicodeSetFomPattern(`\p{Whitespace}`, 0).Freeze() + + staticPropertySets[urxGcExtend] = uprops.MustNewUnicodeSetFomPattern(`\p{Grapheme_Extend}`, 0).Freeze() + staticPropertySets[urxGcControl] = func() *uset.UnicodeSet { + s := uset.New() + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`[:Zl:]`, 0)) + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`[:Zp:]`, 0)) + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`[:Cc:]`, 0)) + s.AddAll(uprops.MustNewUnicodeSetFomPattern(`[:Cf:]`, 0)) + s.RemoveAll(uprops.MustNewUnicodeSetFomPattern(`[:Grapheme_Extend:]`, 0)) + return s.Freeze() + }() + staticPropertySets[urxGcL] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=L}`, 0).Freeze() + staticPropertySets[urxGcLv] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=LV}`, 0).Freeze() + staticPropertySets[urxGcLvt] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=LVT}`, 0).Freeze() + staticPropertySets[urxGcV] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=V}`, 0).Freeze() + staticPropertySets[urxGcT] = uprops.MustNewUnicodeSetFomPattern(`\p{Hangul_Syllable_Type=T}`, 0).Freeze() + + staticPropertySets[urxGcNormal] = func() *uset.UnicodeSet { + s := uset.New() + s.Complement() + s.RemoveRuneRange(0xac00, 0xd7a4) + s.RemoveAll(staticPropertySets[urxGcControl]) + s.RemoveAll(staticPropertySets[urxGcL]) + s.RemoveAll(staticPropertySets[urxGcV]) + s.RemoveAll(staticPropertySets[urxGcT]) + return s.Freeze() + }() +} + +var staticSetUnescape = func() *uset.UnicodeSet { + u := uset.New() + u.AddString("acefnrtuUx") + return u.Freeze() +}() + +const ( + ruleSetDigitChar = 128 + ruleSetASCIILetter = 129 + ruleSetRuleChar = 130 + ruleSetCount = 131 - 128 +) + +var staticRuleSet = [ruleSetCount]*uset.UnicodeSet{ + func() *uset.UnicodeSet { + u := uset.New() + u.AddRuneRange('0', '9') + return u.Freeze() + }(), + func() *uset.UnicodeSet { + u := uset.New() + u.AddRuneRange('A', 'Z') + u.AddRuneRange('a', 'z') + return u.Freeze() + }(), + func() *uset.UnicodeSet { + u := uset.New() + u.AddString("*?+[(){}^$|\\.") + u.Complement() + return u.Freeze() + }(), +} diff --git a/go/mysql/icuregex/sets_test.go b/go/mysql/icuregex/sets_test.go new file mode 100644 index 00000000000..d33552732f2 --- /dev/null +++ b/go/mysql/icuregex/sets_test.go @@ -0,0 +1,66 @@ +/* +© 2016 and later: Unicode, Inc. and others. +Copyright (C) 2004-2015, International Business Machines Corporation and others. +Copyright 2023 The Vitess Authors. + +This file contains code derived from the Unicode Project's ICU library. +License & terms of use for the original code: http://www.unicode.org/copyright.html + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package icuregex + +import ( + "testing" +) + +func TestStaticSetContents(t *testing.T) { + // These are the number of codepoints contained in each of the static sets as of ICU69-1, + // as to sanity check that we're re-creating the sets properly. + // This table must be re-created when updating Unicode versions. + var ExpectedSetSizes = map[int]int{ + 1: 134564, + 4: 25, + 5: 1102451, + 6: 1979, + 7: 131, + 8: 125, + 9: 399, + 10: 10773, + 11: 95, + 12: 137, + } + + for setid, expected := range ExpectedSetSizes { + if got := staticPropertySets[setid].Len(); got != expected { + t.Fatalf("static set [%d] has wrong size: got %d, expected %d", setid, got, expected) + } + } +} + +func TestStaticFreeze(t *testing.T) { + for _, s := range staticPropertySets { + if err := s.FreezeCheck_(); err != nil { + t.Error(err) + } + } + for _, s := range staticRuleSet { + if err := s.FreezeCheck_(); err != nil { + t.Error(err) + } + } + if err := staticSetUnescape.FreezeCheck_(); err != nil { + t.Error(err) + } +} diff --git a/go/mysql/icuregex/testdata/re_tests.txt b/go/mysql/icuregex/testdata/re_tests.txt new file mode 100644 index 00000000000..c18b638f9b3 --- /dev/null +++ b/go/mysql/icuregex/testdata/re_tests.txt @@ -0,0 +1,923 @@ +abc abc y $& abc +abc abc y $-[0] 0 +abc abc y $+[0] 3 +abc xbc n - - +abc axc n - - +abc abx n - - +abc xabcy y $& abc +abc xabcy y $-[0] 1 +abc xabcy y $+[0] 4 +abc ababc y $& abc +abc ababc y $-[0] 2 +abc ababc y $+[0] 5 +ab*c abc y $& abc +ab*c abc y $-[0] 0 +ab*c abc y $+[0] 3 +ab*bc abc y $& abc +ab*bc abc y $-[0] 0 +ab*bc abc y $+[0] 3 +ab*bc abbc y $& abbc +ab*bc abbc y $-[0] 0 +ab*bc abbc y $+[0] 4 +ab*bc abbbbc y $& abbbbc +ab*bc abbbbc y $-[0] 0 +ab*bc abbbbc y $+[0] 6 +.{1} abbbbc y $& a +.{1} abbbbc y $-[0] 0 +.{1} abbbbc y $+[0] 1 +.{3,4} abbbbc y $& abbb +.{3,4} abbbbc y $-[0] 0 +.{3,4} abbbbc y $+[0] 4 +ab{0,}bc abbbbc y $& abbbbc +ab{0,}bc abbbbc y $-[0] 0 +ab{0,}bc abbbbc y $+[0] 6 +ab+bc abbc y $& abbc +ab+bc abbc y $-[0] 0 +ab+bc abbc y $+[0] 4 +ab+bc abc n - - +ab+bc abq n - - +ab{1,}bc abq n - - +ab+bc abbbbc y $& abbbbc +ab+bc abbbbc y $-[0] 0 +ab+bc abbbbc y $+[0] 6 +ab{1,}bc abbbbc y $& abbbbc +ab{1,}bc abbbbc y $-[0] 0 +ab{1,}bc abbbbc y $+[0] 6 +ab{1,3}bc abbbbc y $& abbbbc +ab{1,3}bc abbbbc y $-[0] 0 +ab{1,3}bc abbbbc y $+[0] 6 +ab{3,4}bc abbbbc y $& abbbbc +ab{3,4}bc abbbbc y $-[0] 0 +ab{3,4}bc abbbbc y $+[0] 6 +ab{4,5}bc abbbbc n - - +ab?bc abbc y $& abbc +ab?bc abc y $& abc +ab{0,1}bc abc y $& abc +ab?bc abbbbc n - - +ab?c abc y $& abc +ab{0,1}c abc y $& abc +^abc$ abc y $& abc +^abc$ abcc n - - +^abc abcc y $& abc +^abc$ aabc n - - +abc$ aabc y $& abc +abc$ aabcd n - - +^ abc y $& +$ abc y $& +a.c abc y $& abc +a.c axc y $& axc +a.*c axyzc y $& axyzc +a.*c axyzd n - - +a[bc]d abc n - - +a[bc]d abd y $& abd +a[b-d]e abd n - - +a[b-d]e ace y $& ace +a[b-d] aac y $& ac +a[-b] a- y $& a- +a[b-] a- y $& a- +a[b-a] - c - Invalid [] range "b-a" +a[]b - ci - Unmatched [ +a[ - c - Unmatched [ +a] a] y $& a] +a[]]b a]b y $& a]b +a[^bc]d aed y $& aed +a[^bc]d abd n - - +a[^-b]c adc y $& adc +a[^-b]c a-c n - - +a[^]b]c a]c n - - +a[^]b]c adc y $& adc 
+\ba\b a- y - - +\ba\b -a y - - +\ba\b -a- y - - +\by\b xy n - - +\by\b yz n - - +\by\b xyz n - - +\Ba\B a- n - - +\Ba\B -a n - - +\Ba\B -a- n - - +\By\b xy y - - +\By\b xy y $-[0] 1 +\By\b xy y $+[0] 2 +\By\b xy y - - +\by\B yz y - - +\By\B xyz y - - +\w a y - - +\w - n - - +\W a n - - +\W - y - - +a\sb a b y - - +a\sb a-b n - - +a\Sb a b n - - +a\Sb a-b y - - +\d 1 y - - +\d - n - - +\D 1 n - - +\D - y - - +[\w] a y - - +[\w] - n - - +[\W] a n - - +[\W] - y - - +a[\s]b a b y - - +a[\s]b a-b n - - +a[\S]b a b n - - +a[\S]b a-b y - - +[\d] 1 y - - +[\d] - n - - +[\D] 1 n - - +[\D] - y - - +ab|cd abc y $& ab +ab|cd abcd y $& ab +()ef def y $&-$1 ef- +()ef def y $-[0] 1 +()ef def y $+[0] 3 +()ef def y $-[1] 1 +()ef def y $+[1] 1 +*a - c - Quantifier follows nothing +(*)b - c - Quantifier follows nothing +$b b n - - +a\ - c - Search pattern not terminated +a\(b a(b y $&-$1 a(b- +a\(*b ab y $& ab +a\(*b a((b y $& a((b +a\\b a\b y $& a\b +abc) - c - Unmatched ) +(abc - c - Unmatched ( +((a)) abc y $&-$1-$2 a-a-a +((a)) abc y $-[0]-$-[1]-$-[2] 0-0-0 +((a)) abc y $+[0]-$+[1]-$+[2] 1-1-1 +((a)) abc by @- 0 0 0 +((a)) abc by @+ 1 1 1 +(a)b(c) abc y $&-$1-$2 abc-a-c +(a)b(c) abc y $-[0]-$-[1]-$-[2] 0-0-2 +(a)b(c) abc y $+[0]-$+[1]-$+[2] 3-1-3 +a+b+c aabbabc y $& abc +a{1,}b{1,}c aabbabc y $& abc +a** - c - Nested quantifiers +a.+?c abcabc y $& abc +(a+|b)* ab y $&-$1 ab-b +(a+|b)* ab y $-[0] 0 +(a+|b)* ab y $+[0] 2 +(a+|b)* ab y $-[1] 1 +(a+|b)* ab y $+[1] 2 +(a+|b){0,} ab y $&-$1 ab-b +(a+|b)+ ab y $&-$1 ab-b +(a+|b){1,} ab y $&-$1 ab-b +(a+|b)? ab y $&-$1 a-a +(a+|b){0,1} ab y $&-$1 a-a +)( - c - Unmatched ) +[^ab]* cde y $& cde +abc n - - +a* y $& +([abc])*d abbbcd y $&-$1 abbbcd-c +([abc])*bcd abcd y $&-$1 abcd-a +a|b|c|d|e e y $& e +(a|b|c|d|e)f ef y $&-$1 ef-e +(a|b|c|d|e)f ef y $-[0] 0 +(a|b|c|d|e)f ef y $+[0] 2 +(a|b|c|d|e)f ef y $-[1] 0 +(a|b|c|d|e)f ef y $+[1] 1 +abcd*efg abcdefg y $& abcdefg +ab* xabyabbbz y $& ab +ab* xayabbbz y $& a +(ab|cd)e abcde y $&-$1 cde-cd +[abhgefdc]ij hij y $& hij +^(ab|cd)e abcde n x$1y xy +(abc|)ef abcdef y $&-$1 ef- +(a|b)c*d abcd y $&-$1 bcd-b +(ab|ab*)bc abc y $&-$1 abc-a +a([bc]*)c* abc y $&-$1 abc-bc +a([bc]*)(c*d) abcd y $&-$1-$2 abcd-bc-d +a([bc]*)(c*d) abcd y $-[0] 0 +a([bc]*)(c*d) abcd y $+[0] 4 +a([bc]*)(c*d) abcd y $-[1] 1 +a([bc]*)(c*d) abcd y $+[1] 3 +a([bc]*)(c*d) abcd y $-[2] 3 +a([bc]*)(c*d) abcd y $+[2] 4 +a([bc]+)(c*d) abcd y $&-$1-$2 abcd-bc-d +a([bc]*)(c+d) abcd y $&-$1-$2 abcd-b-cd +a([bc]*)(c+d) abcd y $-[0] 0 +a([bc]*)(c+d) abcd y $+[0] 4 +a([bc]*)(c+d) abcd y $-[1] 1 +a([bc]*)(c+d) abcd y $+[1] 2 +a([bc]*)(c+d) abcd y $-[2] 2 +a([bc]*)(c+d) abcd y $+[2] 4 +a[bcd]*dcdcde adcdcde y $& adcdcde +a[bcd]+dcdcde adcdcde n - - +(ab|a)b*c abc y $&-$1 abc-ab +(ab|a)b*c abc y $-[0] 0 +(ab|a)b*c abc y $+[0] 3 +(ab|a)b*c abc y $-[1] 0 +(ab|a)b*c abc y $+[1] 2 +((a)(b)c)(d) abcd y $1-$2-$3-$4 abc-a-b-d +((a)(b)c)(d) abcd y $-[0] 0 +((a)(b)c)(d) abcd y $+[0] 4 +((a)(b)c)(d) abcd y $-[1] 0 +((a)(b)c)(d) abcd y $+[1] 3 +((a)(b)c)(d) abcd y $-[2] 0 +((a)(b)c)(d) abcd y $+[2] 1 +((a)(b)c)(d) abcd y $-[3] 1 +((a)(b)c)(d) abcd y $+[3] 2 +((a)(b)c)(d) abcd y $-[4] 3 +((a)(b)c)(d) abcd y $+[4] 4 +[a-zA-Z_][a-zA-Z0-9_]* alpha y $& alpha +^a(bc+|b[eh])g|.h$ abh y $&-$1 bh- +(bc+d$|ef*g.|h?i(j|k)) effgz y $&-$1-$2 effgz-effgz- +(bc+d$|ef*g.|h?i(j|k)) ij y $&-$1-$2 ij-ij-j +(bc+d$|ef*g.|h?i(j|k)) effg n - - +(bc+d$|ef*g.|h?i(j|k)) bcdd n - - +(bc+d$|ef*g.|h?i(j|k)) reffgz y $&-$1-$2 effgz-effgz- +((((((((((a)))))))))) a y $10 a +((((((((((a)))))))))) a y $-[0] 0 
+((((((((((a)))))))))) a y $+[0] 1 +((((((((((a)))))))))) a y $-[10] 0 +((((((((((a)))))))))) a y $+[10] 1 +((((((((((a))))))))))\10 aa y $& aa +((((((((((a))))))))))${bang} aa n - - +((((((((((a))))))))))${bang} a! y $& a! +(((((((((a))))))))) a y $& a +multiple words of text uh-uh n - - +multiple words multiple words, yeah y $& multiple words +(.*)c(.*) abcde y $&-$1-$2 abcde-ab-de +\((.*), (.*)\) (a, b) y ($2, $1) (b, a) +[k] ab n - - +abcd abcd y $&-\$&-\\$& abcd-$&-\abcd +a(bc)d abcd y $1-\$1-\\$1 bc-$1-\bc +a[-]?c ac y $& ac +(abc)\1 abcabc y $1 abc +([a-c]*)\1 abcabc y $1 abc +\1 - c - Reference to nonexistent group +\2 - c - Reference to nonexistent group +(a)|\1 a y - - +(a)|\1 x n - - +(a)|\2 - c - Reference to nonexistent group +(([a-c])b*?\2)* ababbbcbc y $&-$1-$2 ababb-bb-b +(([a-c])b*?\2){3} ababbbcbc y $&-$1-$2 ababbbcbc-cbc-c +((\3|b)\2(a)x)+ aaxabxbaxbbx n - - +((\3|b)\2(a)x)+ aaaxabaxbaaxbbax y $&-$1-$2-$3 bbax-bbax-b-a +((\3|b)\2(a)){2,} bbaababbabaaaaabbaaaabba y $&-$1-$2-$3 bbaaaabba-bba-b-a +(a)|(b) b y $-[0] 0 +(a)|(b) b y $+[0] 1 +(a)|(b) b y x$-[1] x +(a)|(b) b y x$+[1] x +(a)|(b) b y $-[2] 0 +(a)|(b) b y $+[2] 1 +'abc'i ABC y $& ABC +'abc'i XBC n - - +'abc'i AXC n - - +'abc'i ABX n - - +'abc'i XABCY y $& ABC +'abc'i ABABC y $& ABC +'ab*c'i ABC y $& ABC +'ab*bc'i ABC y $& ABC +'ab*bc'i ABBC y $& ABBC +'ab*?bc'i ABBBBC y $& ABBBBC +'ab{0,}?bc'i ABBBBC y $& ABBBBC +'ab+?bc'i ABBC y $& ABBC +'ab+bc'i ABC n - - +'ab+bc'i ABQ n - - +'ab{1,}bc'i ABQ n - - +'ab+bc'i ABBBBC y $& ABBBBC +'ab{1,}?bc'i ABBBBC y $& ABBBBC +'ab{1,3}?bc'i ABBBBC y $& ABBBBC +'ab{3,4}?bc'i ABBBBC y $& ABBBBC +'ab{4,5}?bc'i ABBBBC n - - +'ab??bc'i ABBC y $& ABBC +'ab??bc'i ABC y $& ABC +'ab{0,1}?bc'i ABC y $& ABC +'ab??bc'i ABBBBC n - - +'ab??c'i ABC y $& ABC +'ab{0,1}?c'i ABC y $& ABC +'^abc$'i ABC y $& ABC +'^abc$'i ABCC n - - +'^abc'i ABCC y $& ABC +'^abc$'i AABC n - - +'abc$'i AABC y $& ABC +'^'i ABC y $& +'$'i ABC y $& +'a.c'i ABC y $& ABC +'a.c'i AXC y $& AXC +'a.*?c'i AXYZC y $& AXYZC +'a.*c'i AXYZD n - - +'a[bc]d'i ABC n - - +'a[bc]d'i ABD y $& ABD +'a[b-d]e'i ABD n - - +'a[b-d]e'i ACE y $& ACE +'a[b-d]'i AAC y $& AC +'a[-b]'i A- y $& A- +'a[b-]'i A- y $& A- +'a[b-a]'i - c - Invalid [] range "b-a" +'a[]b'i - ci - Unmatched [ +'a['i - c - Unmatched [ +'a]'i A] y $& A] +'a[]]b'i A]B y $& A]B +'a[^bc]d'i AED y $& AED +'a[^bc]d'i ABD n - - +'a[^-b]c'i ADC y $& ADC +'a[^-b]c'i A-C n - - +'a[^]b]c'i A]C n - - +'a[^]b]c'i ADC y $& ADC +'ab|cd'i ABC y $& AB +'ab|cd'i ABCD y $& AB +'()ef'i DEF y $&-$1 EF- +'*a'i - c - Quantifier follows nothing +'(*)b'i - c - Quantifier follows nothing +'$b'i B n - - +'a\'i - c - Search pattern not terminated +'a\(b'i A(B y $&-$1 A(B- +'a\(*b'i AB y $& AB +'a\(*b'i A((B y $& A((B +'a\\b'i A\B y $& A\B +'abc)'i - c - Unmatched ) +'(abc'i - c - Unmatched ( +'((a))'i ABC y $&-$1-$2 A-A-A +'(a)b(c)'i ABC y $&-$1-$2 ABC-A-C +'a+b+c'i AABBABC y $& ABC +'a{1,}b{1,}c'i AABBABC y $& ABC +'a**'i - c - Nested quantifiers +'a.+?c'i ABCABC y $& ABC +'a.*?c'i ABCABC y $& ABC +'a.{0,5}?c'i ABCABC y $& ABC +'(a+|b)*'i AB y $&-$1 AB-B +'(a+|b){0,}'i AB y $&-$1 AB-B +'(a+|b)+'i AB y $&-$1 AB-B +'(a+|b){1,}'i AB y $&-$1 AB-B +'(a+|b)?'i AB y $&-$1 A-A +'(a+|b){0,1}'i AB y $&-$1 A-A +'(a+|b){0,1}?'i AB y $&-$1 - +')('i - c - Unmatched ) +'[^ab]*'i CDE y $& CDE +'abc'i n - - +'a*'i y $& +'([abc])*d'i ABBBCD y $&-$1 ABBBCD-C +'([abc])*bcd'i ABCD y $&-$1 ABCD-A +'a|b|c|d|e'i E y $& E +'(a|b|c|d|e)f'i EF y $&-$1 EF-E +'abcd*efg'i ABCDEFG y $& ABCDEFG +'ab*'i XABYABBBZ y $& AB +'ab*'i 
XAYABBBZ y $& A +'(ab|cd)e'i ABCDE y $&-$1 CDE-CD +'[abhgefdc]ij'i HIJ y $& HIJ +'^(ab|cd)e'i ABCDE n x$1y XY +'(abc|)ef'i ABCDEF y $&-$1 EF- +'(a|b)c*d'i ABCD y $&-$1 BCD-B +'(ab|ab*)bc'i ABC y $&-$1 ABC-A +'a([bc]*)c*'i ABC y $&-$1 ABC-BC +'a([bc]*)(c*d)'i ABCD y $&-$1-$2 ABCD-BC-D +'a([bc]+)(c*d)'i ABCD y $&-$1-$2 ABCD-BC-D +'a([bc]*)(c+d)'i ABCD y $&-$1-$2 ABCD-B-CD +'a[bcd]*dcdcde'i ADCDCDE y $& ADCDCDE +'a[bcd]+dcdcde'i ADCDCDE n - - +'(ab|a)b*c'i ABC y $&-$1 ABC-AB +'((a)(b)c)(d)'i ABCD y $1-$2-$3-$4 ABC-A-B-D +'[a-zA-Z_][a-zA-Z0-9_]*'i ALPHA y $& ALPHA +'^a(bc+|b[eh])g|.h$'i ABH y $&-$1 BH- +'(bc+d$|ef*g.|h?i(j|k))'i EFFGZ y $&-$1-$2 EFFGZ-EFFGZ- +'(bc+d$|ef*g.|h?i(j|k))'i IJ y $&-$1-$2 IJ-IJ-J +'(bc+d$|ef*g.|h?i(j|k))'i EFFG n - - +'(bc+d$|ef*g.|h?i(j|k))'i BCDD n - - +'(bc+d$|ef*g.|h?i(j|k))'i REFFGZ y $&-$1-$2 EFFGZ-EFFGZ- +'((((((((((a))))))))))'i A y $10 A +'((((((((((a))))))))))\10'i AA y $& AA +'((((((((((a))))))))))${bang}'i AA n - - +'((((((((((a))))))))))${bang}'i A! y $& A! +'(((((((((a)))))))))'i A y $& A +'(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))'i A y $1 A +'(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))'i C y $1 C +'multiple words of text'i UH-UH n - - +'multiple words'i MULTIPLE WORDS, YEAH y $& MULTIPLE WORDS +'(.*)c(.*)'i ABCDE y $&-$1-$2 ABCDE-AB-DE +'\((.*), (.*)\)'i (A, B) y ($2, $1) (B, A) +'[k]'i AB n - - +'abcd'i ABCD y $&-\$&-\\$& ABCD-$&-\ABCD +'a(bc)d'i ABCD y $1-\$1-\\$1 BC-$1-\BC +'a[-]?c'i AC y $& AC +'(abc)\1'i ABCABC y $1 ABC +'([a-c]*)\1'i ABCABC y $1 ABC +a(?!b). abad y $& ad +a(?=d). abad y $& ad +a(?=c|d). abad y $& ad +a(?:b|c|d)(.) ace y $1 e +a(?:b|c|d)*(.) ace y $1 e +a(?:b|c|d)+?(.) ace y $1 e +a(?:b|c|d)+?(.) acdbcdbe y $1 d +a(?:b|c|d)+(.) acdbcdbe y $1 e +a(?:b|c|d){2}(.) acdbcdbe y $1 b +a(?:b|c|d){4,5}(.) acdbcdbe y $1 b +a(?:b|c|d){4,5}?(.) acdbcdbe y $1 d +((foo)|(bar))* foobar y $1-$2-$3 bar-foo-bar +:(?: - c - Sequence (? incomplete +a(?:b|c|d){6,7}(.) acdbcdbe y $1 e +a(?:b|c|d){6,7}?(.) acdbcdbe y $1 e +a(?:b|c|d){5,6}(.) acdbcdbe y $1 e +a(?:b|c|d){5,6}?(.) acdbcdbe y $1 b +a(?:b|c|d){5,7}(.) acdbcdbe y $1 e +a(?:b|c|d){5,7}?(.) acdbcdbe y $1 b +a(?:b|(c|e){1,2}?|d)+?(.) ace y $1$2 ce +^(.+)?B AB y $1 A +^([^a-z])|(\^)$ . y $1 . +^[<>]& <&OUT y $& <& +^(a\1?){4}$ aaaaaaaaaa y $1 aaaa +^(a\1?){4}$ aaaaaaaaa n - - +^(a\1?){4}$ aaaaaaaaaaa n - - +^(a(?(1)\1)){4}$ aaaaaaaaaa y $1 aaaa +^(a(?(1)\1)){4}$ aaaaaaaaa n - - +^(a(?(1)\1)){4}$ aaaaaaaaaaa n - - +((a{4})+) aaaaaaaaa y $1 aaaaaaaa +(((aa){2})+) aaaaaaaaaa y $1 aaaaaaaa +(((a{2}){2})+) aaaaaaaaaa y $1 aaaaaaaa +(?:(f)(o)(o)|(b)(a)(r))* foobar y $1:$2:$3:$4:$5:$6 f:o:o:b:a:r +(?<=a)b ab y $& b +(?<=a)b cb n - - +(?<=a)b b n - - +(?a+)ab aaab n - - +(?>a+)b aaab y - - +([[:]+) a:[b]: yi $1 :[ Java and ICU dont escape [[xyz +([[=]+) a=[b]= yi $1 =[ Java and ICU dont escape [[xyz +([[.]+) a.[b]. 
yi $1 .[ Java and ICU dont escape [[xyz +[a[:xyz: - c - Unmatched [ +[a[:xyz:] - c - POSIX class [:xyz:] unknown +[a[:]b[:c] abc yi $& abc Java and ICU embedded [ is nested set +([a[:xyz:]b]+) pbaq c - POSIX class [:xyz:] unknown +[a[:]b[:c] abc iy $& abc Java and ICU embedded [ is nested set +([[:alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd +([[:alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy +([[:ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- ${nulnul} +([[:cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul} +([[:digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01 +([[:graph:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- +([[:lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd +([[:print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- +([[:punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __-- +([[:space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 +([[:word:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy__ +([[:upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB +([[:xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01 +([[:^alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01 +([[:^alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __-- ${nulnul}${ffff} +([[:^ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${ffff} +([[:^cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- +([[:^digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd +([[:^lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB +([[:^print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul}${ffff} +([[:^punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy +([[:^space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- +([[:^word:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 -- ${nulnul}${ffff} +([[:^upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd01 +([[:^xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 Xy__-- ${nulnul}${ffff} +[[:foo:]] - c - POSIX class [:foo:] unknown +[[:^foo:]] - c - POSIX class [:^foo:] unknown +((?>a+)b) aaab y $1 aaab +(?>(a+))b aaab y $1 aaa +((?>[^()]+)|\([^()]*\))+ ((abc(ade)ufh()()x y $& abc(ade)ufh()()x +(?<=x+)y - c - Variable length lookbehind not implemented +a{37,17} - c - Can't do {n,m} with n > m +\Z a\nb\n y $-[0] 3 +\z a\nb\n y $-[0] 4 +$ a\nb\n y $-[0] 3 +\Z b\na\n y $-[0] 3 +\z b\na\n y $-[0] 4 +$ b\na\n y $-[0] 3 +\Z b\na y $-[0] 3 +\z b\na y $-[0] 3 +$ b\na y $-[0] 3 +'\Z'm a\nb\n y $-[0] 3 +'\z'm a\nb\n y $-[0] 4 +'$'m a\nb\n y $-[0] 1 +'\Z'm b\na\n y $-[0] 3 +'\z'm b\na\n y $-[0] 4 +'$'m b\na\n y $-[0] 1 +'\Z'm b\na y $-[0] 3 +'\z'm b\na y $-[0] 3 +'$'m b\na y $-[0] 1 +a\Z a\nb\n n - - +a\z a\nb\n n - - +a$ a\nb\n n - - +a\Z b\na\n y $-[0] 2 +a\z b\na\n n - - +a$ b\na\n y $-[0] 2 +a\Z b\na y $-[0] 2 +a\z b\na y $-[0] 2 +a$ b\na y $-[0] 2 +'a\Z'm a\nb\n n - - +'a\z'm a\nb\n n - - +'a$'m a\nb\n y $-[0] 0 +'a\Z'm b\na\n y $-[0] 2 +'a\z'm b\na\n n - - +'a$'m b\na\n y $-[0] 2 +'a\Z'm b\na y $-[0] 2 +'a\z'm b\na y $-[0] 2 +'a$'m b\na y $-[0] 2 +aa\Z aa\nb\n n - - +aa\z aa\nb\n n - - +aa$ aa\nb\n n - - +aa\Z b\naa\n y $-[0] 2 +aa\z b\naa\n n - - +aa$ b\naa\n y $-[0] 2 +aa\Z b\naa y $-[0] 2 +aa\z b\naa y $-[0] 2 +aa$ b\naa y $-[0] 2 +'aa\Z'm aa\nb\n n - - +'aa\z'm aa\nb\n n - - +'aa$'m aa\nb\n y $-[0] 0 +'aa\Z'm b\naa\n y $-[0] 2 +'aa\z'm b\naa\n n - - +'aa$'m b\naa\n y $-[0] 2 +'aa\Z'm b\naa y $-[0] 2 +'aa\z'm b\naa y $-[0] 2 +'aa$'m b\naa y $-[0] 2 +aa\Z ac\nb\n n - - +aa\z ac\nb\n n - - +aa$ ac\nb\n n - - +aa\Z b\nac\n n - - +aa\z b\nac\n n - - +aa$ b\nac\n n - - +aa\Z b\nac n - - +aa\z b\nac n - - +aa$ b\nac n - - +'aa\Z'm ac\nb\n n - - +'aa\z'm ac\nb\n n - - +'aa$'m ac\nb\n n - 
- +'aa\Z'm b\nac\n n - - +'aa\z'm b\nac\n n - - +'aa$'m b\nac\n n - - +'aa\Z'm b\nac n - - +'aa\z'm b\nac n - - +'aa$'m b\nac n - - +aa\Z ca\nb\n n - - +aa\z ca\nb\n n - - +aa$ ca\nb\n n - - +aa\Z b\nca\n n - - +aa\z b\nca\n n - - +aa$ b\nca\n n - - +aa\Z b\nca n - - +aa\z b\nca n - - +aa$ b\nca n - - +'aa\Z'm ca\nb\n n - - +'aa\z'm ca\nb\n n - - +'aa$'m ca\nb\n n - - +'aa\Z'm b\nca\n n - - +'aa\z'm b\nca\n n - - +'aa$'m b\nca\n n - - +'aa\Z'm b\nca n - - +'aa\z'm b\nca n - - +'aa$'m b\nca n - - +ab\Z ab\nb\n n - - +ab\z ab\nb\n n - - +ab$ ab\nb\n n - - +ab\Z b\nab\n y $-[0] 2 +ab\z b\nab\n n - - +ab$ b\nab\n y $-[0] 2 +ab\Z b\nab y $-[0] 2 +ab\z b\nab y $-[0] 2 +ab$ b\nab y $-[0] 2 +'ab\Z'm ab\nb\n n - - +'ab\z'm ab\nb\n n - - +'ab$'m ab\nb\n y $-[0] 0 +'ab\Z'm b\nab\n y $-[0] 2 +'ab\z'm b\nab\n n - - +'ab$'m b\nab\n y $-[0] 2 +'ab\Z'm b\nab y $-[0] 2 +'ab\z'm b\nab y $-[0] 2 +'ab$'m b\nab y $-[0] 2 +ab\Z ac\nb\n n - - +ab\z ac\nb\n n - - +ab$ ac\nb\n n - - +ab\Z b\nac\n n - - +ab\z b\nac\n n - - +ab$ b\nac\n n - - +ab\Z b\nac n - - +ab\z b\nac n - - +ab$ b\nac n - - +'ab\Z'm ac\nb\n n - - +'ab\z'm ac\nb\n n - - +'ab$'m ac\nb\n n - - +'ab\Z'm b\nac\n n - - +'ab\z'm b\nac\n n - - +'ab$'m b\nac\n n - - +'ab\Z'm b\nac n - - +'ab\z'm b\nac n - - +'ab$'m b\nac n - - +ab\Z ca\nb\n n - - +ab\z ca\nb\n n - - +ab$ ca\nb\n n - - +ab\Z b\nca\n n - - +ab\z b\nca\n n - - +ab$ b\nca\n n - - +ab\Z b\nca n - - +ab\z b\nca n - - +ab$ b\nca n - - +'ab\Z'm ca\nb\n n - - +'ab\z'm ca\nb\n n - - +'ab$'m ca\nb\n n - - +'ab\Z'm b\nca\n n - - +'ab\z'm b\nca\n n - - +'ab$'m b\nca\n n - - +'ab\Z'm b\nca n - - +'ab\z'm b\nca n - - +'ab$'m b\nca n - - +abb\Z abb\nb\n n - - +abb\z abb\nb\n n - - +abb$ abb\nb\n n - - +abb\Z b\nabb\n y $-[0] 2 +abb\z b\nabb\n n - - +abb$ b\nabb\n y $-[0] 2 +abb\Z b\nabb y $-[0] 2 +abb\z b\nabb y $-[0] 2 +abb$ b\nabb y $-[0] 2 +'abb\Z'm abb\nb\n n - - +'abb\z'm abb\nb\n n - - +'abb$'m abb\nb\n y $-[0] 0 +'abb\Z'm b\nabb\n y $-[0] 2 +'abb\z'm b\nabb\n n - - +'abb$'m b\nabb\n y $-[0] 2 +'abb\Z'm b\nabb y $-[0] 2 +'abb\z'm b\nabb y $-[0] 2 +'abb$'m b\nabb y $-[0] 2 +abb\Z ac\nb\n n - - +abb\z ac\nb\n n - - +abb$ ac\nb\n n - - +abb\Z b\nac\n n - - +abb\z b\nac\n n - - +abb$ b\nac\n n - - +abb\Z b\nac n - - +abb\z b\nac n - - +abb$ b\nac n - - +'abb\Z'm ac\nb\n n - - +'abb\z'm ac\nb\n n - - +'abb$'m ac\nb\n n - - +'abb\Z'm b\nac\n n - - +'abb\z'm b\nac\n n - - +'abb$'m b\nac\n n - - +'abb\Z'm b\nac n - - +'abb\z'm b\nac n - - +'abb$'m b\nac n - - +abb\Z ca\nb\n n - - +abb\z ca\nb\n n - - +abb$ ca\nb\n n - - +abb\Z b\nca\n n - - +abb\z b\nca\n n - - +abb$ b\nca\n n - - +abb\Z b\nca n - - +abb\z b\nca n - - +abb$ b\nca n - - +'abb\Z'm ca\nb\n n - - +'abb\z'm ca\nb\n n - - +'abb$'m ca\nb\n n - - +'abb\Z'm b\nca\n n - - +'abb\z'm b\nca\n n - - +'abb$'m b\nca\n n - - +'abb\Z'm b\nca n - - +'abb\z'm b\nca n - - +'abb$'m b\nca n - - +(^|x)(c) ca y $2 c +a*abc?xyz+pqr{3}ab{2,}xy{4,5}pq{0,6}AB{0,}zz x n - - +a(?{$a=2;$b=3;($b)=$a})b yabz y $b 2 +round\(((?>[^()]+))\) _I(round(xs * sz),1) y $1 xs * sz +'((?x:.) )' x y $1- x - +'((?-x:.) )'x x y $1- x- +foo.bart foo.bart y - - +'^d[x][x][x]'m abcd\ndxxx y - - +.X(.+)+X bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - # TODO: ICU doesn't optimize on trailing literals in pattern. 
+.X(.+)+XX bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.XX(.+)+X bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.X(.+)+X bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.X(.+)+XX bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.XX(.+)+X bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.X(.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.X(.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.XX(.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.X(.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.X(.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.XX(.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.[X](.+)+[X] bbbbXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.[X](.+)+[X][X] bbbbXcXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.[X][X](.+)+[X] bbbbXXcXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa yi - - +.[X](.+)+[X] bbbbXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.[X](.+)+[X][X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +.[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - - +tt+$ xxxtt y - - +([a-\d]+) za-9z yi $1 a-9 +([\d-z]+) a0-za y $1 0-z +([\d-\s]+) a0- z y $1 0- +([a-[:digit:]]+) za-9z y $1 a-9 +([[:digit:]-z]+) =0-z= y $1 0-z +([[:digit:]-[:alpha:]]+) =0-z= iy $1 0-z Set difference in ICU +\GX.*X aaaXbX n - - +(\d+\.\d+) 3.1415926 y $1 3.1415926 +(\ba.{0,10}br) have a web browser y $1 a web br +'\.c(pp|xx|c)?$'i Changes n - - +'\.c(pp|xx|c)?$'i IO.c y - - +'(\.c(pp|xx|c)?$)'i IO.c y $1 .c +^([a-z]:) C:/ n - - +'^\S\s+aa$'m \nx aa y - - +(^|a)b ab y - - +^([ab]*?)(b)?(c)$ abac y -$2- -- +(\w)?(abc)\1b abcab n - - +^(?:.,){2}c a,b,c y - - +^(.,){2}c a,b,c y $1 b, +^(?:[^,]*,){2}c a,b,c y - - +^([^,]*,){2}c a,b,c y $1 b, +^([^,]*,){3}d aaa,b,c,d y $1 c, +^([^,]*,){3,}d aaa,b,c,d y $1 c, +^([^,]*,){0,3}d aaa,b,c,d y $1 c, +^([^,]{1,3},){3}d aaa,b,c,d y $1 c, +^([^,]{1,3},){3,}d aaa,b,c,d y $1 c, +^([^,]{1,3},){0,3}d aaa,b,c,d y $1 c, +^([^,]{1,},){3}d aaa,b,c,d y $1 c, +^([^,]{1,},){3,}d aaa,b,c,d y $1 c, +^([^,]{1,},){0,3}d aaa,b,c,d y $1 c, +^([^,]{0,3},){3}d aaa,b,c,d y $1 c, +^([^,]{0,3},){3,}d aaa,b,c,d y $1 c, +^([^,]{0,3},){0,3}d aaa,b,c,d y $1 c, +(?i) y - - +'(?!\A)x'm a\nxb\n y - - +^(a(b)?)+$ aba yi -$1-$2- -a-- Java disagrees. Not clear who is right. +'^.{9}abc.*\n'm 123\nabcabcabcabc\n y - - +^(a)?a$ a y -$1- -- +^(a)?(?(1)a|b)+$ a n - - +^(a\1?)(a\1?)(a\2?)(a\3?)$ aaaaaa y $1,$2,$3,$4 a,aa,a,aa +^(a\1?){4}$ aaaaaa y $1 aa +^(0+)?(?:x(1))? x1 y - - +^([0-9a-fA-F]+)(?:x([0-9a-fA-F]+)?)(?:x([0-9a-fA-F]+))? 012cxx0190 y - - +^(b+?|a){1,2}c bbbac y $1 a +^(b+?|a){1,2}c bbbbac y $1 a +\((\w\. \w+)\) cd. (A. Tw) y -$1- -A. Tw- +((?:aaaa|bbbb)cccc)? aaaacccc y - - +((?:aaaa|bbbb)cccc)? 
bbbbcccc y - - +(a)?(a)+ a y $1:$2 :a - +(ab)?(ab)+ ab y $1:$2 :ab - +(abc)?(abc)+ abc y $1:$2 :abc - +'b\s^'m a\nb\n n - - +\ba a y - - +^(a(??{"(?!)"})|(a)(?{1}))b ab yi $2 a # [ID 20010811.006] +ab(?i)cd AbCd n - - # [ID 20010809.023] +ab(?i)cd abCd y - - +(A|B)*(?(1)(CD)|(CD)) CD y $2-$3 -CD +(A|B)*(?(1)(CD)|(CD)) ABCD y $2-$3 CD- +(A|B)*?(?(1)(CD)|(CD)) CD y $2-$3 -CD # [ID 20010803.016] +(A|B)*?(?(1)(CD)|(CD)) ABCD y $2-$3 CD- +'^(o)(?!.*\1)'i Oo n - - +(.*)\d+\1 abc12bc y $1 bc +(?m:(foo\s*$)) foo\n bar y $1 foo +(.*)c abcd y $1 ab +(.*)(?=c) abcd y $1 ab +(.*)(?=c)c abcd yB $1 ab +(.*)(?=b|c) abcd y $1 ab +(.*)(?=b|c)c abcd y $1 ab +(.*)(?=c|b) abcd y $1 ab +(.*)(?=c|b)c abcd y $1 ab +(.*)(?=[bc]) abcd y $1 ab +(.*)(?=[bc])c abcd yB $1 ab +(.*)(?<=b) abcd y $1 ab +(.*)(?<=b)c abcd y $1 ab +(.*)(?<=b|c) abcd y $1 abc +(.*)(?<=b|c)c abcd y $1 ab +(.*)(?<=c|b) abcd y $1 abc +(.*)(?<=c|b)c abcd y $1 ab +(.*)(?<=[bc]) abcd y $1 abc +(.*)(?<=[bc])c abcd y $1 ab +(.*?)c abcd y $1 ab +(.*?)(?=c) abcd y $1 ab +(.*?)(?=c)c abcd yB $1 ab +(.*?)(?=b|c) abcd y $1 a +(.*?)(?=b|c)c abcd y $1 ab +(.*?)(?=c|b) abcd y $1 a +(.*?)(?=c|b)c abcd y $1 ab +(.*?)(?=[bc]) abcd y $1 a +(.*?)(?=[bc])c abcd yB $1 ab +(.*?)(?<=b) abcd y $1 ab +(.*?)(?<=b)c abcd y $1 ab +(.*?)(?<=b|c) abcd y $1 ab +(.*?)(?<=b|c)c abcd y $1 ab +(.*?)(?<=c|b) abcd y $1 ab +(.*?)(?<=c|b)c abcd y $1 ab +(.*?)(?<=[bc]) abcd y $1 ab +(.*?)(?<=[bc])c abcd y $1 ab +2(]*)?$\1 2 y $& 2 +(??{}) x yi - - diff --git a/go/mysql/icuregex/testdata/regextst.txt b/go/mysql/icuregex/testdata/regextst.txt new file mode 100644 index 00000000000..8d5d2c34a8e --- /dev/null +++ b/go/mysql/icuregex/testdata/regextst.txt @@ -0,0 +1,2793 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (c) 2001-2015 International Business Machines +# Corporation and others. All Rights Reserved. +# +# file: +# +# ICU regular expression test cases. +# +# format: one test case per line, +# = [# comment] +# = "" +# = "" +# the quotes on the pattern and match string can be " or ' or / +# = text, with the start and end of each +# capture group tagged with .... The overall match, +# if any, is group 0, as in <0>matched text +# A region can be specified with ... tags. +# Standard ICU unescape will be applied, allowing \u, \U, etc. to appear. +# +# = any combination of +# i case insensitive match +# x free spacing and comments +# s dot-matches-all mode +# m multi-line mode. +# ($ and ^ match at embedded new-lines) +# D Unix Lines mode (only recognize 0x0a as new-line) +# Q UREGEX_LITERAL flag. Entire pattern is literal string. +# v If icu configured without break iteration, this +# regex test pattern should not compile. +# e set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag +# d dump the compiled pattern +# t trace operation of match engine. +# 2-9 a digit between 2 and 9, specifies the number of +# times to execute find(). The expected results are +# for the last find() in the sequence. +# G Only check match / no match. Do not check capture groups. +# E Pattern compilation error expected +# L Use LookingAt() rather than find() +# M Use matches() rather than find(). +# +# a Use non-Anchoring Bounds. +# b Use Transparent Bounds. +# The a and b options only make a difference if +# a region has been specified in the string. +# z|Z hitEnd was expected(z) or not expected (Z). +# With neither, hitEnd is not checked. +# y|Y Require End expected(y) or not expected (Y). 
+# +# White space must be present between the flags and the match string. +# + +# Look-ahead expressions +# +"(?!0{5})(\d{5})" "<0><1>00001zzzz" +"(?!0{5})(\d{5})z" "<0><1>00001zzzz" +"(?!0{5})(\d{5})(?!y)" "<0><1>00001zzzz" +"abc(?=def)" "<0>abcdef" +"(.*)(?=c)" "<0><1>abcdef" + +"(?:.*)(?=c)" "abcdef" +"(?:.*)(?=c)" b "<0>abcdef" # transparent bounds +"(?:.*)(?=c)" bM "<0>abcdef" # transparent bounds + +"(?:.*)(?=(c))" b "<0>ab<1>cdef" # Capture in look-ahead +"(?=(.)\1\1)\1" "abcc<0><1>dddefg" # Backrefs to look-ahead capture + +".(?!\p{L})" "abc<0>d " # Negated look-ahead +".(?!(\p{L}))" "abc<0>d " # Negated look-ahead, no capture + # visible outside of look-ahead +"and(?=roid)" L "<0>android" +"and(?=roid)" M "android" +"and(?=roid)" bM "<0>android" + +"and(?!roid)" L "<0>androix" +"and(?!roid)" L "android" + +"and(?!roid)" M "<0>android" # Opaque bounds +"and(?!roid)" bM "android" +"and(?!roid)" bM "<0>androix" + +# +# Negated Lookahead, various regions and region transparency +# +"abc(?!def)" "<0>abcxyz" +"abc(?!def)" "abcdef" +"abc(?!def)" "<0>abcdef" +"abc(?!def)" b "abcdef" +"abc(?!def)" b "<0>abcxyz" + +# +# Nested Lookahead / Behind +# +"one(?=(?:(?!).)*)" "<0>one stuff" +"one(?=(?:(?!).)*)" "one " + +# More nesting lookaround: pattern matches "qq" when not preceded by 'a' and followed by 'z' +"(?qqc" +"(?qqc" +"(?A<0>jk<2>B" +"(?=(?<=(\p{Lu})(?=..(\p{Lu})))).." "ajkB" +"(?=(?<=(\p{Lu})(?=..(\p{Lu})))).." "Ajkb" + +# Nested lookaround cases from bug ICU-20564 +"(?<=(?<=((?=)){0}+))" "<0>abc" +"(?<=c(?<=c((?=c)){1}+))" "c<0><1>cc" + +# +# Anchoring Bounds +# +"^def$" "abc<0>defghi" # anchoring (default) bounds +"^def$" a "abcdefghi" # non-anchoring bounds +"^def" a "<0>defghi" # non-anchoring bounds +"def$" a "abc<0>def" # non-anchoring bounds + +"^.*$" m "<0>line 1\n line 2" +"^.*$" m2 "line 1\n<0> line 2" +"^.*$" m3 "line 1\n line 2" +"^.*$" m "li<0>ne 1\n line 2" # anchoring bounds +"^.*$" m2 "line 1\n line 2" # anchoring bounds +"^.*$" am "line 1\n line 2" # non-anchoring bounds +"^.*$" am "li\n<0>ne \n1\n line 2" # non-anchoring bounds + +# +# HitEnd and RequireEnd for new-lines just before end-of-input +# +"xyz$" yz "<0>xyz\n" +"xyz$" yz "<0>xyz\x{d}\x{a}" + +"xyz$" myz "<0>xyz" # multi-line mode +"xyz$" mYZ "<0>xyz\n" +"xyz$" mYZ "<0>xyz\r\n" +"xyz$" mYZ "<0>xyz\x{85}abcd" + +"xyz$" Yz "xyz\nx" +"xyz$" Yz "xyza" +"xyz$" yz "<0>xyz" + +# +# HitEnd +# +"abcd" Lz "a" +"abcd" Lz "ab" +"abcd" Lz "abc" +"abcd" LZ "<0>abcd" +"abcd" LZ "<0>abcde" +"abcd" LZ "abcx" +"abcd" LZ "abx" +"abcd" Lzi "a" +"abcd" Lzi "ab" +"abcd" Lzi "abc" +"abcd" LZi "<0>abcd" +"abcd" LZi "<0>abcde" +"abcd" LZi "abcx" +"abcd" LZi "abx" + +# +# All Unicode line endings recognized. +# 0a, 0b, 0c, 0d, 0x85, 0x2028, 0x2029 +# Multi-line and non-multiline mode take different paths, so repeated tests. +# +"^def$" mYZ "abc\x{a}<0>def\x{a}ghi" +"^def$" mYZ "abc\x{b}<0>def\x{b}ghi" +"^def$" mYZ "abc\x{c}<0>def\x{c}ghi" +"^def$" mYZ "abc\x{d}<0>def\x{d}ghi" +"^def$" mYZ "abc\x{85}<0>def\x{85}ghi" +"^def$" mYZ "abc\x{2028}<0>def\x{2028}ghi" +"^def$" mYZ "abc\x{2029}<0>def\x{2029}ghi" +"^def$" mYZ "abc\r\n<0>def\r\nghi" + +"^def$" yz "<0>def\x{a}" +"^def$" yz "<0>def\x{b}" +"^def$" yz "<0>def\x{c}" +"^def$" yz "<0>def\x{d}" +"^def$" yz "<0>def\x{85}" +"^def$" yz "<0>def\x{2028}" +"^def$" yz "<0>def\x{2029}" +"^def$" yz "<0>def\r\n" +"^def$" yz "<0>def" + + +# "^def$" "<0>def\x{2028" #TODO: should be an error of some sort. 
+ +# +# UNIX_LINES mode +# +"abc$" D "<0>abc\n" +"abc$" D "abc\r" +"abc$" D "abc\u0085" +"a.b" D "<0>a\rb" +"a.b" D "a\nb" +"(?d)abc$" "<0>abc\n" +"(?d)abc$" "abc\r" +"abc$" mD "<0>abc\ndef" +"abc$" mD "abc\rdef" + +".*def" L "abc\r def xyz" # Normal mode, LookingAt() stops at \r +".*def" DL "<0>abc\r def xyz" # Unix Lines mode, \r not line end. +".*def" DL "abc\n def xyz" + +"(?d)a.b" "a\nb" +"(?d)a.b" "<0>a\rb" + +"^abc" m "xyz\r<0>abc" +"^abc" Dm "xyz\rabc" +"^abc" Dm "xyz\n<0>abc" + + + +# Capturing parens +".(..)." "<0>a<1>bcd" + ".*\A( +hello)" "<0><1> hello" +"(hello)|(goodbye)" "<0><1>hello" +"(hello)|(goodbye)" "<0><2>goodbye" +"abc( +( inner(X?) +) xyz)" "leading cruft <0>abc<1> <2> inner<3> xyz cruft" +"\s*([ixsmdt]*)([:letter:]*)" "<0> <1>d<2> " +"(a|b)c*d" "a<0><1>bcd" + +# Non-capturing parens (?: stuff). Groups, but does not capture. +"(?:abc)*(tail)" "<0>abcabcabc<1>tail" + +# Non-greedy *? quantifier +".*?(abc)" "<0> abx <1>abc abc abc abc" +".*(abc)" "<0> abx abc abc abc <1>abc" + +"((?:abc |xyz )*?)abc " "<0><1>xyz abc abc abc " +"((?:abc |xyz )*)abc " "<0><1>xyz abc abc abc " + +# Non-greedy +? quantifier +"(a+?)(a*)" "<0><1>a<2>aaaaaaaaaaaa" +"(a+)(a*)" "<0><1>aaaaaaaaaaaaa<2>" + +"((ab)+?)((ab)*)" "<0><1><2>ab<3>ababababab<4>ab" +"((ab)+)((ab)*)" "<0><1>abababababab<2>ab<3>" + +# Non-greedy ?? quantifier +"(ab)(ab)??(ab)??(ab)??(ab)??c" "<0><1>ab<4>ab<5>abc" + +# Unicode Properties as naked elements in a pattern +"\p{Lu}+" "here we go ... <0>ABC and no more." +"(\p{L}+)(\P{L}*?) (\p{Zs}*)" "7999<0><1>letters<2>4949%^&*( <3> " + +# \w and \W +"\w+" " $%^&*( <0>hello123%^&*(" +"\W+" "<0> $%^&*( hello123%^&*(" + +# \A match at beginning of input only. + ".*\Ahello" "<0>hello hello" + ".*hello" "<0>hello hello" +".*\Ahello" "stuff\nhello" # don't match after embedded new-line. + +# \b \B +# +".*?\b(.).*" "<0> $%^&*( <1>hello123%^&*()gxx" +"\ba\b" "-<0>a" +"\by\b" "xy" +"[ \b]" "<0>b" # in a set, \b is a literal b. + +# Finds first chars of up to 5 words +"(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?(?:.*?\b(\w))?" "<0><1>Tthe <2>qick <3>brown <4>fox" + +"H.*?((?:\B.)+)" "<0>H<1>ello " +".*?((?:\B.)+).*?((?:\B.)+).*?((?:\B.)+)" "<0>H<1>ello <2> g<3>oodbye " + +"(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?.*" "<0> \u0301 \u0301<1>A\u0302BC\u0303\u0304<2> \u0305 \u0306<3>X\u0307Y\u0308" + + +# +# Unicode word boundary mode +# +"(?w).*?\b" v "<0>hello, world" +"(?w).*?(\b.+?\b).*" v "<0><1> 123.45 " +"(?w).*?(\b\d.*?\b).*" v "<0> <1>123.45 " +".*?(\b.+?\b).*" "<0> <1>123.45 " +"(?w:.*?(\b\d.*?\b).*)" v "<0> <1>123.45 " +"(?w:.*?(\b.+?\b).*)" v "<0><1>don't " +"(?w:.+?(\b\S.+?\b).*)" v "<0> <1>don't " +"(?w:(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?).*)" v "<0><1>.<2> <3>,<4>:<5>$<6>37,000.50<7> " + +# +# Unicode word boundaries with Regions +# +"(?w).*?\b" v "abc<0>defghi" +"(?w).*?\b" v2 "abcdef<0>ghi" +"(?w).*?\b" v3 "abcdefghi" +#"(?w).*?\b" vb "abc<0>defghi" # TODO: bug. Ticket 6073 +#"(?w).*?\b" vb2 "abcdefghi" + + + +# . does not match new-lines +"." "\u000a\u000d\u0085\u000c\u000b\u2028\u2029<0>X\u000aY" +"A." "A\u000a "# no match + +# \d for decimal digits +"\d*" "<0>0123456789\u0660\u06F9\u0969\u0A66\u17E2\uFF10\U0001D7CE\U0001D7FFnon-digits" +"\D+" "<0>non digits" +"\D*(\d*)(\D*)" "<0>non-digits<1>3456666<2>more non digits" + +# \Q...\E quote mode +"hel\Qlo, worl\Ed" "<0>hello, world" +"\Q$*^^(*)?\A\E(a*)" "<0>$*^^(*)?\\A<1>aaaaaaaaaaaaaaa" +"[abc\Q]\r\E]+" "<0>aaaccc]]]\\\\\\\r..." # \Q ... 
\E escape in a [set] + +# UREGEX_LITERAL - entire pattern is a literal string, no escapes recognized. +# Note that data strings in test cases still get escape processing. +"abc\an\r\E\\abcd\u0031bye" Q "lead<0>abc\\an\\r\\E\\\\abcd\\u0031byeextra" +"case insensitive \\ (l)iteral" Qi "stuff!! <0>cAsE InSenSiTiVE \\\\ (L)ITeral" + +# \S and \s space characters +"\s+" "not_space<0> \t \r \n \u3000 \u2004 \u2028 \u2029xyz" +"(\S+).*?(\S+).*" "<0><1>Not-spaces <2>more-non-spaces " + +# \X consume one Grapheme Cluster. +"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>A<2>B<3> <4>\r\n" +"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>A\u0301<2>\n<3>\u0305<4>a\u0302\u0303\u0304" +"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\u1100\u1161\u11a8<2>\u115f\u11a2\u11f9" +"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\u1100\uac01<2>\uac02<3>\uac03\u11b0" +"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\u1100\u1101\uac02\u0301<2>\u1100" +# Regional indicator pairs are grapheme clusters +"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\U0001f1e6\U0001f1e8<2>\U0001f1ea\U0001f1ff" +# Grapheme Break rule 9b: Prepend x +"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" v "<0><1>\U000111C2x" + +# Grapheme clusters that straddle a match region. Matching is pinned to the region limits, +# giving boundaries inside grapheme clusters +"(\X)?(\X)?(\X)?" v "a\u0301<0><1>\u0301\u0301<2>z\u0302\u0302\u0302" +# Same as previous test case, but without the region limits. +"(\X)?(\X)?(\X)?" v "<0><1>a\u0301\u0301\u0301<2>z\u0302\u0302\u0302" + +# ^ matches only at beginning of line +".*^(Hello)" "<0><1>Hello Hello Hello Hello Goodbye" +".*(Hello)" "<0>Hello Hello Hello <1>Hello Goodbye" +".*^(Hello)" " Hello Hello Hello Hello Goodbye"# No Match + +# $ matches only at end of line, or before a newline preceding the end of line +".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye" +".*?(Goodbye)" ZY "<0>Hello <1>Goodbye Goodbye Goodbye" +".*?(Goodbye)$" z "Hello Goodbye> Goodbye Goodbye "# No Match + +".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye\n" +".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye\n" +".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye\r\n" +".*?(Goodbye)$" z "Hello Goodbye Goodbye Goodbye\n\n"# No Match + +# \Z matches at end of input, like $ with default flags. +".*?(Goodbye)\Z" zy "<0>Hello Goodbye Goodbye <1>Goodbye" +".*?(Goodbye)" ZY "<0>Hello <1>Goodbye Goodbye Goodbye" +".*?(Goodbye)\Z" z "Hello Goodbye> Goodbye Goodbye "# No Match +"here$" z "here\nthe end"# No Match + +".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye\n" +".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye\n" +".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye\r\n" +".*?(Goodbye)\Z" "Hello Goodbye Goodbye Goodbye\n\n"# No Match + +# \z matches only at the end of string. +# no special treatment of new lines. +# no dependencies on flag settings. +".*?(Goodbye)\z" zy "<0>Hello Goodbye Goodbye <1>Goodbye" +".*?(Goodbye)\z" z "Hello Goodbye Goodbye Goodbye "# No Match +"here$" z "here\nthe end"# No Match + +".*?(Goodbye)\z" z "Hello Goodbye Goodbye Goodbye\n"# No Match +".*?(Goodbye)\n\z" zy "<0>Hello Goodbye Goodbye <1>Goodbye\n" +"abc\z|def" ZY "abc<0>def" + +# (?# comment) doesn't muck up pattern +"Hello (?# this is a comment) world" " <0>Hello world..." 
+ +# Check some implementation corner cases base on the way literal strings are compiled. +"A" "<0>A" +"AB" "<0>ABABABAB" +"AB+" "<0>ABBBA" +"AB+" "<0>ABABAB" +"ABC+" "<0>ABCABC" +"ABC+" "<0>ABCCCCABC" +"(?:ABC)+" "<0>ABCABCABCD" +"(?:ABC)DEF+" "<0>ABCDEFFFD" +"AB\.C\eD\u0666E" "<0>AB.C\u001BD\u0666EF" +"ab\Bde" "<0>abde" + +# loop breaking +"(a?)*" "<0><1>xyz" +"(a?)+" "<0><1>xyz" +"^(?:a?b?)*$" "a--" +"(x?)*xyz" "<0>xx<1>xyz" # Sligthtly weird, but correct. The "last" time through (x?), + # it matches the empty string. + +# Set expressions, basic operators and escapes work +# +"[\d]+" "<0>0123abc/.," +"[^\d]+" "0123<0>abc/.," +"[\D]+" "0123<0>abc/.," +"[^\D]+" "<0>0123abc/.," + +"[\s]+" "<0> \tabc/.," +"[^\s]+" " \t<0>abc/.," +"[\S]+" " \t<0>abc/.," +"[^\S]+" "<0> \tabc/.," + +"[\w]+" "<0>abc123 .,;" +"[^\w]+" "abc123<0> .,;" +"[\W]+" "abc123<0> .,;" +"[^\W]+" "<0>abc123 .,;" + +"[\z]+" "abc<0>zzzdef" # \z has no special meaning +"[^\z]+" "<0>abczzzdef" +"[\^]+" "abc<0>^^" +"[^\^]+" "<0>abc^^" + +"[\u0041c]+" "<0>AcAcdef" +"[\U00010002]+" "<0>\ud800\udc02\U00010003" +"[^\U00010002]+" "<0>Hello\x{10002}" +"[\x61b]+" "<0>ababcde" +#"[\x6z]+" "\x06" #TODO: single hex digits should fail +"[\x{9}\x{75}\x{6d6}\x{6ba6}\x{6146B}\x{10ffe3}]+" "<0>\u0009\u0075\u06d6\u6ba6\U0006146B\U0010ffe3abc" + +"[\N{LATIN CAPITAL LETTER TONE SIX}ab\N{VARIATION SELECTOR-70} ]+" "x<0> \u0184\U000E0135 abc" +"[\N{LATIN SMALL LETTER C}-\N{LATIN SMALL LETTER F}]+" "ab<0>cdefghi" + + + +# +# [set expressions], check the precedence of '-', '&', '--', '&&' +# '-' and '&', for compatibility with ICU UnicodeSet, have the same +# precedence as the implicit Union between adjacent items. +# '--' and '&&', for compatibility with Java, have lower precedence than +# the implicit Union operations. '--' and '&&' themselves +# have the same precedence, and group left to right. +# +"[[a-m]-[f-w]p]+" "<0>depfgwxyz" +"[^[a-m]-[f-w]p]+" "dep<0>fgwxyz" + +"[[a-m]--[f-w]p]+" "<0>depfgwxyz" +"[^[a-m]--[f-w]p]+" "de<0>pfgwxyz" + +"[[a-m]&[e-s]w]+" "<0>efmwadnst" +"[^[a-m]&[e-s]w]+" "efmw<0>adnst" + +"[[a-m]&[e-s]]+" "<0>efmadnst" + + + +# {min,max} iteration qualifier +"A{3}BC" "<0>AAABC" + +"(ABC){2,3}AB" "no matchAB" +"(ABC){2,3}AB" "ABCAB" +"(ABC){2,3}AB" "<0>ABC<1>ABCAB" +"(ABC){2,3}AB" "<0>ABCABC<1>ABCAB" +"(ABC){2,3}AB" "<0>ABCABC<1>ABCABCAB" + +"(ABC){2}AB" "ABCAB" +"(ABC){2}AB" "<0>ABC<1>ABCAB" +"(ABC){2}AB" "<0>ABC<1>ABCABCAB" +"(ABC){2}AB" "<0>ABC<1>ABCABCABCAB" + +"(ABC){2,}AB" "ABCAB" +"(ABC){2,}AB" "<0>ABC<1>ABCAB" +"(ABC){2,}AB" "<0>ABCABC<1>ABCAB" +"(ABC){2,}AB" "<0>ABCABCABC<1>ABCAB" + +"X{0,0}ABC" "<0>ABC" +"X{0,1}ABC" "<0>ABC" + +"(?:Hello(!{1,3}) there){1}" "Hello there" +"(?:Hello(!{1,3}) there){1}" "<0>Hello<1>! there" +"(?:Hello(!{1,3}) there){1}" "<0>Hello<1>!! there" +"(?:Hello(!{1,3}) there){1}" "<0>Hello<1>!!! there" +"(?:Hello(!{1,3}) there){1}" "Hello!!!! there" + +# Nongreedy {min,max}? intervals +"(ABC){2,3}?AB" "no matchAB" +"(ABC){2,3}?AB" "ABCAB" +"(ABC){2,3}?AB" "<0>ABC<1>ABCAB" +"(ABC){2,3}?AB" "<0>ABC<1>ABCABCAB" +"(ABC){2,3}?AB" "<0>ABC<1>ABCABCABCAB" +"(ABC){2,3}?AX" "<0>ABCABC<1>ABCAX" +"(ABC){2,3}?AX" "ABC<0>ABCABC<1>ABCAX" + +# Possessive {min,max}+ intervals +"(ABC){2,3}+ABC" "ABCABCABC" +"(ABC){1,2}+ABC" "<0>ABC<1>ABCABC" +"(?:(.)\1){2,5}+." "<0>aabbcc<1>ddex" + + +# Atomic Grouping +"(?>.*)abc" "abcabcabc" # no match. .* consumed entire string. 
+"(?>(abc{2,4}?))(c*)" "<0><1>abcc<2>cccddd" +"(\.\d\d(?>[1-9]?))\d+" "1.625" +"(\.\d\d(?>[1-9]?))\d+" "1<0><1>.6250" + +# Possessive *+ +"(abc)*+a" "abcabcabc" +"(abc)*+a" "<0>abc<1>abcab" +"(a*b)*+a" "<0><1>aaaabaaaa" + +# Possessive ?+ +"c?+ddd" "<0>cddd" +"c?+cddd" "cddd" +"c?cddd" "<0>cddd" + +# Back Reference +"(?:ab(..)cd\1)*" "<0>ab23cd23ab<1>wwcdwwabxxcdyy" +"ab(?:c|(d?))(\1)" "<0>ab<1><2>c" +"ab(?:c|(d?))(\1)" "<0>ab<1>d<2>d" +"ab(?:c|(d?))(\1)" "<0>ab<1><2>e" +"ab(?:c|(d?))(\1)" "<0>ab<1><2>" + +# Back References that hit/don't hit end +"(abcd) \1" z "abcd abc" +"(abcd) \1" Z "<0><1>abcd abcd" +"(abcd) \1" Z "<0><1>abcd abcd " + +# Case Insensitive back references that hit/don't hit end. +"(abcd) \1" zi "abcd abc" +"(abcd) \1" Zi "<0><1>abcd ABCD" +"(abcd) \1" Zi "<0><1>abcd ABCD " + +# Back references that hit/don't hit boundary limits. + +"(abcd) \1" z "abcd abcd " +"(abcd) \1" Z "<0><1>abcd abcd " +"(abcd) \1" Z "<0><1>abcd abcd " + +"(abcd) \1" zi "abcd abcd " +"(abcd) \1" Zi "<0><1>abcd abcd " +"(abcd) \1" Zi "<0><1>abcd abcd " + +# Back reference that fails match near the end of input without actually hitting the end. +"(abcd) \1" ZL "abcd abd" +"(abcd) \1" ZLi "abcd abd" + +# Back reference to a zero-length match. They are always a successful match. +"ab(x?)cd(\1)ef" "<0>ab<1>cd<2>ef" +"ab(x?)cd(\1)ef" i "<0>ab<1>cd<2>ef" + +# Back refs to capture groups that didn't participate in the match. +"ab(?:(c)|(d))\1" "abde" +"ab(?:(c)|(d))\1" "<0>ab<1>cce" +"ab(?:(c)|(d))\1" i "abde" +"ab(?:(c)|(d))\1" i "<0>ab<1>cce" + +# Named back references +"(?abcd)\k" "<0><1>abcdabcd" +"(no)?(?abcd)\k" "<0><2>abcdabcd" + +"(?...)" E " " # backref names are ascii letters & numbers only" +"(?<1a>...)" E " " # backref names must begin with a letter" +"(?.)(?.)" E " " # Repeated names are illegal. + + +# Case Insensitive +"aBc" i "<0>ABC" +"a[^bc]d" i "ABD" +'((((((((((a))))))))))\10' i "<0><1><2><3><4><5><6><7><8><9><10>AA" + +"(?:(?i)a)b" "<0>Ab" +"ab(?i)cd" "<0>abCd" +"ab$cd" "abcd" + +"ssl" i "abc<0>ßlxyz" +"ssl" i "abc<0>ẞlxyz" +"FIND" i "can <0>find ?" # fi ligature, \ufb01 +"find" i "can <0>FIND ?" +"ῧ" i "xxx<0>ῧxxx" # Composed char (match string) decomposes when case-folded (pattern) + +# White space handling +"a b" "ab" +"abc " "abc" +"abc " "<0>abc " +"ab[cd e]z" "<0>ab z" +"ab\ c" "<0>ab c " +"ab c" "<0>ab c " +"ab c" x "ab c " +"ab\ c" x "<0>ab c " + +# +# Pattern Flags +# +"(?u)abc" "<0>abc" +"(?-u)abc" "<0>abc" + +# +# \c escapes (Control-whatever) +# +"\cA" "<0>\u0001" +"\ca" "<0>\u0001" +"\c\x" "<0>\u001cx" + + +#Multi-line mode +'b\s^' m "a\nb\n" +"(?m)^abc$" "abc \n abc\n<0>abc\nabc" +"(?m)^abc$" 2 "abc \n abc\nabc\n<0>abc" +"^abc$" 2 "abc \n abc\nabc\nabc" + +# Empty and full range +"[\u0000-\U0010ffff]+" "<0>abc\u0000\uffff\U00010000\U0010ffffzz" +"[^\u0000-\U0010ffff]" "abc\u0000\uffff\U00010000\U0010ffffzz" +"[^a--a]+" "<0>abc\u0000\uffff\U00010000\U0010ffffzz" + +# Free-spacing mode +"a b c # this is a comment" x "<0>abc " +'^a (?#xxx) (?#yyy) {3}c' x "<0>aaac" +"a b c [x y z]" x "abc " +"a b c [x y z]" x "a b c " +"a b c [x y z]" x "<0>abcxyz" +"a b c [x y z]" x "<0>abcyyz" + +# +# Look Behind +# +"(?<=a)b" "a<0>b" +"(.*)(?<=[bc])" "<0><1>abcd" +"(?<=(abc))def" "<1>abc<0>def" # lookbehind precedes main match. +"(?<=ab|abc)xyz" "abwxyz" # ab matches, but not far enough. 
+"(?<=abc)cde" "abcde" +"(?<=abc|ab)cde" "ab<0>cde" +"(?<=abc|ab)cde" "abc<0>cde" + +"(?<=bc?c?c?)cd" "ab<0>cd" +"(?<=bc?c?c?)cd" "abc<0>cd" +"(?<=bc?c?c?)cd" "abcc<0>cd" +"(?<=bc?c?c?)cd" "abccc<0>cd" +"(?<=bc?c?c?)cd" "abcccccd" +"(?<=bc?c?c?)c+d" "ab<0>cccccd" + +".*(?<=: ?)(\w*)" "<0>1:one 2: two 3:<1>three " + +# +# Named Characters +# +"a\N{LATIN SMALL LETTER B}c" "<0>abc" +"a\N{LATIN SMALL LETTER B}c" i "<0>abc" +"a\N{LATIN SMALL LETTER B}c" i "<0>aBc" +"a\N{LATIN SMALL LETTER B}c" "aBc" + +"\N{FULL STOP}*" "<0>...abc" + +"$" "abc<0>" + +# +# Optimizations of .* at end of patterns +# +"abc.*" "<0>abcdef" +"abc.*$" "<0>abcdef" +"abc(.*)" "<0>abc<1>def" +"abc(.*)" "<0>abc<1>" +"abc.*" "<0>abc\ndef" +"abc.*" s "<0>abc\ndef" +"abc.*$" s "<0>abc\ndef" +"abc.*$" "abc\ndef" +"abc.*$" m "<0>abc\ndef" +"abc.*\Z" m "abc\ndef" +"abc.*\Z" sm "<0>abc\ndef" + +"abc*" "<0>abcccd" +"abc*$" "<0>abccc" +"ab(?:ab[xyz]\s)*" "<0>ababy abx abc" + +"(?:(abc)|a)(?:bc)+" "<0>abc" +"(?:(abc)|a)(?:bc)*" "<0><1>abc" +"^[+\-]?[0-9]*\.?[0-9]*" "<0>123.456" + +"ab.+yz" "<0>abc12345xyzttt" +"ab.+yz" s "<0>abc12345xyzttt" + +"ab.+yz" "abc123\n45xyzttt" +"ab.+yz" s "<0>abc12\n345xyzttt" + +"ab[0-9]+yz" "---abyz+++" +"ab[0-9]+yz" "---<0>ab1yz+++" +"ab[0-9]+yz" "---<0>ab12yz+++" +"ab[0-9]+yz" "---<0>ab123456yz+++" + +"ab([0-9]+|[A-Z]+)yz" "---abyz+++" +"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>1yz+++" +"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>12yz+++" +"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>Ayz+++" +"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>AByz+++" +"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>ABCDEyz+++" + +# +# Hex format \x escaping +# +"ab\x63" "<0>abc" +"ab\x09w" "<0>ab\u0009w" +"ab\xabcdc" "<0>ab\u00abcdc" +"ab\x{abcd}c" "<0>ab\uabcdc" +"ab\x{101234}c" "<0>ab\U00101234c" +"abα" "<0>abα" + +# +# Octal Escaping. This conforms to Java conventions, not Perl. +"\0101\00\03\073\0154\01442" "<0>A\u0000\u0003\u003b\u006c\u0064\u0032" +"\0776" "<0>\u003f\u0036" # overflow, the 6 is literal. +"\0376xyz" "<0>\u00fexyz" +"\08" E "<0>\u00008" +"\0" E "x" + +# +# \u Surrogate Pairs +# +"\ud800\udc00" "<0>\U00010000" +"\ud800\udc00*" "<0>\U00010000\U00010000\U00010000\U00010001" +# TODO (Vitess): The next case has invalid UTF-8, so it's not supported right now for testing. It likely works in practice though! +# "\ud800\ud800\udc00" "<0>\ud800\U00010000\U00010000\U00010000\U00010001" +"(\ud800)(\udc00)" "\U00010000" +"\U00010001+" "<0>\U00010001\U00010001\udc01" + +# +# hitEnd with find() +# +"abc" Z "aa<0>abc abcab" +"abc" 2Z "aaabc <0>abcab" +"abc" 3z "aa>abc abcab" + +# +# \ escaping +# +"abc\jkl" "<0>abcjkl" # escape of a non-special letter is just itself. +"abc[ \j]kl" "<0>abcjkl" + +# +# \R all newline sequences. +# +"abc\Rxyz" "<0>abc\u000axyzgh" +"abc\Rxyz" "<0>abc\u000bxyzgh" +"abc\Rxyz" "<0>abc\u000cxyzgh" +"abc\Rxyz" "<0>abc\u000dxyzgh" +"abc\Rxyz" "<0>abc\u0085xyzgh" +"abc\Rxyz" "<0>abc\u2028xyzgh" +"abc\Rxyz" "<0>abc\u2029xyzgh" +"abc\Rxyz" "<0>abc\u000d\u000axyzgh" + +"abc\R\nxyz" "abc\u000d\u000axyzgh" # \R cannot match only the CR from a CR/LF sequence. +"abc\r\nxyz" "<0>abc\u000d\u000axyzgh" + +"abc\Rxyz" "abc\u0009xyz" # Assorted non-matches. +"abc\Rxyz" "abc\u000exyz" +"abc\Rxyz" "abc\u202axyz" + +# \v \V single character new line sequences. 
+ +"abc\vxyz" "<0>abc\u000axyzgh" +"abc\vxyz" "<0>abc\u000bxyzgh" +"abc\vxyz" "<0>abc\u000cxyzgh" +"abc\vxyz" "<0>abc\u000dxyzgh" +"abc\vxyz" "<0>abc\u0085xyzgh" +"abc\vxyz" "<0>abc\u2028xyzgh" +"abc\vxyz" "<0>abc\u2029xyzgh" +"abc\vxyz" "abc\u000d\u000axyzgh" +"abc\vxyz" "abc?xyzgh" + +"abc[\v]xyz" "<0>abc\u000axyzgh" +"abc[\v]xyz" "<0>abc\u000bxyzgh" +"abc[\v]xyz" "<0>abc\u000cxyzgh" +"abc[\v]xyz" "<0>abc\u000dxyzgh" +"abc[\v]xyz" "<0>abc\u0085xyzgh" +"abc[\v]xyz" "<0>abc\u2028xyzgh" +"abc[\v]xyz" "<0>abc\u2029xyzgh" +"abc[\v]xyz" "abc\u000d\u000axyzgh" +"abc[\v]xyz" "abc?xyzgh" + +"abc\Vxyz" "abc\u000axyzgh" +"abc\Vxyz" "abc\u000bxyzgh" +"abc\Vxyz" "abc\u000cxyzgh" +"abc\Vxyz" "abc\u000dxyzgh" +"abc\Vxyz" "abc\u0085xyzgh" +"abc\Vxyz" "abc\u2028xyzgh" +"abc\Vxyz" "abc\u2029xyzgh" +"abc\Vxyz" "abc\u000d\u000axyzgh" +"abc\Vxyz" "<0>abc?xyzgh" + +# \h \H horizontal white space. Defined as gc=space_separator plus ascii tab + +"abc\hxyz" "<0>abc xyzgh" +"abc\Hxyz" "abc xyzgh" +"abc\hxyz" "<0>abc\u2003xyzgh" +"abc\Hxyz" "abc\u2003xyzgh" +"abc\hxyz" "<0>abc\u0009xyzgh" +"abc\Hxyz" "abc\u0009xyzgh" +"abc\hxyz" "abc?xyzgh" +"abc\Hxyz" "<0>abc?xyzgh" + +"abc[\h]xyz" "<0>abc xyzgh" +"abc[\H]xyz" "abc xyzgh" +"abc[\h]xyz" "<0>abc\u2003xyzgh" +"abc[\H]xyz" "abc\u2003xyzgh" +"abc[\h]xyz" "<0>abc\u0009xyzgh" +"abc[\H]xyz" "abc\u0009xyzgh" +"abc[\h]xyz" "abc?xyzgh" +"abc[\H]xyz" "<0>abc?xyzgh" + + +# +# Bug xxxx +# +"(?:\-|(\-?\d+\d\d\d))?(?:\-|\-(\d\d))?(?:\-|\-(\d\d))?(T)?(?:(\d\d):(\d\d):(\d\d)(\.\d+)?)?(?:(?:((?:\+|\-)\d\d):(\d\d))|(Z))?" MG "<0>-1234-21-31T41:51:61.789+71:81" + + +# +# A random, complex, meaningless pattern that should at least compile +# +"(?![^\\G)(?![^|\]\070\ne\{\t\[\053\?\\\x51\a\075\0023-\[&&[|\022-\xEA\00-\u41C2&&[^|a-\xCC&&[^\037\uECB3\u3D9A\x31\|\[^\016\r\{\,\uA29D\034\02[\02-\[|\t\056\uF599\x62\e\<\032\uF0AC\0026\0205Q\|\\\06\0164[|\057-\u7A98&&[\061-g|\|\0276\n\042\011\e\xE8\x64B\04\u6D0EDW^\p{Lower}]]]]?)(?<=[^\n\\\t\u8E13\,\0114\u656E\xA5\]&&[\03-\026|\uF39D\01\{i\u3BC2\u14FE]])(?<=[^|\uAE62\054H\|\}&&^\p{Space}])(?sxx)(?<=[\f\006\a\r\xB4]{1,5})|(?x-xd:^{5}+)()" "<0>abc" + + +# +# Bug 3225 + +"1|9" "<0>1" +"1|9" "<0>9" +"1*|9" "<0>1" +"1*|9" "<0>9" + +"(?:a|ac)d" "<0>acd" +"a|ac" "<0>ac" + +# +# Bug 3320 +# +"(a([^ ]+)){0,} (c)" "<0><1>a<2>b <3>c " +"(a([^ ]+))* (c)" "<0><1>a<2>b <3>c " + +# +# Bug 3436 +# +"(.*?) 
*$" "<0><1>test " + +# +# Bug 4034 +# +"\D" "<0>ABC\u00ffDEF" +"\d" "ABC\u00ffDEF" +"\D" "<0>\u00ffDEF" +"\d" "\u00ffDEF" +"\D" "123<0>\u00ffDEF" +"\D" "<0>\u0100DEF" +"\D" "123<0>\u0100DEF" + +# +#bug 4024, new line sequence handling +# +"(?m)^" "<0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" +"(?m)^" 2 "AA\u000d\u000a<0>BB\u000d\u000aCC\u000d\u000a" +"(?m)^" 3 "AA\u000d\u000aBB\u000d\u000a<0>CC\u000d\u000a" +"(?m)^" 4 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" + +"(?m)$" "AA<0>\u000d\u000aBB\u000d\u000aCC\u000d\u000a" +"(?m)$" 2 "AA\u000d\u000aBB<0>\u000d\u000aCC\u000d\u000a" +"(?m)$" 3 "AA\u000d\u000aBB\u000d\u000aCC<0>\u000d\u000a" +"(?m)$" 4 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a<0>" +"(?m)$" 5 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" + +"$" "AA\u000d\u000aBB\u000d\u000aCC<0>\u000d\u000a" +"$" 2 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a<0>" +"$" 3 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" + +"$" "\u000a\u0000a<0>\u000a" +"$" 2 "\u000a\u0000a\u000a<0>" +"$" 3 "\u000a\u0000a\u000a" + +"$" "<0>" +"$" 2 "" + +"$" "<0>\u000a" +"$" 2 "\u000a<0>" +"$" 3 "\u000a" + +"^" "<0>" +"^" 2 "" + +"\Z" "<0>" +"\Z" 2 "" +"\Z" 2 "\u000a<0>" +"\Z" "<0>\u000d\u000a" +"\Z" 2 "\u000d\u000a<0>" + + +# No matching ^ at interior new-lines if not in multi-line mode. +"^" "<0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" +"^" 2 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" + +# +# Dot-matches-any mode, and stopping at new-lines if off. +# +"." "<0>123\u000aXYZ" +"." 2 "1<0>23\u000aXYZ" +"." 3 "12<0>3\u000aXYZ" +"." 4 "123\u000a<0>XYZ" # . doesn't match newlines +"." 4 "123\u000b<0>XYZ" +"." 4 "123\u000c<0>XYZ" +"." 4 "123\u000d<0>XYZ" +"." 4 "123\u000d\u000a<0>XYZ" +"." 4 "123\u0085<0>XYZ" +"." 4 "123\u2028<0>XYZ" +"." 4 "123\u2029<0>XYZ" +"." 4s "123<0>\u000aXYZ" # . matches any +"." 4s "123<0>\u000bXYZ" +"." 4s "123<0>\u000cXYZ" +"." 4s "123<0>\u000dXYZ" +"." 4s "123<0>\u000d\u000aXYZ" +"." 4s "123<0>\u0085XYZ" +"." 4s "123<0>\u2028XYZ" +"." 4s "123<0>\u2029XYZ" +".{6}" "123\u000a\u000dXYZ" +".{6}" s "<0>123\u000a\u000dXY" + + +# +# Ranges +# +".*" "abc<0>defghi" +"a" "aaa<0>aaaaaa" +"a" 2 "aaaa<0>aaaaa" +"a" 3 "aaaaa<0>aaaa" +"a" 4 "aaaaaaaaa" +"a" "aaa<0>aaaaaa" + +# +# [set] parsing, systematically run through all of the parser states. +# +# +"[def]+" "abc<0>ddeeffghi" # set-open +"[^def]+" "<0>abcdefghi" +"[:digit:]+" "abc<0>123def" +"[:^digit:]+" "<0>abc123def" +"[\u005edef]+" "abc<0>de^fghi" + +"[]]+" "abc<0>]]][def" # set-open2 +"[^]]+" "<0>abc]]][def" + +"[:Lu:]+" "abc<0>ABCdef" # set-posix +"[:Lu]+" "abc<0>uL::Lu" +"[:^Lu]+" "abc<0>uL:^:Lu" +"[:]+" "abc<0>:::def" +"[:whats this:]" E " " +"[--]+" dE "-------" + +"[[nested]]+" "xyz[<0>nnetsteed]abc" #set-start +"[\x{41}]+" "CB<0>AAZYX" +"[\[\]\\]+" "&*<0>[]\\..." 
+"[*({<]+" "^&<0>{{(<<*)))" + + +"[-def]+" "abc<0>def-ef-dxyz" # set-start-dash +"[abc[--def]]" E " " + +"[x[&def]]+" "abc<0>def&ghi" # set-start-amp +"[&& is bad at start]" E " " + +"[abc" E " " # set-after-lit +"[def]]" "abcdef" +"[def]]" "abcde<0>f]]" + +"[[def][ghi]]+" "abc]<0>defghi[xyz" # set-after-set +"[[def]ghi]+" "abc]<0>defghi[xyz" +"[[[[[[[[[[[abc]" E " " +"[[abc]\p{Lu}]+" "def<0>abcABCxyz" + +"[d-f]+" "abc<0>defghi" # set-after-range +"[d-f[x-z]]+" "abc<0>defxyzzzgw" +"[\s\d]+" "abc<0> 123def" +"[d-f\d]+" "abc<0>def123ghi" +"[d-fr-t]+" "abc<0>defrstuvw" + +"[abc--]" E " " # set-after-op +"[[def]&&]" E " " +"[-abcd---]+" "<0>abc--" #[-abcd]--[-] +"[&abcd&&&ac]+" "b<0>ac&&cad" #[&abcd]&&[&ac] + +"[[abcd]&[ac]]+" "b<0>acacd" # set-set-amp +"[[abcd]&&[ac]]+" "b<0>acacd" +"[[abcd]&&ac]+" "b<0>acacd" +"[[abcd]&ac]+" "<0>bacacd&&&" + +"[abcd&[ac]]+" "<0>bacacd&&&" #set-lit-amp +"[abcd&&[ac]]+" "b<0>acacd" +"[abcd&&ac]+" "b<0>acacd" + +"[[abcd]-[ac]]+" "a<0>bdbdc" # set-set-dash +"[[abcd]--[ac]]+" "a<0>bdbdc" +"[[abcd]--ac]+" "a<0>bdbdc" +"[[abcd]-ac]+" "<0>bacacd---" + +"[a-d--[b-c]]+" "b<0>adadc" # set-range-dash +"[a-d--b-c]+" "b<0>adadc" +"[a-d-[b-c]]+" "<0>bad-adc" +"[a-d-b-c]+" "<0>bad-adc" +"[\w--[b-c]]+" "b<0>adadc" +"[\w--b-c]+" "b<0>adadc" +"[\w-[b-c]]+" "<0>bad-adc" +"[\w-b-c]+" "<0>bad-adc" + +"[a-d&&[b-c]]+" "a<0>bcbcd" # set-range-amp +"[a-d&&b-c]+" "a<0>bcbcd" +"[a-d&[b-c]]+" "<0>abc&bcd" +"[a-d&b-c]+" "<0>abc&bcd" + +"[abcd--bc]+" "b<0>addac" # set-lit-dash +"[abcd--[bc]]+" "b<0>addac" +"[abcd-[bc]]+" "<0>bad--dacxyz" +"[abcd-]+" "<0>bad--dacxyz" + +"[abcd-\s]+" E "xyz<0>abcd --xyz" # set-lit-dash-esc +"[abcd-\N{LATIN SMALL LETTER G}]+" "xyz-<0>abcdefghij-" +"[bcd-\{]+" "a<0>bcdefyz{|}" + +"[\p{Ll}]+" "ABC<0>abc^&*&" # set-escape +"[\P{Ll}]+" "abc<0>ABC^&*&xyz" +"[\N{LATIN SMALL LETTER Q}]+" "mnop<0>qqqrst" +"[\sa]+" "cb<0>a a (*&" +"[\S]+" " <0>hello " +"[\w]+" " <0>hello_world! " +"[\W]+" "a<0> *$%#,hello " +"[\d]+" "abc<0>123def" +"[\D]+" "123<0>abc567" +"[\$\#]+" "123<0>$#$#\\" + +# +# Try each of the Java compatibility properties. +# These are checked here, while normal Unicode properties aren't, because +# these Java compatibility properties are implemented directly by regexp, while other +# properties are handled by ICU's Property and UnicodeSet APIs. +# +# These tests are only to verify that the names are recognized and the +# implementation isn't dead. They are not intended to verify that the +# function definitions are 100% correct. 
+# +"[:InBasic Latin:]+" "ΓΔΕΖΗΘ<0>hello, world.ニヌネノハバパ" +"[:^InBasic Latin:]+" "<0>ΓΔΕΖΗΘhello, world.ニヌネノハバパ" +"\p{InBasicLatin}+" "ΓΔΕΖΗΘ<0>hello, world.ニヌネノハバパ" +"\P{InBasicLatin}+" "<0>ΓΔΕΖΗΘhello, world.ニヌネノハバパ" +"\p{InGreek}+" "<0>ΓΔΕΖΗΘhello, world.ニヌネノハバパ" +"\p{InCombining Marks for Symbols}" "<0>\u20d0" +"\p{Incombiningmarksforsymbols}" "<0>\u20d0" + + +"\p{javaDefined}+" "\uffff<0>abcd\U00045678" +"\p{javaDigit}+" "abc<0>1234xyz" +"\p{javaIdentifierIgnorable}+" "abc<0>\u0000\u000e\u009fxyz" +"\p{javaISOControl}+" "abc<0>\u0000\u000d\u0083xyz" +"\p{javaJavaIdentifierPart}+" "#@!<0>abc123_$;" +"\p{javaJavaIdentifierStart}+" "123\u0301<0>abc$_%^&" +"\p{javaLetter}+" "123<0>abcDEF&*()(" +"\p{javaLetterOrDigit}+" "$%^&*<0>123abcகஙசஜஞ☺♘♚☔☎♬⚄⚡" +"\p{javaLowerCase}+" "ABC<0>def&^%#:=" +"\p{javaMirrored}+" "ab$%<0>(){}[]xyz" +"\p{javaSpaceChar}+" "abc<0> \u00a0\u2028!@#" +"\p{javaSupplementaryCodePoint}+" "abc\uffff<0>\U00010000\U0010ffff\u0000" +"\p{javaTitleCase}+" "abCE<0>Džῌᾨ123" +"\p{javaUnicodeIdentifierStart}+" "123<0>abcⅣ%^&&*" +"\p{javaUnicodeIdentifierPart}+" "%&&^<0>abc123\u0301\u0002..." +"\p{javaUpperCase}+" "abc<0>ABC123" +"\p{javaValidCodePoint}+" "<0>\u0000abc\ud800 unpaired \udfff |\U0010ffff" +"\p{javaWhitespace}+" "abc\u00a0\u2007\u202f<0> \u0009\u001c\u001f\u202842" +"\p{all}+" "<0>123\u0000\U0010ffff" +"\P{all}+" "123\u0000\U0010ffff" + +# [:word:] is implemented directly by regexp. Not a java compat property, but PCRE and others. + +"[:word:]+" ".??$<0>abc123ΓΔΕΖΗ_%%%" +"\P{WORD}+" "<0>.??$abc123ΓΔΕΖΗ_%%%" + +# +# Errors on unrecognized ASCII letter escape sequences. +# +"[abc\Y]+" "<0>abcY" +"[abc\Y]+" eE "<0>abcY" + +"(?:a|b|c|\Y)+" "<0>abcY" +"(?:a|b|c|\Y)+" eE "<0>abcY" + +"\Q\Y\E" e "<0>\\Y" + +# +# Reported problem +# +"[a-\w]" E "x" + +# +# Bug 4045 +# +"A*" "<0>AAAA" +"A*" 2 "AAAA<0>" +"A*" 3 "AAAA" +"A*" 4 "AAAA" +"A*" 5 "AAAA" +"A*" 6 "AAAA" +"A*" "<0>" +"A*" 2 "" +"A*" 3 "" +"A*" 4 "" +"A*" 5 "" + +# +# Bug 4046 +# +"(?m)^" "<0>AA\u000dBB\u000dCC\u000d" +"(?m)^" 2 "AA\u000d<0>BB\u000dCC\u000d" +"(?m)^" 3 "AA\u000dBB\u000d<0>CC\u000d" +"(?m)^" 4 "AA\u000dBB\u000dCC\u000d" +"(?m)^" 5 "AA\u000dBB\u000dCC\u000d" +"(?m)^" 6 "AA\u000dBB\u000dCC\u000d" + +"(?m)^" "<0>AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" +"(?m)^" 2 "AA\u000d\u000a<0>BB\u000d\u000aCC\u000d\u000a" +"(?m)^" 3 "AA\u000d\u000aBB\u000d\u000a<0>CC\u000d\u000a" +"(?m)^" 4 "AA\u000d\u000aBB\u000d\u000aCC\u000d\u000a" + +# +# Bug 4059 +# +"\w+" "<0>イチロー" +"\b....\b." "<0>イチロー?" + + +# +# Bug 4058 ICU Unicode Set patterns have an odd feature - +# A $ as the last character before the close bracket means match +# a \uffff, which means off the end of the string in transliterators. +# Didn't make sense for regular expressions, and is now fixed. +# +"[\$](P|C|D);" "<0>$<1>P;" +"[$](P|C|D);" "<0>$<1>P;" +"[$$](P|C|D);" "<0>$<1>P;" + +# +# bug 4888 Flag settings lost in some cases. +# +"((a){2})|(#)" is "no" +"((a){2})|(#)" is "<0><1>a<2>a#" +"((a){2})|(#)" is "a<0><3>#" + +"((a|b){2})|c" is "<0>c" +"((a|b){2})|c" is "<0>C" +"((a|b){2})|c" s "C" + +# +# bug 5617 ZWJ \u200d shouldn't cause word boundaries +# +".+?\b" "<0> \u0935\u0915\u094D\u200D\u0924\u0947 " +".+?\b" 2 " <0>\u0935\u0915\u094D\u200D\u0924\u0947 " +".+?\b" 3 " \u0935\u0915\u094D\u200D\u0924\u0947 " + +# +# bug 5386 "^.*$" should match empty input +# +"^.*$" "<0>" +"^.*$" m "<0>" +"^.*$" "<0>\n" +"(?s)^.*$" "<0>\n" + +# +# bug 5386 Empty pattern and empty input should match. 
+# +"" "<0>abc" +"" "<0>" + +# +# bug 5386 Range upper and lower bounds can be equal +# +"[a-a]" "<0>a" + +# +# bug 5386 $* should not fail, should match empty string. +# +"$*" "<0>abc" + +# +# bug 5386 \Q ... \E escaping problem +# +"[a-z\Q-$\E]+" "QE<0>abc-def$." + +# More reported 5386 Java comaptibility failures +# +"[^]*abb]*" "<0>kkkk" +"\xa" "huh" # Java would like to be warned. +"^.*$" "<0>" + +# +# bug 5386 Empty left alternation should produce a zero length match. +# +"|a" "<0>a" +"$|ab" "<0>ab" +"$|ba" "ab<0>" + +# +# bug 5386 Java compatibility for set expressions +# +"[a-z&&[cde]]+" "ab<0>cdefg" + +# +# bug 6019 matches() needs to backtrack and check for a longer match if the +# first match(es) found don't match the entire input. +# +"a?|b" "<0>b" +"a?|b" M "<0>b" +"a?|.*?u|stuff|d" M "<0>stuff" +"a?|.*?(u)|stuff|d" M "<0>stuff<1>u" +"a+?" "<0>aaaaaaaaaaaaa" +"a+?" M "<0>aaaaaaaaaaaaa" + +# +# Bug 7724. Expression to validate zip codes. +# +"(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "<0><1>94040<2>-3344" +"(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "94040-0000" +"(?!0{5})(\d{5})(?!-?0{4})(-?\d{4})?" "00000-3344" + +# +# Bug 8666. Assertion failure on match, bad operand to JMP_SAV_X opcode. +# +"((.??)+|A)*" "<0><1><2>AAAAABBBBBCCCCCDDDDEEEEE" + +# +# Bug 8826. Incorrect results with case insensitive matches. +# +"AS(X)" i "aßx" +"AS.*" i "aßx" # Expansion of sharp s can't split between pattern terms. +"ASßS" i "<0>aßß" # All one literal string, does match. +"ASß{1}S" i "aßß" # Pattern with terms, no match. +"aßx" i "<0>assx" +"aßx" i "<0>ASSX" +"aßx" i "<0>aßx" +"ASS(.)" i "<0>aß<1>x" + +# Case Insensitive, probe some corner cases. +"ass+" i "aß" # Second 's' in pattern is qualified, can't combine with first. +"as+" i "aß" +"aßs" i "as" # Can't match half of a ß +"aß+" i "<0>asssssssss" +"aß+" i "<0>assßSssSSSs" +"a(ß?)+" i "<0>assssssss<1>s" +"a(ß?)+" i "<0>a<1>zzzzzzzzs" + +"\U00010400" i "<0>\U00010428" # case folded supplemental code point. + +"sstuff" i "<0>ßtuff" # exercise optimizations on what chars can start a match. +"sstuff" i "s<0>ßtuff" # exercise optimizations on what chars can start a match. +"ßtuff" i "s<0>sstuff" +"ßtuff" i "s<0>Sstuff" + +"a(..)\1" i "<0>A<1>bcBCdef" +"(ß)\1" i "aa<0><1>ssßzz" # Case insensitive back reference +"..(.)\1" i "<0>aa<1>ßss" +"ab(..)\1" i "xx<0>ab<1>ssßss" + +" (ss) ((\1.*)|(.*))" i "<0> <1>ss <2><4>sß" # The back reference 'ss' must not match in 'sß' + +# Bug 9057 +# \u200c and \u200d should be word characters. +# +"\w+" " <0>abc\u200cdef\u200dghi " +"\w+" i " <0>abc\u200cdef\u200dghi " +"[\w]+" " <0>abc\u200cdef\u200dghi " +"[\w]+" i " <0>abc\u200cdef\u200dghi " + +# Bug 9283 +# uregex_open fails for look-behind assertion + case-insensitive + +"(ab)?(?<=ab)cd|ef" i "<0><1>abcd" + +# Bug 9719 Loop breaking on (zero length match){3,} (unlimited upper bound). +# + +"(?:abc){1,}abc" "<0>abcabcabcabcabc" +"(?:2*){2,}?a2\z" "<0>2a2" +"(?:2*){2,}?a2\z" "2a3" +"(?:x?+){3,}+yz" "w<0>yz" +"(2*){2,}?a2\\z" "2a3" +"(2*){2,}?a2\\z" "<0>2<1>a2\\z" +"(2*){2,}?a2\z" "<0>2<1>a2" + + +# Bug 10024 +# Incorrect (unbounded) longest match length with {1, 20} style quantifiers. +# Unbounded match is disallowed in look-behind expressions. +# Max match length is used to limit where to check for look-behind matches. + +"(?<=a{1,5})bc" "aaaa<0>bcdef" +"(?<=(?:aa){3,20})bc" "aaaaaa<0>bcdef" +"(?jkl" +"(?<=a{11})bc" "aaaaaaaaaaa<0>bc" +"(?<=a{11})bc" "aaaaaaaaaabc" +"(?<=a{1,})bc" E "aaaa<0>bcdef" # U_REGEX_LOOK_BEHIND_LIMIT error. 
+"(?<=(?:){11})bc" "<0>bc" # Empty (?:) expression. + +# Bug 10835 +# Match Start Set not being correctly computed for case insensitive patterns. +# (Test here is to dump the compiled pattern & manually check the start set.) + +"(private|secret|confidential|classified|restricted)" i "hmm, <0><1>Classified stuff" +"(private|secret|confidential|classified|restricted)" "hmm, Classified stuff" + +# Bug 10844 + +"^([\w\d:]+)$" "<0><1>DiesIst1Beispiel:text" +"^([\w\d:]+)$" i "<0><1>DiesIst1Beispiel:text" +"^(\w+\d\w+:\w+)$" "<0><1>DiesIst1Beispiel:text" +"^(\w+\d\w+:\w+)$" i "<0><1>DiesIst1Beispiel:text" + +# Bug 11049 +# Edge cases in find() when pattern match begins with set of code points +# and the match begins at the end of the string. + +"A|B|C" "hello <0>A" +"A|B|C" "hello \U00011234" +"A|B|\U00012345" "hello <0>\U00012345" +"A|B|\U00010000" "hello \ud800" + +# Bug 11369 +# Incorrect optimization of patterns with a zero length quantifier {0} + +"(.|b)(|b){0}\$(?#xxx){3}(?>\D*)" "AAAAABBBBBCCCCCDDDDEEEEE" +"(|b)ab(c)" "<0><1>ab<2>c" +"(|b){0}a{3}(D*)" "<0>aaa<2>" +"(|b){0,1}a{3}(D*)" "<0><1>aaa<2>" +"((|b){0})a{3}(D*)" "<0><1>aaa<3>" + +# Bug 11370 +# Max match length computation of look-behind expression gives result that is too big to fit in the +# in the 24 bit operand portion of the compiled code. Expressions should fail to compile +# (Look-behind match length must be bounded. This case is treated as unbounded, an error.) + +"(?pre<1>\ud800post\ud800 fin" +"pre(.)post\1" i "pre\ud800post\ud800\udc00" # case insensiteve backrefs take a different code path +"pre(.)post\1" i "<0>pre<1>\ud800post\ud800 fin" + +# Bug 11554 +# +# Maximum match length computation was assuming UTF-16. +# Used in look-behind matches to constrain how far back to look. + +"(?<=a\x{100000})spam" "***a\x{100000}<0>spam**" +"(?<=aą)spam" "**aą<0>spam**" +"(?<=ąabc)spam" "**ąabc<0>spam**" + +"(?<=a\x{100000})spam" "***a\x{100001}spam**" +"(?<=aą)spam" "**bąspam**" +"(?<=ąabc)spam" "**ąabxspam**" + +# with negative look-behind + +"(?spam**" +"(?spam**" +"(?spam**" + +# Bug #12930 +# +# Minimum Match Length computation, int32_t overflow on an empty set in the pattern. +# The empty set, with no match possible, has a min match length of INT32_MAX. +# Was incremented subsequently. Caused assertion failure on pattern compile. + +"[^\u0000-\U0010ffff]bc?" "bc no match" +"[^\u0000-\U0010ffff]?bc?" "<0>bc has a match" + +# Bug #12160 Hit End behavior after find fails to find. +# To match Java, should be true if find fails to find. +# +"abc" Z "<0>abc abc abc xyz" +"abc" Z2 "abc <0>abc abc xyz" +"abc" Z3 "abc abc <0>abc xyz" +"abc" z4 "abc abc abc xyz" + +# Bug #13844 Verify that non-standard Java property names are recognized. 
+"[\p{IsAlphabetic}]" " <0>A" +"[\P{IsAlphabetic}]" "A<0> " +"[\p{IsIdeographic}]" "A<0>〆" +"[\P{IsIdeographic}]" "〆<0>A" +"[\p{IsLetter}]" " <0>A" +"[\P{IsLetter}]" "A<0> " +"[\p{Letter}]" " <0>A" +"[\p{IsLowercase}]" "A<0>a" +"[\P{IsLowercase}]" "a<0>A" +"[\p{IsUppercase}]" "a<0>A" +"[\P{IsUppercase}]" "A<0>a" +"[\p{IsTitlecase}]" "D<0>Dz" +"[\P{IsTitlecase}]" "Dz<0>D" +"[\p{IsPunctuation}]" " <0>&" +"[\P{IsPunctuation}]" "&<0> " +"[\p{IsControl}]" " <0>\x{82}" +"[\P{IsControl}]" "\x{82}<0> " +"[\p{IsWhite_Space}]" "x<0> " +"[\P{IsWhite_Space}]" " <0>x" +"[\p{IsDigit}]" " <0>4" +"[\P{IsDigit}]" "4<0> " +"[\p{IsHex_Digit}]" " <0>F" +"[\P{IsHex_Digit}]" "F<0> " +"[\p{IsJoin_Control}]" " <0>\x{200d}" +"[\P{IsJoin_Control}]" "\x{200d}<0> " +"[\p{IsNoncharacter_Code_Point}]" "A<0>\x{5fffe}" +"[\p{IsAssigned}]" "\x{10ffff}<0>a" +"[\P{IsAssigned}]" "a<0>\x{10ffff}" + +"[\p{InBasic Latin}]" "〆<0>A" +"[\p{InBasicLatin}]" "〆<0>A" +"[\p{InBasic-Latin}]" "〆<0>A" # ICU accepts '-'; Java does not. +"[\p{InBasic_Latin}]" "〆<0>A" +"[\p{Inbasiclatin}]" "〆<0>A" +"[\p{inbasiclatin}]" E "〆<0>A" # "In" must be cased as shown. Property name part is case insensitive. +"[\p{InCombining_Marks_for_Symbols}]" "a<0>\x{20DD}" # COMBINING ENCLOSING CIRCLE + +"[\p{all}]*" "<0>\x{00}abc\x{10ffff}" +"[\p{javaBadProperty}]" E "whatever" +"[\p{IsBadProperty}]" E "whatever" +"[\p{InBadBlock}]" E "whatever" +"[\p{In}]" E "whatever" +"[\p{Is}]" E "whatever" +"[\p{java}]" "x<0>ꦉ" # Note: "java" is a valid script code. + +"[\p{javaLowerCase}]+" "A<0>a" +"[\p{javaLowerCase}]+" i "<0>Aa" +"[\P{javaLowerCase}]+" "<0>Aa" +"[\P{javaLowerCase}]+" i "Aa" # No Match because case fold of the set happens first, then negation. + # JDK is not case insensitive w named properties, even though + # the insensitive match flag is set. A JDK bug? + +"[a-z]+" i "<0>Aa" # Matches JDK behavior. +"[^a-z]+" i "Aa" # (no match) which is JDK behavior. Case fold first, then negation. + +# Bug 20385. Assertion failure while compiling a negative look-behind expression consisting of a set with +# no contents. Meaning the [set] can never match. There is no syntax to directly express +# an empty set, so generate it by negating (^) a set of all code points. +# Also check empty sets in other contexts. + +"(?abc" + +"(?abc" +"x(?xabc" +"x(?xabc" +"x(?xabc" + +"[^\u0000-\U0010ffff]" "a" +"[^[^\u0000-\U0010ffff]]" "<0>a" + +"This is a string with (?:one |two |three )endings" "<0>This is a string with two endings" + +# Bug ICU-20544. Similar to 20385, above. Assertion failure with a negative look-behind assertion containing +# a set with no contents. Look-behind pattern includes more than just the empty set. + +"(?abc" # note: first 'ⰿ' is \u2c3f, hence empty set. +"(?abc" +"(?<=[^[^]]†)" "abc" # Problem also exists w positive look-behind + +# Bug ICU-20391. Crash in computation of minimum match length with nested look-around patterns. +# +"(?<=(?<=((?=)){0}+)" E "aaa" +"(?<=(?<=((?=)){0}+))" "<0>" +"(?<=c(?<=b((?=a)){1}+))" "aaa" +"abc(?=de(?=f))...g" "<0>abcdefg" +"abc(?=de(?=f))...g" "abcdxfg" + +# Bug ICU-20618 Assertion failure with nested look-around expressions. +# +"(?<=(?<=b?(?=a)))" "hello, world." + +# Bug ICU-20939 +# Incorrect word \b boundaries w UTF-8 input and non-ASCII text +# +"(?w)\b" v2 "äää<0> äää" + +# Bug ICU-21492 Assertion failure with nested look-around expressions. +# +"(?<=(?:(?<=(?:(?<=(?:(?<=)){2})){3})){4}" E "<0>" # orig failure from bug report, w mismatched parens. 
+"(?:(?<=(?:(?<=)){2}))" "<0>" # Simplified case, with a valid pattern. + +# Random debugging, Temporary +# + +# +# Regexps from http://www.regexlib.com +# +"^[a-zA-Z]{1,2}[0-9][0-9A-Za-z]{0,1} {0,1}[0-9][A-Za-z]{2}$" G "<0>G1 1AA" +"^[a-zA-Z]{1,2}[0-9][0-9A-Za-z]{0,1} {0,1}[0-9][A-Za-z]{2}$" G "<0>EH10 2QQ" +"^[a-zA-Z]{1,2}[0-9][0-9A-Za-z]{0,1} {0,1}[0-9][A-Za-z]{2}$" G "<0>SW1 1ZZ" +"^[a-zA-Z]{1,2}[0-9][0-9A-Za-z]{0,1} {0,1}[0-9][A-Za-z]{2}$" "G111 1AA" +"^[a-zA-Z]{1,2}[0-9][0-9A-Za-z]{0,1} {0,1}[0-9][A-Za-z]{2}$" "X10 WW" +"^[a-zA-Z]{1,2}[0-9][0-9A-Za-z]{0,1} {0,1}[0-9][A-Za-z]{2}$" "DDD 5WW" +#"^[\w\-]+(?:\.[\w\-]+)*@(?:[\w\-]+\.)+[a-zA-Z]{2,7}$" dG "<0>joe.tillis@unit.army.mil" # TODO: \w in pattern +#"^[\w-]+(?:\.[\w-]+)*@(?:[\w-]+\.)+[a-zA-Z]{2,7}$" G "<0>jack_rabbit@slims.com" # TODO: \w in pattern +#"^[\w-]+(?:\.[\w-]+)*@(?:[\w-]+\.)+[a-zA-Z]{2,7}$" G "<0>foo99@foo.co.uk" # TODO: \w in pattern +#"^[\w-]+(?:\.[\w-]+)*@(?:[\w-]+\.)+[a-zA-Z]{2,7}$" "find_the_mistake.@foo.org" # TODO: \w in pattern +#"^[\w-]+(?:\.[\w-]+)*@(?:[\w-]+\.)+[a-zA-Z]{2,7}$" ".prefix.@some.net" +"^([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)$" G "<0>asmith@mactec.com" +"^([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)$" G "<0>foo12@foo.edu" +"^([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)$" G "<0>bob.smith@foo.tv" +"^([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)$" "joe" +"^([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)$" "@foo.com" +"^([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)$" "a@a" +"^\d{1,2}\/\d{1,2}\/\d{4}$" G "<0>4/1/2001" +"^\d{1,2}\/\d{1,2}\/\d{4}$" G "<0>12/12/2001" +"^\d{1,2}\/\d{1,2}\/\d{4}$" G "<0>55/5/3434" +"^\d{1,2}\/\d{1,2}\/\d{4}$" "1/1/01" +"^\d{1,2}\/\d{1,2}\/\d{4}$" "12 Jan 01" +"^\d{1,2}\/\d{1,2}\/\d{4}$" "1-1-2001" +"^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" G "<0>01.1.02" +"^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" G "<0>11-30-2001" +"^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" G "<0>2/29/2000" +"^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" "02/29/01" 
+"^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" "13/01/2002" +"^(?:(?:(?:0?[13578]|1[02])(\/|-|\.)31)\1|(?:(?:0?[1,3-9]|1[0-2])(\/|-|\.)(?:29|30)\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:0?2(\/|-|\.)29\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:(?:0?[1-9])|(?:1[0-2]))(\/|-|\.)(?:0?[1-9]|1\d|2[0-8])\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" "11/00/02" +"^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$" G "<0>127.0.0.1" +"^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$" G "<0>255.255.255.0" +"^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$" G "<0>192.168.0.1" +"^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$" "1200.5.4.3" +"^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$" "abc.def.ghi.jkl" +"^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$" "255.foo.bar.1" +"(AUX|PRN|NUL|COM\d|LPT\d)+\s*$" G "<0>COM1" +"(AUX|PRN|NUL|COM\d|LPT\d)+\s*$" G "<0>AUX" +"(AUX|PRN|NUL|COM\d|LPT\d)+\s*$" G "<0>LPT1" +"(AUX|PRN|NUL|COM\d|LPT\d)+\s*$" "image.jpg" +"(AUX|PRN|NUL|COM\d|LPT\d)+\s*$" "index.html" +"(AUX|PRN|NUL|COM\d|LPT\d)+\s*$" "readme.txt" +"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" G "<0>29/02/1972" +"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" G "<0>5-9-98" 
+"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" G "<0>10-11-2002" +"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" "29/02/2003" +"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" "12/13/2002" +"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$" "1-1-1500" +"^(user=([a-z0-9]+,)*(([a-z0-9]+){1});)?(group=([a-z0-9]+,)*(([a-z0-9]+){1});)?(level=[0-9]+;)?$" G "<0>user=foo,bar,quux;group=manager,admin;level=100;" +"^(user=([a-z0-9]+,)*(([a-z0-9]+){1});)?(group=([a-z0-9]+,)*(([a-z0-9]+){1});)?(level=[0-9]+;)?$" G "<0>group=nobody;level=24;" +"^(user=([a-z0-9]+,)*(([a-z0-9]+){1});)?(group=([a-z0-9]+,)*(([a-z0-9]+){1});)?(level=[0-9]+;)?$" "user=foo" +"^(user=([a-z0-9]+,)*(([a-z0-9]+){1});)?(group=([a-z0-9]+,)*(([a-z0-9]+){1});)?(level=[0-9]+;)?$" "blahh" +"^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$" G "<0>(+44)(0)20-12341234" +"^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$" G "<0>02012341234" +"^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$" G "<0>+44 (0) 1234-1234" +"^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$" "(44+)020-12341234" +"^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$" "12341234(+020)" +"\b(\w+)\s+\1\b" G "<0>Tell the the preacher" +"\b(\w+)\s+\1\b" G "<0>some some" +"\b(\w+)\s+\1\b" G "<0>hubba hubba" +"\b(\w+)\s+\1\b" "once an annual report" +"\b(\w+)\s+\1\b" "mandate dated submissions" +"\b(\w+)\s+\1\b" "Hubba hubba" +"(^\+[0-9]{2}|^\+[0-9]{2}\(0\)|^\(\+[0-9]{2}\)\(0\)|^00[0-9]{2}|^0)([0-9]{9}$|[0-9\-\s]{10}$)" G "<0>+31235256677" +"(^\+[0-9]{2}|^\+[0-9]{2}\(0\)|^\(\+[0-9]{2}\)\(0\)|^00[0-9]{2}|^0)([0-9]{9}$|[0-9\-\s]{10}$)" G "<0>+31(0)235256677" +"(^\+[0-9]{2}|^\+[0-9]{2}\(0\)|^\(\+[0-9]{2}\)\(0\)|^00[0-9]{2}|^0)([0-9]{9}$|[0-9\-\s]{10}$)" G "<0>023-5256677" +"(^\+[0-9]{2}|^\+[0-9]{2}\(0\)|^\(\+[0-9]{2}\)\(0\)|^00[0-9]{2}|^0)([0-9]{9}$|[0-9\-\s]{10}$)" "+3123525667788999" +"(^\+[0-9]{2}|^\+[0-9]{2}\(0\)|^\(\+[0-9]{2}\)\(0\)|^00[0-9]{2}|^0)([0-9]{9}$|[0-9\-\s]{10}$)" "3123525667788" +"(^\+[0-9]{2}|^\+[0-9]{2}\(0\)|^\(\+[0-9]{2}\)\(0\)|^00[0-9]{2}|^0)([0-9]{9}$|[0-9\-\s]{10}$)" "232-2566778" +"^[-+]?\d*\.?\d*$" G "<0>123" +"^[-+]?\d*\.?\d*$" G "<0>+3.14159" +"^[-+]?\d*\.?\d*$" G "<0>-3.14159" +"^[-+]?\d*\.?\d*$" "abc" +"^[-+]?\d*\.?\d*$" "3.4.5" +"^[-+]?\d*\.?\d*$" "$99.95" +"^\$?([1-9]{1}[0-9]{0,2}(\,[0-9]{3})*(\.[0-9]{0,2})?|[1-9]{1}[0-9]{0,}(\.[0-9]{0,2})?|0(\.[0-9]{0,2})?|(\.[0-9]{1,2})?)$" G "<0>$1,234.50" +"^\$?([1-9]{1}[0-9]{0,2}(\,[0-9]{3})*(\.[0-9]{0,2})?|[1-9]{1}[0-9]{0,}(\.[0-9]{0,2})?|0(\.[0-9]{0,2})?|(\.[0-9]{1,2})?)$" G "<0>$0.70" 
+"^\$?([1-9]{1}[0-9]{0,2}(\,[0-9]{3})*(\.[0-9]{0,2})?|[1-9]{1}[0-9]{0,}(\.[0-9]{0,2})?|0(\.[0-9]{0,2})?|(\.[0-9]{1,2})?)$" G "<0>.7" +"^\$?([1-9]{1}[0-9]{0,2}(\,[0-9]{3})*(\.[0-9]{0,2})?|[1-9]{1}[0-9]{0,}(\.[0-9]{0,2})?|0(\.[0-9]{0,2})?|(\.[0-9]{1,2})?)$" "$0,123.50" +"^\$?([1-9]{1}[0-9]{0,2}(\,[0-9]{3})*(\.[0-9]{0,2})?|[1-9]{1}[0-9]{0,}(\.[0-9]{0,2})?|0(\.[0-9]{0,2})?|(\.[0-9]{1,2})?)$" "$00.5" +"^[A-Z]{2}[0-9]{6}[A-DFM]{1}$" G "<0>AB123456D" +"^[A-Z]{2}[0-9]{6}[A-DFM]{1}$" G "<0>AB123456F" +"^[A-Z]{2}[0-9]{6}[A-DFM]{1}$" G "<0>AB123456M" +"^[A-Z]{2}[0-9]{6}[A-DFM]{1}$" "AB123456E" +"^[A-Z]{2}[0-9]{6}[A-DFM]{1}$" "ab123456d" +#"(http|ftp|https):\/\/[\w]+(.[\w]+)([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?" G "<0>http://regxlib.com/Default.aspx" # TODO: \w in pattern +#"(http|ftp|https):\/\/[\w]+(.[\w]+)([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?" G "<0>http://electronics.cnet.com/electronics/0-6342366-8-8994967-1.html" # TODO: \w in pattern +#"(http|ftp|https):\/\/[\w]+(.[\w]+)([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?" "www.yahoo.com" # TODO: \w in pattern +"^[0-9]{4}\s{0,1}[a-zA-Z]{2}$" G "<0>2034AK" +"^[0-9]{4}\s{0,1}[a-zA-Z]{2}$" G "<0>2034 AK" +"^[0-9]{4}\s{0,1}[a-zA-Z]{2}$" G "<0>2034 ak" +"^[0-9]{4}\s{0,1}[a-zA-Z]{2}$" "2034 AK" +"^[0-9]{4}\s{0,1}[a-zA-Z]{2}$" "321321 AKSSAA" +"((\d{2})|(\d))\/((\d{2})|(\d))\/((\d{4})|(\d{2}))" G "<0>4/5/91" +"((\d{2})|(\d))\/((\d{2})|(\d))\/((\d{4})|(\d{2}))" G "<0>04/5/1991" +"((\d{2})|(\d))\/((\d{2})|(\d))\/((\d{4})|(\d{2}))" G "<0>4/05/89" +"((\d{2})|(\d))\/((\d{2})|(\d))\/((\d{4})|(\d{2}))" "4/5/1" +#"(^|\s|\()((([1-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-]((2[0-9]){1}|(3[01]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:])|(^|\s|\()((([0-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-](([11-31]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:|$|\>])){1}){1}){1}){1}" G "<0>01/01/2001 " #TODO - \s in pattern. 
+"(^|\s|\()((([1-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-]((2[0-9]){1}|(3[01]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:])|(^|\s|\()((([0-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-](([11-31]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:|$|\>])){1}){1}){1}){1}" G "<0>01-01-2001:" +"(^|\s|\()((([1-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-]((2[0-9]){1}|(3[01]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:])|(^|\s|\()((([0-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-](([11-31]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:|$|\>])){1}){1}){1}){1}" G "<0>(1-1-01)" +"(^|\s|\()((([1-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-]((2[0-9]){1}|(3[01]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:])|(^|\s|\()((([0-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-](([11-31]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:|$|\>])){1}){1}){1}){1}" "13/1/2001" +"(^|\s|\()((([1-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-]((2[0-9]){1}|(3[01]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:])|(^|\s|\()((([0-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-](([11-31]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:|$|\>])){1}){1}){1}){1}" "1-32-2001" +"(^|\s|\()((([1-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-]((2[0-9]){1}|(3[01]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:])|(^|\s|\()((([0-9]){1}|([0][1-9]){1}|([1][012]){1}){1}[\/-](([11-31]){1}|([01][1-9]){1}|([1-9]){1}){1}[\/-](((19|20)([0-9][0-9]){1}|([0-9][0-9]){1})){1}(([\s|\)|:|$|\>])){1}){1}){1}){1}" "1-1-1801" +"^\d{3}\s?\d{3}$" G "<0>400 099" +"^\d{3}\s?\d{3}$" G "<0>400099" +"^\d{3}\s?\d{3}$" G "<0>400050" +"^\d{3}\s?\d{3}$" "2345678" +"^\d{3}\s?\d{3}$" "12345" +"^\d{3}\s?\d{3}$" "asdf" +"^\D?(\d{3})\D?\D?(\d{3})\D?(\d{4})$" G "<0>(111) 222-3333" +"^\D?(\d{3})\D?\D?(\d{3})\D?(\d{4})$" G "<0>1112223333" +"^\D?(\d{3})\D?\D?(\d{3})\D?(\d{4})$" G "<0>111-222-3333" +"^\D?(\d{3})\D?\D?(\d{3})\D?(\d{4})$" "11122223333" +"^\D?(\d{3})\D?\D?(\d{3})\D?(\d{4})$" "11112223333" +"^\D?(\d{3})\D?\D?(\d{3})\D?(\d{4})$" "11122233333" +"^#?([a-f]|[A-F]|[0-9]){3}(([a-f]|[A-F]|[0-9]){3})?$" G "<0>#00ccff" +"^#?([a-f]|[A-F]|[0-9]){3}(([a-f]|[A-F]|[0-9]){3})?$" G "<0>#039" +"^#?([a-f]|[A-F]|[0-9]){3}(([a-f]|[A-F]|[0-9]){3})?$" G "<0>ffffcc" +"^#?([a-f]|[A-F]|[0-9]){3}(([a-f]|[A-F]|[0-9]){3})?$" "blue" +"^#?([a-f]|[A-F]|[0-9]){3}(([a-f]|[A-F]|[0-9]){3})?$" "0x000000" +"^#?([a-f]|[A-F]|[0-9]){3}(([a-f]|[A-F]|[0-9]){3})?$" "#ff000" +"^([0-9a-fA-F][0-9a-fA-F]:){5}([0-9a-fA-F][0-9a-fA-F])$" G "<0>01:23:45:67:89:ab" +"^([0-9a-fA-F][0-9a-fA-F]:){5}([0-9a-fA-F][0-9a-fA-F])$" G "<0>01:23:45:67:89:AB" +"^([0-9a-fA-F][0-9a-fA-F]:){5}([0-9a-fA-F][0-9a-fA-F])$" G "<0>fE:dC:bA:98:76:54" +"^([0-9a-fA-F][0-9a-fA-F]:){5}([0-9a-fA-F][0-9a-fA-F])$" "01:23:45:67:89:ab:cd" +"^([0-9a-fA-F][0-9a-fA-F]:){5}([0-9a-fA-F][0-9a-fA-F])$" "01:23:45:67:89:Az" +"^([0-9a-fA-F][0-9a-fA-F]:){5}([0-9a-fA-F][0-9a-fA-F])$" "01:23:45:56:" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" G "<0>http://www.blah.com/~joe" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" G 
"<0>ftp://ftp.blah.co.uk:2828/blah%20blah.gif" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" G "<0>https://blah.gov/blah-blah.as" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" "www.blah.com" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" "http://www.blah.com/I have spaces!" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" "ftp://blah_underscore/[nope]" +"^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" G "<0>12/01/2002" +"^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" G "<0>12/01/2002 12:32:10" +"^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" "32/12/2002" +"^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" "12/13/2001" +"^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" "12/02/06" +"^[0-9](\.[0-9]+)?$" G "<0>1.2345" +"^[0-9](\.[0-9]+)?$" G "<0>0.00001" +"^[0-9](\.[0-9]+)?$" G "<0>7" +"^[0-9](\.[0-9]+)?$" "12.2" +"^[0-9](\.[0-9]+)?$" "1.10.1" +"^[0-9](\.[0-9]+)?$" "15.98" +"^(?:[mM]{1,3})?(?:(?:[cC][dDmM])|(?:[dD]?(?:[cC]{1,3})?))?[lL]?(([xX])(?:\2{1,2}|[lL]|[cC])?)?((([iI])((\5{1,2})|[vV]|[xX]|[lL])?)|([vV]?([iI]{1,3})?))?$" G "<0>III" +"^(?:[mM]{1,3})?(?:(?:[cC][dDmM])|(?:[dD]?(?:[cC]{1,3})?))?[lL]?(([xX])(?:\2{1,2}|[lL]|[cC])?)?((([iI])((\5{1,2})|[vV]|[xX]|[lL])?)|([vV]?([iI]{1,3})?))?$" G "<0>xiv" +"^(?:[mM]{1,3})?(?:(?:[cC][dDmM])|(?:[dD]?(?:[cC]{1,3})?))?[lL]?(([xX])(?:\2{1,2}|[lL]|[cC])?)?((([iI])((\5{1,2})|[vV]|[xX]|[lL])?)|([vV]?([iI]{1,3})?))?$" G "<0>MCMLXLIX" +"^(?:[mM]{1,3})?(?:(?:[cC][dDmM])|(?:[dD]?(?:[cC]{1,3})?))?[lL]?(([xX])(?:\2{1,2}|[lL]|[cC])?)?((([iI])((\5{1,2})|[vV]|[xX]|[lL])?)|([vV]?([iI]{1,3})?))?$" "iiV" +"^(?:[mM]{1,3})?(?:(?:[cC][dDmM])|(?:[dD]?(?:[cC]{1,3})?))?[lL]?(([xX])(?:\2{1,2}|[lL]|[cC])?)?((([iI])((\5{1,2})|[vV]|[xX]|[lL])?)|([vV]?([iI]{1,3})?))?$" "MCCM" +"^(?:[mM]{1,3})?(?:(?:[cC][dDmM])|(?:[dD]?(?:[cC]{1,3})?))?[lL]?(([xX])(?:\2{1,2}|[lL]|[cC])?)?((([iI])((\5{1,2})|[vV]|[xX]|[lL])?)|([vV]?([iI]{1,3})?))?$" "XXXX" +"^[-+]?[0-9]+[.]?[0-9]*([eE][-+]?[0-9]+)?$" G "<0>123" +"^[-+]?[0-9]+[.]?[0-9]*([eE][-+]?[0-9]+)?$" G "<0>-123.35" +"^[-+]?[0-9]+[.]?[0-9]*([eE][-+]?[0-9]+)?$" G "<0>-123.35e-2" +"^[-+]?[0-9]+[.]?[0-9]*([eE][-+]?[0-9]+)?$" "abc" +"^[-+]?[0-9]+[.]?[0-9]*([eE][-+]?[0-9]+)?$" "123.32e" +"^[-+]?[0-9]+[.]?[0-9]*([eE][-+]?[0-9]+)?$" "123.32.3" +"^[a-zA-Z]+(([\'\,\.\- ][a-zA-Z ])?[a-zA-Z]*)*$" G "<0>T.F. 
Johnson" +"^[a-zA-Z]+(([\'\,\.\- ][a-zA-Z ])?[a-zA-Z]*)*$" G "<0>John O'Neil" +"^[a-zA-Z]+(([\'\,\.\- ][a-zA-Z ])?[a-zA-Z]*)*$" G "<0>Mary-Kate Johnson" +"^[a-zA-Z]+(([\'\,\.\- ][a-zA-Z ])?[a-zA-Z]*)*$" "sam_johnson" +"^[a-zA-Z]+(([\'\,\.\- ][a-zA-Z ])?[a-zA-Z]*)*$" "Joe--Bob Jones" +"^[a-zA-Z]+(([\'\,\.\- ][a-zA-Z ])?[a-zA-Z]*)*$" "dfjsd0rd" +"^(20|21|22|23|[0-1]\d)[0-5]\d$" G "<0>1200" +"^(20|21|22|23|[0-1]\d)[0-5]\d$" G "<0>1645" +"^(20|21|22|23|[0-1]\d)[0-5]\d$" G "<0>2359" +"^(20|21|22|23|[0-1]\d)[0-5]\d$" "2400" +"^(20|21|22|23|[0-1]\d)[0-5]\d$" "asbc" +"^(20|21|22|23|[0-1]\d)[0-5]\d$" "12:45" +/<[^>]*\n?.*=("|')?(.*\.jpg)("|')?.*\n?[^<]*>/ G '<0>' +/<[^>]*\n?.*=("|')?(.*\.jpg)("|')?.*\n?[^<]*>/ G "<0>" +/<[^>]*\n?.*=("|')?(.*\.jpg)("|')?.*\n?[^<]*>/ G "<0>" +/<[^>]*\n?.*=("|')?(.*\.jpg)("|')?.*\n?[^<]*>/ "= img.jpg" +/<[^>]*\n?.*=("|')?(.*\.jpg)("|')?.*\n?[^<]*>/ "img.jpg" +"^(\d{5}-\d{4}|\d{5})$|^([a-zA-Z]\d[a-zA-Z] \d[a-zA-Z]\d)$" G "<0>78754" +"^(\d{5}-\d{4}|\d{5})$|^([a-zA-Z]\d[a-zA-Z] \d[a-zA-Z]\d)$" G "<0>78754-1234" +"^(\d{5}-\d{4}|\d{5})$|^([a-zA-Z]\d[a-zA-Z] \d[a-zA-Z]\d)$" G "<0>G3H 6A3" +"^(\d{5}-\d{4}|\d{5})$|^([a-zA-Z]\d[a-zA-Z] \d[a-zA-Z]\d)$" "78754-12aA" +"^(\d{5}-\d{4}|\d{5})$|^([a-zA-Z]\d[a-zA-Z] \d[a-zA-Z]\d)$" "7875A" +"^(\d{5}-\d{4}|\d{5})$|^([a-zA-Z]\d[a-zA-Z] \d[a-zA-Z]\d)$" "g3h6a3" +#"^([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))$" G "<0>bob@somewhere.com" # TODO: \w in pattern +#"^([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))$" G "<0>bob.jones@[1.1.1.1]" +#"^([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))$" G "<0>bob@a.b.c.d.info" # TODO: \w in pattern +#"^([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))$" "bob@com" # TODO: \w in pattern +#"^([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))$" "bob.jones@some.where" # TODO: \w in pattern +#"^([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))$" "bob@1.1.1.123" # TODO: \w in pattern +#"^(([-\w \.]+)|(""[-\w \.]+"") )?<([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))>$" G "<0>" # TODO: \w in pattern +#"^(([-\w \.]+)|(""[-\w \.]+"") )?<([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))>$" G "<0>bob A. jones " # TODO: \w in pattern +#"^(([-\w \.]+)|(""[-\w \.]+"") )?<([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))>$" G "<0>bob A. jones " # TODO: \w in pattern +#"^(([-\w \.]+)|(""[-\w \.]+"") )?<([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))>$" "ab@cd.ef" # TODO: \w in pattern +#"^(([-\w \.]+)|(""[-\w \.]+"") )?<([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))>$" ""bob A. jones " # TODO: \w in pattern +#"^(([-\w \.]+)|(""[-\w \.]+"") )?<([\w\-\.]+)@((\[([0-9]{1,3}\.){3}[0-9]{1,3}\])|(([\w\-]+\.)+)([a-zA-Z]{2,4}))>$" "bob A. 
jones " # TODO: \w in pattern +"^[A-Za-z]{1,2}[0-9A-Za-z]{1,2}[ ]?[0-9]{0,1}[A-Za-z]{2}$" G "<0>SW112LE" +"^[A-Za-z]{1,2}[0-9A-Za-z]{1,2}[ ]?[0-9]{0,1}[A-Za-z]{2}$" G "<0>SW11 2LE" +"^[A-Za-z]{1,2}[0-9A-Za-z]{1,2}[ ]?[0-9]{0,1}[A-Za-z]{2}$" G "<0>CR05LE" +"^[A-Za-z]{1,2}[0-9A-Za-z]{1,2}[ ]?[0-9]{0,1}[A-Za-z]{2}$" "12CR0LE" +"^[A-Za-z]{1,2}[0-9A-Za-z]{1,2}[ ]?[0-9]{0,1}[A-Za-z]{2}$" "12CR 0LE" +"^[A-Za-z]{1,2}[0-9A-Za-z]{1,2}[ ]?[0-9]{0,1}[A-Za-z]{2}$" "SWLE05" +"20\d{2}(-|\/)((0[1-9])|(1[0-2]))(-|\/)((0[1-9])|([1-2][0-9])|(3[0-1]))(T|\s)(([0-1][0-9])|(2[0-3])):([0-5][0-9]):([0-5][0-9])" G "<0>2099-12-31T23:59:59" +"20\d{2}(-|\/)((0[1-9])|(1[0-2]))(-|\/)((0[1-9])|([1-2][0-9])|(3[0-1]))(T|\s)(([0-1][0-9])|(2[0-3])):([0-5][0-9]):([0-5][0-9])" G "<0>2002/02/09 16:30:00" +"20\d{2}(-|\/)((0[1-9])|(1[0-2]))(-|\/)((0[1-9])|([1-2][0-9])|(3[0-1]))(T|\s)(([0-1][0-9])|(2[0-3])):([0-5][0-9]):([0-5][0-9])" G "<0>2000-01-01T00:00:00" +"20\d{2}(-|\/)((0[1-9])|(1[0-2]))(-|\/)((0[1-9])|([1-2][0-9])|(3[0-1]))(T|\s)(([0-1][0-9])|(2[0-3])):([0-5][0-9]):([0-5][0-9])" "2000-13-31T00:00:00" +"20\d{2}(-|\/)((0[1-9])|(1[0-2]))(-|\/)((0[1-9])|([1-2][0-9])|(3[0-1]))(T|\s)(([0-1][0-9])|(2[0-3])):([0-5][0-9]):([0-5][0-9])" "2002/02/33 24:00:00" +"20\d{2}(-|\/)((0[1-9])|(1[0-2]))(-|\/)((0[1-9])|([1-2][0-9])|(3[0-1]))(T|\s)(([0-1][0-9])|(2[0-3])):([0-5][0-9]):([0-5][0-9])" "2000-01-01 60:00:00" +"^((?:4\d{3})|(?:5[1-5]\d{2})|(?:6011)|(?:3[68]\d{2})|(?:30[012345]\d))[ -]?(\d{4})[ -]?(\d{4})[ -]?(\d{4}|3[4,7]\d{13})$" G "<0>6011567812345678" +"^((?:4\d{3})|(?:5[1-5]\d{2})|(?:6011)|(?:3[68]\d{2})|(?:30[012345]\d))[ -]?(\d{4})[ -]?(\d{4})[ -]?(\d{4}|3[4,7]\d{13})$" G "<0>6011 5678 1234 5678" +"^((?:4\d{3})|(?:5[1-5]\d{2})|(?:6011)|(?:3[68]\d{2})|(?:30[012345]\d))[ -]?(\d{4})[ -]?(\d{4})[ -]?(\d{4}|3[4,7]\d{13})$" G "<0>6011-5678-1234-5678" +"^((?:4\d{3})|(?:5[1-5]\d{2})|(?:6011)|(?:3[68]\d{2})|(?:30[012345]\d))[ -]?(\d{4})[ -]?(\d{4})[ -]?(\d{4}|3[4,7]\d{13})$" "1234567890123456" +"^((((0[13578])|(1[02]))[\/]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\/]?(([0-2][0-9])|(30)))|(02[\/]?[0-2][0-9]))[\/]?\d{4}$" G "<0>01/01/2001" +"^((((0[13578])|(1[02]))[\/]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\/]?(([0-2][0-9])|(30)))|(02[\/]?[0-2][0-9]))[\/]?\d{4}$" G "<0>02/29/2002" +"^((((0[13578])|(1[02]))[\/]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\/]?(([0-2][0-9])|(30)))|(02[\/]?[0-2][0-9]))[\/]?\d{4}$" G "<0>12/31/2002" +"^((((0[13578])|(1[02]))[\/]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\/]?(([0-2][0-9])|(30)))|(02[\/]?[0-2][0-9]))[\/]?\d{4}$" "1/1/02" +"^((((0[13578])|(1[02]))[\/]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\/]?(([0-2][0-9])|(30)))|(02[\/]?[0-2][0-9]))[\/]?\d{4}$" "02/30/2002" +"^((((0[13578])|(1[02]))[\/]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\/]?(([0-2][0-9])|(30)))|(02[\/]?[0-2][0-9]))[\/]?\d{4}$" "1/25/2002" +#"^(?=[^\&])(?:(?[^:/?#]+):)?(?://(?[^/?#]*))?(?[^?#]*)(?:\?(?[^#]*))?(?:#(?.*))?" G "<0>http://regexlib.com/REDetails.aspx?regexp_id=x#Details" # out of context, can't work stand-alone +#"^(?=[^\&])(?:(?[^:/?#]+):)?(?://(?[^/?#]*))?(?[^?#]*)(?:\?(?[^#]*))?(?:#(?.*))?" "&" # out of context, can't work stand-alone +"^[-+]?\d+(\.\d+)?$" G "<0>123" +"^[-+]?\d+(\.\d+)?$" G "<0>-123.45" +"^[-+]?\d+(\.\d+)?$" G "<0>+123.56" +"^[-+]?\d+(\.\d+)?$" "123x" +"^[-+]?\d+(\.\d+)?$" ".123" +"^[-+]?\d+(\.\d+)?$" "-123." 
+"^(\d{4}[- ]){3}\d{4}|\d{16}$" G "<0>1234-1234-1234-1234" +"^(\d{4}[- ]){3}\d{4}|\d{16}$" G "<0>1234 1234 1234 1234" +"^(\d{4}[- ]){3}\d{4}|\d{16}$" G "<0>1234123412341234" +"^(\d{4}[- ]){3}\d{4}|\d{16}$" "Visa" +"^(\d{4}[- ]){3}\d{4}|\d{16}$" "1234" +"^(\d{4}[- ]){3}\d{4}|\d{16}$" "123-1234-12345" +"^((4\d{3})|(5[1-5]\d{2})|(6011))-?\d{4}-?\d{4}-?\d{4}|3[4,7]\d{13}$" G "<0>6011-1111-1111-1111" +"^((4\d{3})|(5[1-5]\d{2})|(6011))-?\d{4}-?\d{4}-?\d{4}|3[4,7]\d{13}$" G "<0>5423-1111-1111-1111" +"^((4\d{3})|(5[1-5]\d{2})|(6011))-?\d{4}-?\d{4}-?\d{4}|3[4,7]\d{13}$" G "<0>341111111111111" +"^((4\d{3})|(5[1-5]\d{2})|(6011))-?\d{4}-?\d{4}-?\d{4}|3[4,7]\d{13}$" "4111-111-111-111" +"^((4\d{3})|(5[1-5]\d{2})|(6011))-?\d{4}-?\d{4}-?\d{4}|3[4,7]\d{13}$" "3411-1111-1111-111" +"^((4\d{3})|(5[1-5]\d{2})|(6011))-?\d{4}-?\d{4}-?\d{4}|3[4,7]\d{13}$" "Visa" +"^[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}$" G "<0>4D28C5AD-6482-41CD-B84E-4573F384BB5C" +"^[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}$" G "<0>B1E1282C-A35C-4D5A-BF8B-7A3A51D9E388" +"^[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}$" G "91036A4A-A0F4-43F0-8CD" +"^[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}$" "{B1E1282C-A35C-4D3A-BF8B-7A3A51D9E388}" +"^[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}$" "AAAAAAAAAAAAAAAAA" +"^[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}$" "B;E1282C-A35C-4D3A-BF8B-7A3A51D9E38" +"(^(4|5)\d{3}-?\d{4}-?\d{4}-?\d{4}|(4|5)\d{15})|(^(6011)-?\d{4}-?\d{4}-?\d{4}|(6011)-?\d{12})|(^((3\d{3}))-\d{6}-\d{5}|^((3\d{14})))" G "<0>4111-1234-1234-1234" +"(^(4|5)\d{3}-?\d{4}-?\d{4}-?\d{4}|(4|5)\d{15})|(^(6011)-?\d{4}-?\d{4}-?\d{4}|(6011)-?\d{12})|(^((3\d{3}))-\d{6}-\d{5}|^((3\d{14})))" G "<0>6011123412341234" +"(^(4|5)\d{3}-?\d{4}-?\d{4}-?\d{4}|(4|5)\d{15})|(^(6011)-?\d{4}-?\d{4}-?\d{4}|(6011)-?\d{12})|(^((3\d{3}))-\d{6}-\d{5}|^((3\d{14})))" G "<0>3711-123456-12345" +"(^(4|5)\d{3}-?\d{4}-?\d{4}-?\d{4}|(4|5)\d{15})|(^(6011)-?\d{4}-?\d{4}-?\d{4}|(6011)-?\d{12})|(^((3\d{3}))-\d{6}-\d{5}|^((3\d{14})))" "1234567890123456" +"(^(4|5)\d{3}-?\d{4}-?\d{4}-?\d{4}|(4|5)\d{15})|(^(6011)-?\d{4}-?\d{4}-?\d{4}|(6011)-?\d{12})|(^((3\d{3}))-\d{6}-\d{5}|^((3\d{14})))" "4111-123-1234-1234" +"(^(4|5)\d{3}-?\d{4}-?\d{4}-?\d{4}|(4|5)\d{15})|(^(6011)-?\d{4}-?\d{4}-?\d{4}|(6011)-?\d{12})|(^((3\d{3}))-\d{6}-\d{5}|^((3\d{14})))" "412-1234-1234-1234" +#'\[link="(?((.|\n)*?))"\](?((.|\n)*?))\[\/link\]' G '<0>[link="http://www.yahoo.com"]Yahoo[/link]' #named capture +#'\[link="(?((.|\n)*?))"\](?((.|\n)*?))\[\/link\]' "[link]http://www.yahoo.com[/link]" #named capture +#'\[link="(?((.|\n)*?))"\](?((.|\n)*?))\[\/link\]' "[link=http://www.yahoo.com]Yahoo[/link]" #named capture +"^[a-zA-Z0-9]+$" G "<0>10a" +"^[a-zA-Z0-9]+$" G "<0>ABC" +"^[a-zA-Z0-9]+$" G "<0>A3fg" +"^[a-zA-Z0-9]+$" "45.3" +"^[a-zA-Z0-9]+$" "this or that" +"^[a-zA-Z0-9]+$" "$23" +"((\(\d{3}\) ?)|(\d{3}-))?\d{3}-\d{4}" G "<0>(123) 456-7890" +"((\(\d{3}\) ?)|(\d{3}-))?\d{3}-\d{4}" G "<0>123-456-7890" +"((\(\d{3}\) ?)|(\d{3}-))?\d{3}-\d{4}" "1234567890" +"^[a-zA-Z]\w{3,14}$" G "<0>abcd" +"^[a-zA-Z]\w{3,14}$" G "<0>aBc45DSD_sdf" +"^[a-zA-Z]\w{3,14}$" G "<0>password" +"^[a-zA-Z]\w{3,14}$" "afv" +"^[a-zA-Z]\w{3,14}$" "1234" +"^[a-zA-Z]\w{3,14}$" "reallylongpassword" +"^[A-Z]{1,2}[1-9][0-9]?[A-Z]? [0-9][A-Z]{2,}|GIR 0AA$" G "<0>G1 1AA " +"^[A-Z]{1,2}[1-9][0-9]?[A-Z]? [0-9][A-Z]{2,}|GIR 0AA$" G "<0>GIR 0AA" +"^[A-Z]{1,2}[1-9][0-9]?[A-Z]? [0-9][A-Z]{2,}|GIR 0AA$" G "<0>SW1 1ZZ" +"^[A-Z]{1,2}[1-9][0-9]?[A-Z]? 
[0-9][A-Z]{2,}|GIR 0AA$" "BT01 3RT" +"^[A-Z]{1,2}[1-9][0-9]?[A-Z]? [0-9][A-Z]{2,}|GIR 0AA$" "G111 1AA" +"^0[23489]{1}(\-)?[^0\D]{1}\d{6}$" G "<0>03-6106666" +"^0[23489]{1}(\-)?[^0\D]{1}\d{6}$" G "<0>036106666" +"^0[23489]{1}(\-)?[^0\D]{1}\d{6}$" G "<0>02-5523344" +"^0[23489]{1}(\-)?[^0\D]{1}\d{6}$" "00-6106666" +"^0[23489]{1}(\-)?[^0\D]{1}\d{6}$" "03-0106666" +"^0[23489]{1}(\-)?[^0\D]{1}\d{6}$" "02-55812346" +"^0(5[012345678]|6[47]){1}(\-)?[^0\D]{1}\d{5}$" G "<0>050-346634" +"^0(5[012345678]|6[47]){1}(\-)?[^0\D]{1}\d{5}$" G "<0>058633633" +"^0(5[012345678]|6[47]){1}(\-)?[^0\D]{1}\d{5}$" G "<0>064-228226" +"^0(5[012345678]|6[47]){1}(\-)?[^0\D]{1}\d{5}$" "059-336622" +"^0(5[012345678]|6[47]){1}(\-)?[^0\D]{1}\d{5}$" "064-022663" +"^0(5[012345678]|6[47]){1}(\-)?[^0\D]{1}\d{5}$" "0545454545" +"^([A-Z]{1,2}[0-9]{1,2}|[A-Z]{3}|[A-Z]{1,2}[0-9][A-Z])( |-)[0-9][A-Z]{2}" G "<0>AA11 1AA" +"^([A-Z]{1,2}[0-9]{1,2}|[A-Z]{3}|[A-Z]{1,2}[0-9][A-Z])( |-)[0-9][A-Z]{2}" G "<0>AA1A 1AA" +"^([A-Z]{1,2}[0-9]{1,2}|[A-Z]{3}|[A-Z]{1,2}[0-9][A-Z])( |-)[0-9][A-Z]{2}" G "<0>A11-1AA" +"^([A-Z]{1,2}[0-9]{1,2}|[A-Z]{3}|[A-Z]{1,2}[0-9][A-Z])( |-)[0-9][A-Z]{2}" "111 AAA" +"^([A-Z]{1,2}[0-9]{1,2}|[A-Z]{3}|[A-Z]{1,2}[0-9][A-Z])( |-)[0-9][A-Z]{2}" "1AAA 1AA" +"^([A-Z]{1,2}[0-9]{1,2}|[A-Z]{3}|[A-Z]{1,2}[0-9][A-Z])( |-)[0-9][A-Z]{2}" "A1AA 1AA" +"@{2}((\S)+)@{2}" G "<0>@@test@@" +"@{2}((\S)+)@{2}" G "<0>@@name@@" +"@{2}((\S)+)@{2}" G "<0>@@2342@@" +"@{2}((\S)+)@{2}" "@test@" +"@{2}((\S)+)@{2}" "@@na me@@" +"@{2}((\S)+)@{2}" "@@ name@@" +"([0-1][0-9]|2[0-3]):[0-5][0-9]" G "<0>00:00" +"([0-1][0-9]|2[0-3]):[0-5][0-9]" G "<0>13:59" +"([0-1][0-9]|2[0-3]):[0-5][0-9]" G "<0>23:59" +"([0-1][0-9]|2[0-3]):[0-5][0-9]" "24:00" +"([0-1][0-9]|2[0-3]):[0-5][0-9]" "23:60" +"^[+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)?$" G "<0>23" +"^[+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)?$" G "<0>-17.e23" +"^[+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)?$" G "<0>+.23e+2" +"^[+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)?$" "+.e2" +"^[+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)?$" "23.17.5" +"^[+-]?([0-9]*\.?[0-9]+|[0-9]+\.?[0-9]*)([eE][+-]?[0-9]+)?$" "10e2.0" +"^([1-zA-Z0-1@.\s ]{1,255})$" G "<0>email@email.com" +"^([1-zA-Z0-1@.\s ]{1,255})$" G "<0>My Name" +"^([1-zA-Z0-1@.\s ]{1,255})$" G "<0>asdf12df" +"^([1-zA-Z0-1@.\s ]{1,255})$" "‘,\*&$<>" +"^([1-zA-Z0-1@.\s ]{1,255})$" "1001' string" +"^((0[1-9])|(1[0-2]))\/(\d{4})$" G "<0>12/2002" +"^((0[1-9])|(1[0-2]))\/(\d{4})$" G "<0>11/1900" +"^((0[1-9])|(1[0-2]))\/(\d{4})$" G "<0>02/1977" +"^((0[1-9])|(1[0-2]))\/(\d{4})$" "1/1977" +"^((0[1-9])|(1[0-2]))\/(\d{4})$" "00/000" +"^((0[1-9])|(1[0-2]))\/(\d{4})$" "15/2002" +"^\(\d{1,2}(\s\d{1,2}){1,2}\)\s(\d{1,2}(\s\d{1,2}){1,2})((-(\d{1,4})){0,1})$" G "<0>(0 34 56) 34 56 67" +"^\(\d{1,2}(\s\d{1,2}){1,2}\)\s(\d{1,2}(\s\d{1,2}){1,2})((-(\d{1,4})){0,1})$" G "<0>(03 45) 5 67 67" +"^\(\d{1,2}(\s\d{1,2}){1,2}\)\s(\d{1,2}(\s\d{1,2}){1,2})((-(\d{1,4})){0,1})$" G "<0>(0 45) 2 33 45-45" +"^\(\d{1,2}(\s\d{1,2}){1,2}\)\s(\d{1,2}(\s\d{1,2}){1,2})((-(\d{1,4})){0,1})$" "(2345) 34 34" +"^\(\d{1,2}(\s\d{1,2}){1,2}\)\s(\d{1,2}(\s\d{1,2}){1,2})((-(\d{1,4})){0,1})$" "(0 56) 456 456" +"^\(\d{1,2}(\s\d{1,2}){1,2}\)\s(\d{1,2}(\s\d{1,2}){1,2})((-(\d{1,4})){0,1})$" "(3 45) 2 34-45678" +"(?:\d|I{1,3})?\s?\w{2,}\.?\s*\d{1,}\:\d{1,}-?,?\d{0,2}(?:,\d{0,2}){0,2}" G "<0>Genesis 3:3-4,6" +"(?:\d|I{1,3})?\s?\w{2,}\.?\s*\d{1,}\:\d{1,}-?,?\d{0,2}(?:,\d{0,2}){0,2}" G "<0>II Sam 2:11,2" 
+"(?:\d|I{1,3})?\s?\w{2,}\.?\s*\d{1,}\:\d{1,}-?,?\d{0,2}(?:,\d{0,2}){0,2}" G "<0>2 Tim 3:16" +"(?:\d|I{1,3})?\s?\w{2,}\.?\s*\d{1,}\:\d{1,}-?,?\d{0,2}(?:,\d{0,2}){0,2}" "Genesis chap 3, verse 3" +"(?:\d|I{1,3})?\s?\w{2,}\.?\s*\d{1,}\:\d{1,}-?,?\d{0,2}(?:,\d{0,2}){0,2}" "2nd Samuel 2" +"(\[[Ii][Mm][Gg]\])(\S+?)(\[\/[Ii][Mm][Gg]\])" G "<0>[IMG]http://bleh.jpg[/IMG]" +"(\[[Ii][Mm][Gg]\])(\S+?)(\[\/[Ii][Mm][Gg]\])" G "<0>[ImG]bleh[/imG]" +"(\[[Ii][Mm][Gg]\])(\S+?)(\[\/[Ii][Mm][Gg]\])" G "<0>[img]ftp://login:pass@bleh.gif[/img]" +"(\[[Ii][Mm][Gg]\])(\S+?)(\[\/[Ii][Mm][Gg]\])" '' +"^([0-9]{1,2})[./-]+([0-9]{1,2})[./-]+([0-9]{2}|[0-9]{4})$" G "<0>10/03/1979" +"^([0-9]{1,2})[./-]+([0-9]{1,2})[./-]+([0-9]{2}|[0-9]{4})$" G "<0>1-1-02" +"^([0-9]{1,2})[./-]+([0-9]{1,2})[./-]+([0-9]{2}|[0-9]{4})$" G "<0>01.1.2003" +"^([0-9]{1,2})[./-]+([0-9]{1,2})[./-]+([0-9]{2}|[0-9]{4})$" "10/03/197" +"^([0-9]{1,2})[./-]+([0-9]{1,2})[./-]+([0-9]{2}|[0-9]{4})$" "01-02-003" +"^([0-9]{1,2})[./-]+([0-9]{1,2})[./-]+([0-9]{2}|[0-9]{4})$" "01 02 03" +#"^(?(^00000(|-0000))|(\d{5}(|-\d{4})))$" G "<0>12345" # No Conditionals? +#"^(?(^00000(|-0000))|(\d{5}(|-\d{4})))$" G "<0>12345-6789" # No Conditionals? +#"^(?(^00000(|-0000))|(\d{5}(|-\d{4})))$" "00000" # No Conditionals? +#"^(?(^00000(|-0000))|(\d{5}(|-\d{4})))$" "00000-0000" # No Conditionals? +#"^(?(^00000(|-0000))|(\d{5}(|-\d{4})))$" "a4650-465s" # No Conditionals? +"^((0?[1-9])|((1|2)[0-9])|30|31)$" G "<0>01" +"^((0?[1-9])|((1|2)[0-9])|30|31)$" G "<0>12" +"^((0?[1-9])|((1|2)[0-9])|30|31)$" G "<0>31" +"^((0?[1-9])|((1|2)[0-9])|30|31)$" "123" +"^((0?[1-9])|((1|2)[0-9])|30|31)$" "32" +"^((0?[1-9])|((1|2)[0-9])|30|31)$" "abc" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?(\d{3}([\s\-./\\])?\d{4}|[a-zA-Z0-9]{7})$" G "<0>1.222.333.1234" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?(\d{3}([\s\-./\\])?\d{4}|[a-zA-Z0-9]{7})$" G "<0>1-223-123-1232" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?(\d{3}([\s\-./\\])?\d{4}|[a-zA-Z0-9]{7})$" G "<0>12223334444" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?(\d{3}([\s\-./\\])?\d{4}|[a-zA-Z0-9]{7})$" "1.1.123123.123" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?(\d{3}([\s\-./\\])?\d{4}|[a-zA-Z0-9]{7})$" "12-1322-112-31" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?(\d{3}([\s\-./\\])?\d{4}|[a-zA-Z0-9]{7})$" "11231321131" +"^([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)$" G "<0>DN3 6GB" +"^([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)$" G "<0>SW42 4RG" +"^([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)$" G "<0>GIR 0AA" +"^([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)$" "SEW4 5TY" +"^([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)$" "AA2C 4FG" +"^([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? 
{1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)$" "AA2 4CV" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$" G "<0>asD1" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$" G "<0>asDF1234" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$" G "<0>ASPgo123" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$" "asdf" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$" "1234" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$" "ASDF12345" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?([0-9]{3}([\s\-./\\])?[0-9]{4}|[a-zA-Z0-9]{7}|([0-9]{3}[-][a-zA-Z0-9]{4}))" G "<0>1.222.333.1234" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?([0-9]{3}([\s\-./\\])?[0-9]{4}|[a-zA-Z0-9]{7}|([0-9]{3}[-][a-zA-Z0-9]{4}))" G "<0>1-223-123-1232" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?([0-9]{3}([\s\-./\\])?[0-9]{4}|[a-zA-Z0-9]{7}|([0-9]{3}[-][a-zA-Z0-9]{4}))" G "<0>1-888-425-DELL" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?([0-9]{3}([\s\-./\\])?[0-9]{4}|[a-zA-Z0-9]{7}|([0-9]{3}[-][a-zA-Z0-9]{4}))" "1.1.123123.123" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?([0-9]{3}([\s\-./\\])?[0-9]{4}|[a-zA-Z0-9]{7}|([0-9]{3}[-][a-zA-Z0-9]{4}))" "12-1322-112-31" +"^([0-1]([\s\-./\\])?)?(\(?[2-9]\d{2}\)?|[2-9]\d{3})([\s\-./\\])?([0-9]{3}([\s\-./\\])?[0-9]{4}|[a-zA-Z0-9]{7}|([0-9]{3}[-][a-zA-Z0-9]{4}))" "1-800-CALL-DEL" +"^(([0]?[1-9]|1[0-2])(:)([0-5][0-9]))$" G "<0>09:00" +"^(([0]?[1-9]|1[0-2])(:)([0-5][0-9]))$" G "<0>9:00" +"^(([0]?[1-9]|1[0-2])(:)([0-5][0-9]))$" G "<0>11:35" +"^(([0]?[1-9]|1[0-2])(:)([0-5][0-9]))$" "13:00" +"^(([0]?[1-9]|1[0-2])(:)([0-5][0-9]))$" "9.00" +"^(([0]?[1-9]|1[0-2])(:)([0-5][0-9]))$" "6:60" +"^([1-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])$" G "<0>1" +"^([1-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])$" G "<0>108" +"^([1-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])$" G "<0>255" +"^([1-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])$" "01" +"^([1-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])$" "256" +"^((((0[13578])|([13578])|(1[02]))[\/](([1-9])|([0-2][0-9])|(3[01])))|(((0[469])|([469])|(11))[\/](([1-9])|([0-2][0-9])|(30)))|((2|02)[\/](([1-9])|([0-2][0-9]))))[\/]\d{4}$|^\d{4}$" G "<0>01/01/2001" +"^((((0[13578])|([13578])|(1[02]))[\/](([1-9])|([0-2][0-9])|(3[01])))|(((0[469])|([469])|(11))[\/](([1-9])|([0-2][0-9])|(30)))|((2|02)[\/](([1-9])|([0-2][0-9]))))[\/]\d{4}$|^\d{4}$" G "<0>1/01/2001" +"^((((0[13578])|([13578])|(1[02]))[\/](([1-9])|([0-2][0-9])|(3[01])))|(((0[469])|([469])|(11))[\/](([1-9])|([0-2][0-9])|(30)))|((2|02)[\/](([1-9])|([0-2][0-9]))))[\/]\d{4}$|^\d{4}$" G "<0>2002" +"^((((0[13578])|([13578])|(1[02]))[\/](([1-9])|([0-2][0-9])|(3[01])))|(((0[469])|([469])|(11))[\/](([1-9])|([0-2][0-9])|(30)))|((2|02)[\/](([1-9])|([0-2][0-9]))))[\/]\d{4}$|^\d{4}$" "2/30/2002" +"^((((0[13578])|([13578])|(1[02]))[\/](([1-9])|([0-2][0-9])|(3[01])))|(((0[469])|([469])|(11))[\/](([1-9])|([0-2][0-9])|(30)))|((2|02)[\/](([1-9])|([0-2][0-9]))))[\/]\d{4}$|^\d{4}$" "13/23/2002" +"^((((0[13578])|([13578])|(1[02]))[\/](([1-9])|([0-2][0-9])|(3[01])))|(((0[469])|([469])|(11))[\/](([1-9])|([0-2][0-9])|(30)))|((2|02)[\/](([1-9])|([0-2][0-9]))))[\/]\d{4}$|^\d{4}$" "12345" +"^[A-Za-z]{2}[0-9]{6}[A-Za-z]{1}$" G "<0>SP939393H" +"^[A-Za-z]{2}[0-9]{6}[A-Za-z]{1}$" G "<0>PX123456D" +"^[A-Za-z]{2}[0-9]{6}[A-Za-z]{1}$" G "<0>SW355667G" +"^[A-Za-z]{2}[0-9]{6}[A-Za-z]{1}$" "12SP9393H" +"^[A-Za-z]{2}[0-9]{6}[A-Za-z]{1}$" "S3P93930D" +"^[A-Za-z]{2}[0-9]{6}[A-Za-z]{1}$" "11223344SP00ddSS" +"(^0[78][2347][0-9]{7})" G "<0>0834128458" +"(^0[78][2347][0-9]{7})" G "<0>0749526308" +"(^0[78][2347][0-9]{7})" "0861212308" +"(^0[78][2347][0-9]{7})" 
"0892549851" +"^([A-HJ-TP-Z]{1}\d{4}[A-Z]{3}|[a-z]{1}\d{4}[a-hj-tp-z]{3})$" G "<0>C1406HHA" +"^([A-HJ-TP-Z]{1}\d{4}[A-Z]{3}|[a-z]{1}\d{4}[a-hj-tp-z]{3})$" G "<0>A4126AAB" +"^([A-HJ-TP-Z]{1}\d{4}[A-Z]{3}|[a-z]{1}\d{4}[a-hj-tp-z]{3})$" G "<0>c1406hha" +"^([A-HJ-TP-Z]{1}\d{4}[A-Z]{3}|[a-z]{1}\d{4}[a-hj-tp-z]{3})$" "c1406HHA" +"^([A-HJ-TP-Z]{1}\d{4}[A-Z]{3}|[a-z]{1}\d{4}[a-hj-tp-z]{3})$" "4126" +"^([A-HJ-TP-Z]{1}\d{4}[A-Z]{3}|[a-z]{1}\d{4}[a-hj-tp-z]{3})$" "C1406hha" +"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" G "<0>66.129.71.120" +"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" G "<0>207.46.230.218" +"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" G "<0>64.58.76.225" +"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" "10.0.5.4" +"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" "192.168.0.1" +"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" "my ip address" +"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo.com" +"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo-foo.com.au" +"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo.foo.info" 
+"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@.com" +"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@foo..com" +"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@me@.com" +"/\*[\d\D]*?\*/" G "<0>/* my comment */" +"/\*[\d\D]*?\*/" G "<0>/* my multiline comment */" +"/\*[\d\D]*?\*/" G "<0>/* my nested comment */" +"/\*[\d\D]*?\*/" "*/ anything here /*" +"/\*[\d\D]*?\*/" "anything between 2 separate comments" +"/\*[\d\D]*?\*/" "\* *\\" +"/\*[\p{N}\P{N}]*?\*/" G "<0>/* my comment */" +"/\*[\p{N}\P{N}]*?\*/" G "<0>/* my multiline comment */" +"/\*[\p{N}\P{N}]*?\*/" G "<0>/* my nested comment */" +"/\*[\p{N}\P{N}]*?\*/" "*/ anything here /*" +"/\*[\p{N}\P{N}]*?\*/" "anything between 2 separate comments" +"/\*[\p{N}\P{N}]*?\*/" "\* *\\" +"((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((\d{4})|(\d{2}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((\d{4}|\d{2})))" G "<0>1/31/2002" +"((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((\d{4})|(\d{2}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((\d{4}|\d{2})))" G "<0>04-30-02" +"((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((\d{4})|(\d{2}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((\d{4}|\d{2})))" G "<0>12-01/2002" +"((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((\d{4})|(\d{2}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((\d{4}|\d{2})))" "2/31/2002" +"((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((\d{4})|(\d{2}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((\d{4}|\d{2})))" "13/0/02" +"((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((\d{4})|(\d{2}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((\d{4}|\d{2})))" "Jan 1, 2001" +'^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$' G "<0>blah@[10.0.0.1]" +'^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$' G "<0>a@b.c" +'^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$' "non@match@." 
+"^\d{9}[\d|X]$" G "<0>1234123412" +"^\d{9}[\d|X]$" G "<0>123412341X" +"^\d{9}[\d|X]$" "not an isbn" +"^\d{9}(\d|X)$" G "<0>1234123412" +"^\d{9}(\d|X)$" G "<0>123412341X" +"^\d{9}(\d|X)$" "not an isbn" +"^(([1-9])|(0[1-9])|(1[0-2]))\/(([0-9])|([0-2][0-9])|(3[0-1]))\/(([0-9][0-9])|([1-2][0,9][0-9][0-9]))$" G "<0>01/01/2001" +"^(([1-9])|(0[1-9])|(1[0-2]))\/(([0-9])|([0-2][0-9])|(3[0-1]))\/(([0-9][0-9])|([1-2][0,9][0-9][0-9]))$" G "<0>1/1/1999" +"^(([1-9])|(0[1-9])|(1[0-2]))\/(([0-9])|([0-2][0-9])|(3[0-1]))\/(([0-9][0-9])|([1-2][0,9][0-9][0-9]))$" G "<0>10/20/2080" +"^(([1-9])|(0[1-9])|(1[0-2]))\/(([0-9])|([0-2][0-9])|(3[0-1]))\/(([0-9][0-9])|([1-2][0,9][0-9][0-9]))$" "13/01/2001" +"^(([1-9])|(0[1-9])|(1[0-2]))\/(([0-9])|([0-2][0-9])|(3[0-1]))\/(([0-9][0-9])|([1-2][0,9][0-9][0-9]))$" "1/1/1800" +"^(([1-9])|(0[1-9])|(1[0-2]))\/(([0-9])|([0-2][0-9])|(3[0-1]))\/(([0-9][0-9])|([1-2][0,9][0-9][0-9]))$" "10/32/2080" +"^\d*\.?((25)|(50)|(5)|(75)|(0)|(00))?$" G "<0>0.25" +"^\d*\.?((25)|(50)|(5)|(75)|(0)|(00))?$" G "<0>.75" +"^\d*\.?((25)|(50)|(5)|(75)|(0)|(00))?$" G "<0>123.50" +"^\d*\.?((25)|(50)|(5)|(75)|(0)|(00))?$" ".77" +"^\d*\.?((25)|(50)|(5)|(75)|(0)|(00))?$" "1.435" +"^(s-|S-){0,1}[0-9]{3}\s?[0-9]{2}$" G "<0>12345" +"^(s-|S-){0,1}[0-9]{3}\s?[0-9]{2}$" G "<0>932 68" +"^(s-|S-){0,1}[0-9]{3}\s?[0-9]{2}$" G "<0>S-621 46" +"^(s-|S-){0,1}[0-9]{3}\s?[0-9]{2}$" "5367" +"^(s-|S-){0,1}[0-9]{3}\s?[0-9]{2}$" "425611" +"^(s-|S-){0,1}[0-9]{3}\s?[0-9]{2}$" "31 545" +"^\d{5}(-\d{4})?$" G "<0>48222" +"^\d{5}(-\d{4})?$" G "<0>48222-1746" +"^\d{5}(-\d{4})?$" "4632" +"^\d{5}(-\d{4})?$" "Blake" +"^\d{5}(-\d{4})?$" "37333-32" +'^(?!^(PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(\..+)?$)[^\x00-\x1f\\?*:\";|/]+$' G "<0>test.txt" +'^(?!^(PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(\..+)?$)[^\x00-\x1f\\?*:\";|/]+$' G "<0>test.jpg.txt" +'^(?!^(PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(\..+)?$)[^\x00-\x1f\\?*:\";|/]+$' G "<0>a&b c.bmp" +'^(?!^(PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(\..+)?$)[^\x00-\x1f\\?*:\";|/]+$' "CON" +'^(?!^(PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(\..+)?$)[^\x00-\x1f\\?*:\";|/]+$' ".pdf" +'^(?!^(PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(\..+)?$)[^\x00-\x1f\\?*:\";|/]+$' "test:2.pdf" +"^(\d{1,3}'(\d{3}')*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" G "<0>1'235.140" +"^(\d{1,3}'(\d{3}')*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" G "<0>1'222'333.120" +"^(\d{1,3}'(\d{3}')*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" G "<0>456" +"^(\d{1,3}'(\d{3}')*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" "1234.500" +"^(\d{1,3}'(\d{3}')*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" "78'45.123" +"^(\d{1,3}'(\d{3}')*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" "123,0012" +"^[a-zA-Z][0-9][a-zA-Z]\s?[0-9][a-zA-Z][0-9]$" G "<0>T2p 3c7" +"^[a-zA-Z][0-9][a-zA-Z]\s?[0-9][a-zA-Z][0-9]$" G "<0>T3P3c7" +"^[a-zA-Z][0-9][a-zA-Z]\s?[0-9][a-zA-Z][0-9]$" G "<0>T2P 3C7" +"^[a-zA-Z][0-9][a-zA-Z]\s?[0-9][a-zA-Z][0-9]$" "123456" +"^[a-zA-Z][0-9][a-zA-Z]\s?[0-9][a-zA-Z][0-9]$" "3C7T2P" +"^[a-zA-Z][0-9][a-zA-Z]\s?[0-9][a-zA-Z][0-9]$" "11T21RWW" +"^\$[0-9]+(\.[0-9][0-9])?$" G "<0>$1.50" +"^\$[0-9]+(\.[0-9][0-9])?$" G "<0>$49" +"^\$[0-9]+(\.[0-9][0-9])?$" G "<0>$0.50" +"^\$[0-9]+(\.[0-9][0-9])?$" "1.5" +"^\$[0-9]+(\.[0-9][0-9])?$" "$1.333" +"^\$[0-9]+(\.[0-9][0-9])?$" "this $5.12 fails" +"\b((25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\b" G "<0>217.6.9.89" +"\b((25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\b" G "<0>0.0.0.0" +"\b((25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\b" G 
"<0>255.255.255.255" +"\b((25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\b" "256.0.0.0" +"\b((25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\b" "0978.3.3.3" +"\b((25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\.){3}(25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)\b" "65.4t.54.3" +"((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)" G "<0>http://www.aspemporium.com" +"((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)" G "<0>mailto:dominionx@hotmail.com" +"((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)" G "<0>ftp://ftp.test.com" +"((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)" "www.aspemporium.com" +"((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)" "dominionx@hotmail.com" +"((mailto\:|(news|(ht|f)tp(s?))\://){1}\S+)" "bloggs" +"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" G "<0>(12) 123 1234" +"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" G "<0>(01512) 123 1234" +"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" G "<0>(0xx12) 1234 1234" +"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" "12 123 1234" +"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" "(012) 123/1234" +"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" "(012) 123 12345" +"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob-smith@foo.com" +"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob.smith@foo.com" +"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob_smith@foo.com" +"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" "-smith@foo.com" +"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" ".smith@foo.com" +"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" "smith@foo_com" +"^(?=.*\d).{4,8}$" G "<0>1234" +"^(?=.*\d).{4,8}$" G "<0>asdf1234" +"^(?=.*\d).{4,8}$" G "<0>asp123" +"^(?=.*\d).{4,8}$" "asdf" +"^(?=.*\d).{4,8}$" "asdf12345" +"^(?=.*\d).{4,8}$" "password" +"[^A-Za-z0-9_@\.]|@{2,}|\.{5,}" G "<0>user name" +"[^A-Za-z0-9_@\.]|@{2,}|\.{5,}" G "<0>user#name" +"[^A-Za-z0-9_@\.]|@{2,}|\.{5,}" G "<0>....." +"[^A-Za-z0-9_@\.]|@{2,}|\.{5,}" "User_Name1" +"[^A-Za-z0-9_@\.]|@{2,}|\.{5,}" "username@foo.com" +"[^A-Za-z0-9_@\.]|@{2,}|\.{5,}" "user.name@mail.foo.com" +"^100$|^[0-9]{1,2}$|^[0-9]{1,2}\,[0-9]{1,3}$" G "<0>12,654" +"^100$|^[0-9]{1,2}$|^[0-9]{1,2}\,[0-9]{1,3}$" G "<0>1,987" +"^100$|^[0-9]{1,2}$|^[0-9]{1,2}\,[0-9]{1,3}$" "128,2" +"^100$|^[0-9]{1,2}$|^[0-9]{1,2}\,[0-9]{1,3}$" "12," +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*[^\.\,\)\(\s]$" G "<0>https://www.restrictd.com/~myhome/" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*[^\.\,\)\(\s]$" "http://www.krumedia.com." 
+"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*[^\.\,\)\(\s]$" "(http://www.krumedia.com)" +"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*[^\.\,\)\(\s]$" "http://www.krumedia.com," +"(\d{1,3},(\d{3},)*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" G "<0>2&651.50" +"(\d{1,3},(\d{3},)*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" G "<0>987.895" +"(\d{1,3},(\d{3},)*\d{3}(\.\d{1,3})?|\d{1,3}(\.\d{3})?)$" "25$%787*" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9]?)?$" G "<0>$1,456,983.00" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9]?)?$" G "<0>$1,700.07" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9]?)?$" G "<0>$68,944.23" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9]?)?$" "$20,86.93" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9]?)?$" "$1098.84" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9]?)?$" "$150." +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9])?$" G "<0>$28,009,987.88" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9])?$" G "<0>$23,099.05" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9])?$" G "<0>$.88" +"\$[0-9]?[0-9]?[0-9]?((\,[0-9][0-9][0-9])*)?(\.[0-9][0-9])?$" "$234,5.99" +"^((((31\/(0?[13578]|1[02]))|((29|30)\/(0?[1,3-9]|1[0-2])))\/(1[6-9]|[2-9]\d)?\d{2})|(29\/0?2\/(((1[6-9]|[2-9]\d)?(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00))))|(0?[1-9]|1\d|2[0-8])\/((0?[1-9])|(1[0-2]))\/((1[6-9]|[2-9]\d)?\d{2})) (20|21|22|23|[0-1]?\d):[0-5]?\d:[0-5]?\d$" G "<0>29/02/2004 20:15:27" +"^((((31\/(0?[13578]|1[02]))|((29|30)\/(0?[1,3-9]|1[0-2])))\/(1[6-9]|[2-9]\d)?\d{2})|(29\/0?2\/(((1[6-9]|[2-9]\d)?(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00))))|(0?[1-9]|1\d|2[0-8])\/((0?[1-9])|(1[0-2]))\/((1[6-9]|[2-9]\d)?\d{2})) (20|21|22|23|[0-1]?\d):[0-5]?\d:[0-5]?\d$" G "<0>29/2/04 8:9:5" +"^((((31\/(0?[13578]|1[02]))|((29|30)\/(0?[1,3-9]|1[0-2])))\/(1[6-9]|[2-9]\d)?\d{2})|(29\/0?2\/(((1[6-9]|[2-9]\d)?(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00))))|(0?[1-9]|1\d|2[0-8])\/((0?[1-9])|(1[0-2]))\/((1[6-9]|[2-9]\d)?\d{2})) (20|21|22|23|[0-1]?\d):[0-5]?\d:[0-5]?\d$" G "<0>31/3/2004 9:20:17" +"^((((31\/(0?[13578]|1[02]))|((29|30)\/(0?[1,3-9]|1[0-2])))\/(1[6-9]|[2-9]\d)?\d{2})|(29\/0?2\/(((1[6-9]|[2-9]\d)?(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00))))|(0?[1-9]|1\d|2[0-8])\/((0?[1-9])|(1[0-2]))\/((1[6-9]|[2-9]\d)?\d{2})) (20|21|22|23|[0-1]?\d):[0-5]?\d:[0-5]?\d$" "29/02/2003 20:15:15" +"^((((31\/(0?[13578]|1[02]))|((29|30)\/(0?[1,3-9]|1[0-2])))\/(1[6-9]|[2-9]\d)?\d{2})|(29\/0?2\/(((1[6-9]|[2-9]\d)?(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00))))|(0?[1-9]|1\d|2[0-8])\/((0?[1-9])|(1[0-2]))\/((1[6-9]|[2-9]\d)?\d{2})) (20|21|22|23|[0-1]?\d):[0-5]?\d:[0-5]?\d$" "2/29/04 20:15:15" +"^((((31\/(0?[13578]|1[02]))|((29|30)\/(0?[1,3-9]|1[0-2])))\/(1[6-9]|[2-9]\d)?\d{2})|(29\/0?2\/(((1[6-9]|[2-9]\d)?(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00))))|(0?[1-9]|1\d|2[0-8])\/((0?[1-9])|(1[0-2]))\/((1[6-9]|[2-9]\d)?\d{2})) (20|21|22|23|[0-1]?\d):[0-5]?\d:[0-5]?\d$" "31/3/4 9:20:17" +"^([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$" G "<0>something@someserver.com" +"^([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$" G "<0>firstname.lastname@mailserver.domain.com" +"^([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$" G "<0>username-something@some-server.nl" 
+"^([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$" "username@someserver.domain.c" +"^([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$" "somename@server.domain-com" +"^([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$" "someone@something.se_eo" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)|(^([0-9]|[1][0-9]|[2][0-3])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)" G "<0>8am" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)|(^([0-9]|[1][0-9]|[2][0-3])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)" G "<0>8 am" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)|(^([0-9]|[1][0-9]|[2][0-3])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)" G "<0>8:00 am" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)|(^([0-9]|[1][0-9]|[2][0-3])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)" "8a" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)|(^([0-9]|[1][0-9]|[2][0-3])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)" "8 a" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)|(^([0-9]|[1][0-9]|[2][0-3])(\s{0,1})(AM|PM|am|pm|aM|Am|pM|Pm{2,2})$)" "8:00 a" +"^([0-9]{2})?(\([0-9]{2})\)([0-9]{3}|[0-9]{4})-[0-9]{4}$" G "<0>55(21)123-4567" +"^([0-9]{2})?(\([0-9]{2})\)([0-9]{3}|[0-9]{4})-[0-9]{4}$" G "<0>(11)1234-5678" +"^([0-9]{2})?(\([0-9]{2})\)([0-9]{3}|[0-9]{4})-[0-9]{4}$" G "<0>55(71)4562-2234" +"^([0-9]{2})?(\([0-9]{2})\)([0-9]{3}|[0-9]{4})-[0-9]{4}$" "3434-3432" +"^([0-9]{2})?(\([0-9]{2})\)([0-9]{3}|[0-9]{4})-[0-9]{4}$" "4(23)232-3232" +"^([0-9]{2})?(\([0-9]{2})\)([0-9]{3}|[0-9]{4})-[0-9]{4}$" "55(2)232-232" +"^((([0]?[1-9]|1[0-2])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?))$" G "<0>1:01 AM" +"^((([0]?[1-9]|1[0-2])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?))$" G "<0>23:52:01" +"^((([0]?[1-9]|1[0-2])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?))$" G "<0>03.24.36 AM" +"^((([0]?[1-9]|1[0-2])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?))$" "19:31 AM" +"^((([0]?[1-9]|1[0-2])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?))$" "9:9 PM" +"^((([0]?[1-9]|1[0-2])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?))$" "25:60:61" +"^\d{0,2}(\.\d{1,2})?$" G "<0>99.99" +"^\d{0,2}(\.\d{1,2})?$" G "<0>99" +"^\d{0,2}(\.\d{1,2})?$" G "<0>.99" +"^\d{0,2}(\.\d{1,2})?$" "999.999" +"^\d{0,2}(\.\d{1,2})?$" "999" +"^\d{0,2}(\.\d{1,2})?$" ".999" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z])(?!.*\s).{4,8}$" G "<0>1agdA*$#" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z])(?!.*\s).{4,8}$" G "<0>1agdA*$#" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z])(?!.*\s).{4,8}$" G "<0>1agdA*$#" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z])(?!.*\s).{4,8}$" "wyrn%@*&$# f" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z])(?!.*\s).{4,8}$" "mbndkfh782" +"^(?=.*\d)(?=.*[a-z])(?=.*[A-Z])(?!.*\s).{4,8}$" "BNfhjdhfjd&*)%#$)" +"^([a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-Z0-9]\.)+([a-zA-Z0-9]{3,5})$" G "<0>freshmeat.net" +"^([a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-Z0-9]\.)+([a-zA-Z0-9]{3,5})$" G "<0>123.com" 
+"^([a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-Z0-9]\.)+([a-zA-Z0-9]{3,5})$" G "<0>TempLate-toolkKt.orG" +"^([a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-Z0-9]\.)+([a-zA-Z0-9]{3,5})$" "-dog.com" +"^([a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-Z0-9]\.)+([a-zA-Z0-9]{3,5})$" "?boy.net" +"^([a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-Z0-9]\.)+([a-zA-Z0-9]{3,5})$" "this.domain" +"^[^']*$" G "<0>asljas" +"^[^']*$" G "<0>%/&89uhuhadjkh" +"^[^']*$" G '<0>"hi there!"' +"^[^']*$" "'hi there!'" +"^[^']*$" "It's 9 o'clock" +"^[^']*$" "'''''" +"(^\(\)$|^\(((\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\),)*(\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\)){1}\)))$" G "<0>((24,((1,2,3),(3,4,5))))" +"(^\(\)$|^\(((\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\),)*(\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\)){1}\)))$" G "<0>((1,((2,3,4),(4,5,6),(96,34,26))),(12,((1,3,4),(4,5,6),(7,8,9))))" +"(^\(\)$|^\(((\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\),)*(\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\)){1}\)))$" G "<0>()" +"(^\(\)$|^\(((\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\),)*(\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\)){1}\)))$" "(24,((1,2,3),(3,4,5)))" +"(^\(\)$|^\(((\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\),)*(\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\)){1}\)))$" "( )" +"(^\(\)$|^\(((\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\),)*(\([0-9]+,(\((\([0-9]+,[0-9]+,[0-9]+\),)*(\([0-9]+,[0-9]+,[0-9]+\)){1}\))+\)){1}\)))$" "((23,(12,3,4),(4,5,6)))" +"^[a-zA-Z0-9\s .\-_']+$" G "<0>dony d'gsa" +"^[a-zA-Z0-9\s .\-_']+$" "^[a-zA-Z0-9\s.\-_']+$" +"^[_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.(([0-9]{1,3})|([a-zA-Z]{2,3})|(aero|coop|info|museum|name))$" G "<0>example@example.com" +"^[_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.(([0-9]{1,3})|([a-zA-Z]{2,3})|(aero|coop|info|museum|name))$" G "<0>foo@bar.info" +"^[_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.(([0-9]{1,3})|([a-zA-Z]{2,3})|(aero|coop|info|museum|name))$" G "<0>blah@127.0.0.1" +"^[_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.(([0-9]{1,3})|([a-zA-Z]{2,3})|(aero|coop|info|museum|name))$" "broken@@example.com" +"^[_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.(([0-9]{1,3})|([a-zA-Z]{2,3})|(aero|coop|info|museum|name))$" "foo@bar.infp" +"^[_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.(([0-9]{1,3})|([a-zA-Z]{2,3})|(aero|coop|info|museum|name))$" "blah@.nospam.biz" +"^\d{5}(-\d{3})?$" G "<0>13165-000" +"^\d{5}(-\d{3})?$" G "<0>38175-000" +"^\d{5}(-\d{3})?$" G "<0>81470-276" +"^\d{5}(-\d{3})?$" "13165-00" +"^\d{5}(-\d{3})?$" "38175-abc" +"^\d{5}(-\d{3})?$" "81470-2763" +"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" G "<0>$0.84" +"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" G "<0>$123458" +"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" G "<0>$1,234,567.89" +"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" "$12,3456.01" +"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" "12345" +"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" "$1.234" +"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" G "<0>C:\\temp\\this allows spaces\\web.config" +"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" G "<0>\\\\Andromeda\\share\\file name.123" 
+"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" "tz:\temp\ fi*le?na:m.doc" +"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" "\\Andromeda\share\filename.a" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])$)|(^([0-9]|[1][0-9]|[2][0-3])$)" G "<0>10:35" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])$)|(^([0-9]|[1][0-9]|[2][0-3])$)" G "<0>9:20" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])$)|(^([0-9]|[1][0-9]|[2][0-3])$)" G "<0>23" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])$)|(^([0-9]|[1][0-9]|[2][0-3])$)" "24:00" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])$)|(^([0-9]|[1][0-9]|[2][0-3])$)" "20 PM" +"(^([0-9]|[0-1][0-9]|[2][0-3]):([0-5][0-9])$)|(^([0-9]|[1][0-9]|[2][0-3])$)" "20:15 PM" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(\.[0-9][0-9])?$" G "<0>$3,023,123.34" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(\.[0-9][0-9])?$" G "<0>9,876,453" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(\.[0-9][0-9])?$" G "<0>123456.78" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(\.[0-9][0-9])?$" "4,33,234.34" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(\.[0-9][0-9])?$" "$1.234" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(\.[0-9][0-9])?$" "abc" +"^\$?\d+(\.(\d{2}))?$" G "<0>$2.43" +"^\$?\d+(\.(\d{2}))?$" G "<0>2.02" +"^\$?\d+(\.(\d{2}))?$" G "<0>$2112" +"^\$?\d+(\.(\d{2}))?$" "2.1" +"^\$?\d+(\.(\d{2}))?$" "$.14" +"^\$?\d+(\.(\d{2}))?$" "$2,222.12" +/("[^"]*")|('[^\r]*)(\r\n)?/ G '<0>"my string"' +/("[^"]*")|('[^\r]*)(\r\n)?/ G '<0>"a string with \u0027 in it"' +/("[^"]*")|('[^\r]*)(\r\n)?/ G "<0>' comment" +/("[^"]*")|('[^\r]*)(\r\n)?/ /asd "/ +"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" G "<0>BFDB4D31-3E35-4DAB-AFCA-5E6E5C8F61EA" +"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" G "<0>BFDB4d31-3e35-4dab-afca-5e6e5c8f61ea" +"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" "qqqBFDB4D31-3E35-4DAB-AFCA-5E6E5C8F61EA" +"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" "BFDB4D31-3E-4DAB-AFCA-5E6E5C8F61EA" +"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" "BFDB4D31-3E35-4DAB-AF" +"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>12.345-678" +"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>23.345-123" +"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>99.999" +"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "41222-222" +"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "3.444-233" +"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "43.324444" +"^\d{2}(\u002e)(\d{3})(-\d{3})?$" G "<0>12.345-678" +"^\d{2}(\u002e)(\d{3})(-\d{3})?$" G "<0>23.345-123" +"^\d{2}(\u002e)(\d{3})(-\d{3})?$" G "<0>99.999" +"^\d{2}(\u002e)(\d{3})(-\d{3})?$" "41222-222" +"^\d{2}(\u002e)(\d{3})(-\d{3})?$" "3.444-233" +"^\d{2}(\u002e)(\d{3})(-\d{3})?$" "43.324444" +#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>c:\file.txt" # TODO: debug +#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>c:\folder\sub folder\file.txt" # TODO: debug +#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>\\network\folder\file.txt" # TODO: debug +"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "C:" +"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "C:\file.xls" +"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "folder.txt" +"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>my.domain.com" +"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>regexlib.com" 
+"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>big-reg.com" +"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" ".mydomain.com" +"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" "regexlib.comm" +"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" "-bigreg.com" +"^\d{4}[\-\/\s]?((((0[13578])|(1[02]))[\-\/\s]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\-\/\s]?(([0-2][0-9])|(30)))|(02[\-\/\s]?[0-2][0-9]))$" G "<0>0001-12-31" +"^\d{4}[\-\/\s ]?((((0[13578])|(1[02]))[\-\/\s ]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\-\/\s ]?(([0-2][0-9])|(30)))|(02[\-\/\s ]?[0-2][0-9]))$" G "<0>9999 09 30" +"^\d{4}[\-\/\s]?((((0[13578])|(1[02]))[\-\/\s]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\-\/\s]?(([0-2][0-9])|(30)))|(02[\-\/\s]?[0-2][0-9]))$" G "<0>2002/03/03" +"^\d{4}[\-\/\s]?((((0[13578])|(1[02]))[\-\/\s]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\-\/\s]?(([0-2][0-9])|(30)))|(02[\-\/\s]?[0-2][0-9]))$" "0001\\02\\30" +"^\d{4}[\-\/\s]?((((0[13578])|(1[02]))[\-\/\s]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\-\/\s]?(([0-2][0-9])|(30)))|(02[\-\/\s]?[0-2][0-9]))$" "9999.15.01" +"^\d{4}[\-\/\s]?((((0[13578])|(1[02]))[\-\/\s]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\-\/\s]?(([0-2][0-9])|(30)))|(02[\-\/\s]?[0-2][0-9]))$" "2002/3/3" +"^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$" G "<0>http://psychopop.org" +"^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$" G "<0>http://www.edsroom.com/newUser.asp" +"^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$" G "<0>http://unpleasant.jarrin.net/markov/inde" +"^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$" "ftp://psychopop.org" +"^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$" "http://www.edsroom/" +"^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$" "http://un/pleasant.jarrin.net/markov/index.asp" +"^( [1-9]|[1-9]|0[1-9]|10|11|12)[0-5]\d$" G "<0>1145" +"^( [1-9]|[1-9]|0[1-9]|10|11|12)[0-5]\d$" G "<0>933" +"^( [1-9]|[1-9]|0[1-9]|10|11|12)[0-5]\d$" G "<0> 801" +"^( [1-9]|[1-9]|0[1-9]|10|11|12)[0-5]\d$" "0000" +"^( [1-9]|[1-9]|0[1-9]|10|11|12)[0-5]\d$" "1330" +"^( [1-9]|[1-9]|0[1-9]|10|11|12)[0-5]\d$" "8:30" +"^\d{1,2}\/\d{2,4}$" G "<0>9/02" +"^\d{1,2}\/\d{2,4}$" G "<0>09/2002" +"^\d{1,2}\/\d{2,4}$" G "<0>09/02" +"^\d{1,2}\/\d{2,4}$" "Fall 2002" +"^\d{1,2}\/\d{2,4}$" "Sept 2002" +"^(|(0[1-9])|(1[0-2]))\/((0[1-9])|(1\d)|(2\d)|(3[0-1]))\/((\d{4}))$" G "<0>01/01/2001" +"^(|(0[1-9])|(1[0-2]))\/((0[1-9])|(1\d)|(2\d)|(3[0-1]))\/((\d{4}))$" G "<0>02/30/2001" +"^(|(0[1-9])|(1[0-2]))\/((0[1-9])|(1\d)|(2\d)|(3[0-1]))\/((\d{4}))$" G "<0>12/31/2002" +"^(|(0[1-9])|(1[0-2]))\/((0[1-9])|(1\d)|(2\d)|(3[0-1]))\/((\d{4}))$" "1/1/02" +"^(|(0[1-9])|(1[0-2]))\/((0[1-9])|(1\d)|(2\d)|(3[0-1]))\/((\d{4}))$" "1/1/2002" +"^(|(0[1-9])|(1[0-2]))\/((0[1-9])|(1\d)|(2\d)|(3[0-1]))\/((\d{4}))$" "1/25/2002" +"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" G "<0>15615552323" +"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" G "<0>1-561-555-1212" +"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" G "<0>5613333" +"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" "1-555-5555" +"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" "15553333" +"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" "0-561-555-1212" +'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' G '<0>' +'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' G '<0>" # TODO: \w in pattern +'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' '' # TODO: \w in pattern +'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' "The dirty brown fox stank like" 
+"^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$" G "<0>1:00 AM" +"^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$" G "<0>12:00 PM" +"^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$" G "<0>1:00am" +"^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$" "24:00" +"^\d*$" G "<0>123" +"^\d*$" G "<0>000" +"^\d*$" G "<0>43" +"^\d*$" "asbc" +"^\d*$" "-34" +"^\d*$" "3.1415" +"^[-+]?\d*$" G "<0>123" +"^[-+]?\d*$" G "<0>-123" +"^[-+]?\d*$" G "<0>+123" +"^[-+]?\d*$" "abc" +"^[-+]?\d*$" "3.14159" +"^[-+]?\d*$" "-3.14159" +"^\d*\.?\d*$" G "<0>123" +"^\d*\.?\d*$" G "<0>3.14159" +"^\d*\.?\d*$" G "<0>.234" +"^\d*\.?\d*$" "abc" +"^\d*\.?\d*$" "-3.14159" +"^\d*\.?\d*$" "3.4.2" +"^((\d{5}-\d{4})|(\d{5})|([A-Z]\d[A-Z]\s\d[A-Z]\d))$" G "<0>44240" +"^((\d{5}-\d{4})|(\d{5})|([A-Z]\d[A-Z]\s\d[A-Z]\d))$" G "<0>44240-5555" +"^((\d{5}-\d{4})|(\d{5})|([A-Z]\d[A-Z]\s\d[A-Z]\d))$" G "<0>T2P 3C7" +"^((\d{5}-\d{4})|(\d{5})|([A-Z]\d[A-Z]\s\d[A-Z]\d))$" "44240ddd" +"^((\d{5}-\d{4})|(\d{5})|([A-Z]\d[A-Z]\s\d[A-Z]\d))$" "t44240-55" +"^((\d{5}-\d{4})|(\d{5})|([A-Z]\d[A-Z]\s\d[A-Z]\d))$" "t2p3c7" +"^[\\(]{0,1}([0-9]){3}[\\)]{0,1}[ ]?([^0-1]){1}([0-9]){2}[ ]?[-]?[ ]?([0-9]){4}[ ]*((x){0,1}([0-9]){1,5}){0,1}$" G "<0>(910)456-7890" +"^[\\(]{0,1}([0-9]){3}[\\)]{0,1}[ ]?([^0-1]){1}([0-9]){2}[ ]?[-]?[ ]?([0-9]){4}[ ]*((x){0,1}([0-9]){1,5}){0,1}$" G "<0>(910)456-8970 x12" +"^[\\(]{0,1}([0-9]){3}[\\)]{0,1}[ ]?([^0-1]){1}([0-9]){2}[ ]?[-]?[ ]?([0-9]){4}[ ]*((x){0,1}([0-9]){1,5}){0,1}$" G "<0>(910)456-8970 1211" +"^[\\(]{0,1}([0-9]){3}[\\)]{0,1}[ ]?([^0-1]){1}([0-9]){2}[ ]?[-]?[ ]?([0-9]){4}[ ]*((x){0,1}([0-9]){1,5}){0,1}$" "(910) 156-7890" +"^[\\(]{0,1}([0-9]){3}[\\)]{0,1}[ ]?([^0-1]){1}([0-9]){2}[ ]?[-]?[ ]?([0-9]){4}[ ]*((x){0,1}([0-9]){1,5}){0,1}$" "(910) 056-7890" +"^[\\(]{0,1}([0-9]){3}[\\)]{0,1}[ ]?([^0-1]){1}([0-9]){2}[ ]?[-]?[ ]?([0-9]){4}[ ]*((x){0,1}([0-9]){1,5}){0,1}$" "(910) 556-7890 x" +"^((0?[1-9]|[12][1-9]|3[01])\.(0?[13578]|1[02])\.20[0-9]{2}|(0?[1-9]|[12][1-9]|30)\.(0?[13456789]|1[012])\.20[0-9]{2}|(0?[1-9]|1[1-9]|2[0-8])\.(0?[123456789]|1[012])\.20[0-9]{2}|(0?[1-9]|[12][1-9])\.(0?[123456789]|1[012])\.20(00|04|08|12|16|20|24|28|32|36|40|44|48|52|56|60|64|68|72|76|80|84|88|92|96))$" G "<0>31.01.2002" +"^((0?[1-9]|[12][1-9]|3[01])\.(0?[13578]|1[02])\.20[0-9]{2}|(0?[1-9]|[12][1-9]|30)\.(0?[13456789]|1[012])\.20[0-9]{2}|(0?[1-9]|1[1-9]|2[0-8])\.(0?[123456789]|1[012])\.20[0-9]{2}|(0?[1-9]|[12][1-9])\.(0?[123456789]|1[012])\.20(00|04|08|12|16|20|24|28|32|36|40|44|48|52|56|60|64|68|72|76|80|84|88|92|96))$" G "<0>29.2.2004" +"^((0?[1-9]|[12][1-9]|3[01])\.(0?[13578]|1[02])\.20[0-9]{2}|(0?[1-9]|[12][1-9]|30)\.(0?[13456789]|1[012])\.20[0-9]{2}|(0?[1-9]|1[1-9]|2[0-8])\.(0?[123456789]|1[012])\.20[0-9]{2}|(0?[1-9]|[12][1-9])\.(0?[123456789]|1[012])\.20(00|04|08|12|16|20|24|28|32|36|40|44|48|52|56|60|64|68|72|76|80|84|88|92|96))$" G "<0>09.02.2005" +"^((0?[1-9]|[12][1-9]|3[01])\.(0?[13578]|1[02])\.20[0-9]{2}|(0?[1-9]|[12][1-9]|30)\.(0?[13456789]|1[012])\.20[0-9]{2}|(0?[1-9]|1[1-9]|2[0-8])\.(0?[123456789]|1[012])\.20[0-9]{2}|(0?[1-9]|[12][1-9])\.(0?[123456789]|1[012])\.20(00|04|08|12|16|20|24|28|32|36|40|44|48|52|56|60|64|68|72|76|80|84|88|92|96))$" "31.11.2002" 
+"^((0?[1-9]|[12][1-9]|3[01])\.(0?[13578]|1[02])\.20[0-9]{2}|(0?[1-9]|[12][1-9]|30)\.(0?[13456789]|1[012])\.20[0-9]{2}|(0?[1-9]|1[1-9]|2[0-8])\.(0?[123456789]|1[012])\.20[0-9]{2}|(0?[1-9]|[12][1-9])\.(0?[123456789]|1[012])\.20(00|04|08|12|16|20|24|28|32|36|40|44|48|52|56|60|64|68|72|76|80|84|88|92|96))$" "29.2.2002" +"^((0?[1-9]|[12][1-9]|3[01])\.(0?[13578]|1[02])\.20[0-9]{2}|(0?[1-9]|[12][1-9]|30)\.(0?[13456789]|1[012])\.20[0-9]{2}|(0?[1-9]|1[1-9]|2[0-8])\.(0?[123456789]|1[012])\.20[0-9]{2}|(0?[1-9]|[12][1-9])\.(0?[123456789]|1[012])\.20(00|04|08|12|16|20|24|28|32|36|40|44|48|52|56|60|64|68|72|76|80|84|88|92|96))$" "33.06.2000" +"^(0[1-9]|1[0-2])\/((0[1-9]|2\d)|3[0-1])\/(19\d\d|200[0-3])$" G "<0>12/31/2003" +"^(0[1-9]|1[0-2])\/((0[1-9]|2\d)|3[0-1])\/(19\d\d|200[0-3])$" G "<0>01/01/1900" +"^(0[1-9]|1[0-2])\/((0[1-9]|2\d)|3[0-1])\/(19\d\d|200[0-3])$" G "<0>11/31/2002" +"^(0[1-9]|1[0-2])\/((0[1-9]|2\d)|3[0-1])\/(19\d\d|200[0-3])$" "1/1/2002" +"^(0[1-9]|1[0-2])\/((0[1-9]|2\d)|3[0-1])\/(19\d\d|200[0-3])$" "01/01/02" +"^(0[1-9]|1[0-2])\/((0[1-9]|2\d)|3[0-1])\/(19\d\d|200[0-3])$" "01/01/2004" +"^((((([13578])|(1[0-2]))[\-\/\s]?(([1-9])|([1-2][0-9])|(3[01])))|((([469])|(11))[\-\/\s]?(([1-9])|([1-2][0-9])|(30)))|(2[\-\/\s]?(([1-9])|([1-2][0-9]))))[\-\/\s]?\d{4})(\s((([1-9])|(1[02]))\:([0-5][0-9])((\s)|(\:([0-5][0-9])\s))([AM|PM|am|pm]{2,2})))?$" G "<0>3/3/2003" +"^((((([13578])|(1[0-2]))[\-\/\s]?(([1-9])|([1-2][0-9])|(3[01])))|((([469])|(11))[\-\/\s]?(([1-9])|([1-2][0-9])|(30)))|(2[\-\/\s]?(([1-9])|([1-2][0-9]))))[\-\/\s]?\d{4})(\s((([1-9])|(1[02]))\:([0-5][0-9])((\s)|(\:([0-5][0-9])\s))([AM|PM|am|pm]{2,2})))?$" G "<0>3/3/2002 3:33 pm" +"^((((([13578])|(1[0-2]))[\-\/\s]?(([1-9])|([1-2][0-9])|(3[01])))|((([469])|(11))[\-\/\s]?(([1-9])|([1-2][0-9])|(30)))|(2[\-\/\s]?(([1-9])|([1-2][0-9]))))[\-\/\s]?\d{4})(\s((([1-9])|(1[02]))\:([0-5][0-9])((\s)|(\:([0-5][0-9])\s))([AM|PM|am|pm]{2,2})))?$" G "<0>3/3/2003 3:33:33 am" +"^((((([13578])|(1[0-2]))[\-\/\s]?(([1-9])|([1-2][0-9])|(3[01])))|((([469])|(11))[\-\/\s]?(([1-9])|([1-2][0-9])|(30)))|(2[\-\/\s]?(([1-9])|([1-2][0-9]))))[\-\/\s]?\d{4})(\s((([1-9])|(1[02]))\:([0-5][0-9])((\s)|(\:([0-5][0-9])\s))([AM|PM|am|pm]{2,2})))?$" "13/1/2002" +"^((((([13578])|(1[0-2]))[\-\/\s]?(([1-9])|([1-2][0-9])|(3[01])))|((([469])|(11))[\-\/\s]?(([1-9])|([1-2][0-9])|(30)))|(2[\-\/\s]?(([1-9])|([1-2][0-9]))))[\-\/\s]?\d{4})(\s((([1-9])|(1[02]))\:([0-5][0-9])((\s)|(\:([0-5][0-9])\s))([AM|PM|am|pm]{2,2})))?$" "3/3/2002 3:33" +"^((((([13578])|(1[0-2]))[\-\/\s]?(([1-9])|([1-2][0-9])|(3[01])))|((([469])|(11))[\-\/\s]?(([1-9])|([1-2][0-9])|(30)))|(2[\-\/\s]?(([1-9])|([1-2][0-9]))))[\-\/\s]?\d{4})(\s((([1-9])|(1[02]))\:([0-5][0-9])((\s)|(\:([0-5][0-9])\s))([AM|PM|am|pm]{2,2})))?$" "31/3/2002" +"([a-zA-Z]:(\\w+)*\\[a-zA-Z0_9]+)?.xls" G "<0>E:\DyAGT\SD01A_specV2.xls" +"([a-zA-Z]:(\\w+)*\\[a-zA-Z0_9]+)?.xls" "E:\DyAGT\SD01A_specV2.txt" +"(((0[13578]|10|12)([-./])(0[1-9]|[12][0-9]|3[01])([-./])(\d{4}))|((0[469]|11)([-./])([0][1-9]|[12][0-9]|30)([-./])(\d{4}))|((2)([-./])(0[1-9]|1[0-9]|2[0-8])([-./])(\d{4}))|((2)(\.|-|\/)(29)([-./])([02468][048]00))|((2)([-./])(29)([-./])([13579][26]00))|((2)([-./])(29)([-./])([0-9][0-9][0][48]))|((2)([-./])(29)([-./])([0-9][0-9][2468][048]))|((2)([-./])(29)([-./])([0-9][0-9][13579][26])))" G "<0>02/29/2084" 
+"(((0[13578]|10|12)([-./])(0[1-9]|[12][0-9]|3[01])([-./])(\d{4}))|((0[469]|11)([-./])([0][1-9]|[12][0-9]|30)([-./])(\d{4}))|((2)([-./])(0[1-9]|1[0-9]|2[0-8])([-./])(\d{4}))|((2)(\.|-|\/)(29)([-./])([02468][048]00))|((2)([-./])(29)([-./])([13579][26]00))|((2)([-./])(29)([-./])([0-9][0-9][0][48]))|((2)([-./])(29)([-./])([0-9][0-9][2468][048]))|((2)([-./])(29)([-./])([0-9][0-9][13579][26])))" G "<0>01/31/2000" +"(((0[13578]|10|12)([-./])(0[1-9]|[12][0-9]|3[01])([-./])(\d{4}))|((0[469]|11)([-./])([0][1-9]|[12][0-9]|30)([-./])(\d{4}))|((2)([-./])(0[1-9]|1[0-9]|2[0-8])([-./])(\d{4}))|((2)(\.|-|\/)(29)([-./])([02468][048]00))|((2)([-./])(29)([-./])([13579][26]00))|((2)([-./])(29)([-./])([0-9][0-9][0][48]))|((2)([-./])(29)([-./])([0-9][0-9][2468][048]))|((2)([-./])(29)([-./])([0-9][0-9][13579][26])))" G "<0>11/30/2000" +"(((0[13578]|10|12)([-./])(0[1-9]|[12][0-9]|3[01])([-./])(\d{4}))|((0[469]|11)([-./])([0][1-9]|[12][0-9]|30)([-./])(\d{4}))|((2)([-./])(0[1-9]|1[0-9]|2[0-8])([-./])(\d{4}))|((2)(\.|-|\/)(29)([-./])([02468][048]00))|((2)([-./])(29)([-./])([13579][26]00))|((2)([-./])(29)([-./])([0-9][0-9][0][48]))|((2)([-./])(29)([-./])([0-9][0-9][2468][048]))|((2)([-./])(29)([-./])([0-9][0-9][13579][26])))" "02/29/2083" +"(((0[13578]|10|12)([-./])(0[1-9]|[12][0-9]|3[01])([-./])(\d{4}))|((0[469]|11)([-./])([0][1-9]|[12][0-9]|30)([-./])(\d{4}))|((2)([-./])(0[1-9]|1[0-9]|2[0-8])([-./])(\d{4}))|((2)(\.|-|\/)(29)([-./])([02468][048]00))|((2)([-./])(29)([-./])([13579][26]00))|((2)([-./])(29)([-./])([0-9][0-9][0][48]))|((2)([-./])(29)([-./])([0-9][0-9][2468][048]))|((2)([-./])(29)([-./])([0-9][0-9][13579][26])))" "11/31/2000" +"(((0[13578]|10|12)([-./])(0[1-9]|[12][0-9]|3[01])([-./])(\d{4}))|((0[469]|11)([-./])([0][1-9]|[12][0-9]|30)([-./])(\d{4}))|((2)([-./])(0[1-9]|1[0-9]|2[0-8])([-./])(\d{4}))|((2)(\.|-|\/)(29)([-./])([02468][048]00))|((2)([-./])(29)([-./])([13579][26]00))|((2)([-./])(29)([-./])([0-9][0-9][0][48]))|((2)([-./])(29)([-./])([0-9][0-9][2468][048]))|((2)([-./])(29)([-./])([0-9][0-9][13579][26])))" "01/32/2000" +"^[a-zA-Z0-9\s .\-]+$" G "<0>2222 Mock St." # TODO: \s in patterns not implemented +"^[a-zA-Z0-9\s .\-]+$" G "<0>1 A St." 
+"^[a-zA-Z0-9\s .\-]+$" G "<0>555-1212" +"^[a-zA-Z0-9\s.\-]+$" "[A Street]" +"^[a-zA-Z0-9\s.\-]+$" "(3 A St.)" +"^[a-zA-Z0-9\s.\-]+$" "{34 C Ave.}" +"^[a-zA-Z0-9\s.\-]+$" "Last.*?(\d+.?\d*)" +"^[a-zA-Z0-9\s .\-]+$" G " Last1-(123)-123-1234" +"^([0-9]( |-)?)?(\(?[0-9]{3}\)?|[0-9]{3})( |-)?([0-9]{3}( |-)?[0-9]{4}|[a-zA-Z0-9]{7})$" G "<0>123 123 1234" +"^([0-9]( |-)?)?(\(?[0-9]{3}\)?|[0-9]{3})( |-)?([0-9]{3}( |-)?[0-9]{4}|[a-zA-Z0-9]{7})$" G "<0>1-800-ALPHNUM" +"^([0-9]( |-)?)?(\(?[0-9]{3}\)?|[0-9]{3})( |-)?([0-9]{3}( |-)?[0-9]{4}|[a-zA-Z0-9]{7})$" "1.123.123.1234" +"^([0-9]( |-)?)?(\(?[0-9]{3}\)?|[0-9]{3})( |-)?([0-9]{3}( |-)?[0-9]{4}|[a-zA-Z0-9]{7})$" "(123)-1234-123" +"^([0-9]( |-)?)?(\(?[0-9]{3}\)?|[0-9]{3})( |-)?([0-9]{3}( |-)?[0-9]{4}|[a-zA-Z0-9]{7})$" "123-1234" +"^([0-1][0-9]|[2][0-3]):([0-5][0-9])$" G "<0>02:04" +"^([0-1][0-9]|[2][0-3]):([0-5][0-9])$" G "<0>16:56" +"^([0-1][0-9]|[2][0-3]):([0-5][0-9])$" G "<0>23:59" +"^([0-1][0-9]|[2][0-3]):([0-5][0-9])$" "02:00 PM" +"^([0-1][0-9]|[2][0-3]):([0-5][0-9])$" "PM2:00" +"^([0-1][0-9]|[2][0-3]):([0-5][0-9])$" "24:00" +"^[0,1]?\d{1}\/(([0-2]?\d{1})|([3][0,1]{1}))\/(([1]{1}[9]{1}[9]{1}\d{1})|([2-9]{1}\d{3}))$" G "<0>01/01/1990" +"^[0,1]?\d{1}\/(([0-2]?\d{1})|([3][0,1]{1}))\/(([1]{1}[9]{1}[9]{1}\d{1})|([2-9]{1}\d{3}))$" G "<0>12/12/9999" +"^[0,1]?\d{1}\/(([0-2]?\d{1})|([3][0,1]{1}))\/(([1]{1}[9]{1}[9]{1}\d{1})|([2-9]{1}\d{3}))$" G "<0>3/28/2001" +"^[0,1]?\d{1}\/(([0-2]?\d{1})|([3][0,1]{1}))\/(([1]{1}[9]{1}[9]{1}\d{1})|([2-9]{1}\d{3}))$" "3-8-01" +"^[0,1]?\d{1}\/(([0-2]?\d{1})|([3][0,1]{1}))\/(([1]{1}[9]{1}[9]{1}\d{1})|([2-9]{1}\d{3}))$" "13/32/1001" +"^[0,1]?\d{1}\/(([0-2]?\d{1})|([3][0,1]{1}))\/(([1]{1}[9]{1}[9]{1}\d{1})|([2-9]{1}\d{3}))$" "03/32/1989" +"((\(\d{3}\)?)|(\d{3}))([\s \-./]?)(\d{3})([\s \-./]?)(\d{4})" G "<0>1.2123644567" +"((\(\d{3}\)?)|(\d{3}))([\s \-./]?)(\d{3})([\s \-./]?)(\d{4})" G "<0>0-234.567/8912" +"((\(\d{3}\)?)|(\d{3}))([\s \-./]?)(\d{3})([\s \-./]?)(\d{4})" G "<0>1-(212)-123 4567" +"((\(\d{3}\)?)|(\d{3}))([\s \-./]?)(\d{3})([\s \-./]?)(\d{4})" "0-212364345" +"((\(\d{3}\)?)|(\d{3}))([\s \-./]?)(\d{3})([\s \-./]?)(\d{4})" "1212-364,4321" +"((\(\d{3}\)?)|(\d{3}))([\s \-./]?)(\d{3})([\s \-./]?)(\d{4})" "0212\345/6789" +"^([0-9]{6}[\s \-]{1}[0-9]{12}|[0-9]{18})$" G "<0>000000 000000000000" +"^([0-9]{6}[\s \-]{1}[0-9]{12}|[0-9]{18})$" G "<0>000000-000000000000" +"^([0-9]{6}[\s \-]{1}[0-9]{12}|[0-9]{18})$" G "<0>000000000000000000" +"^([0-9]{6}[\s \-]{1}[0-9]{12}|[0-9]{18})$" "000000_000000000000" +"^(([1-9])|(0[1-9])|(1[0-2]))\/((0[1-9])|([1-31]))\/((\d{2})|(\d{4}))$" G "<0>01/01/2001" +"^(([1-9])|(0[1-9])|(1[0-2]))\/((0[1-9])|([1-31]))\/((\d{2})|(\d{4}))$" G "<0>1/1/2001" +"^(([1-9])|(0[1-9])|(1[0-2]))\/((0[1-9])|([1-31]))\/((\d{2})|(\d{4}))$" G "<0>01/1/01" +"^(([1-9])|(0[1-9])|(1[0-2]))\/((0[1-9])|([1-31]))\/((\d{2})|(\d{4}))$" "13/01/2001" +"^(([1-9])|(0[1-9])|(1[0-2]))\/((0[1-9])|([1-31]))\/((\d{2})|(\d{4}))$" "1/2/100" +"^(([1-9])|(0[1-9])|(1[0-2]))\/((0[1-9])|([1-31]))\/((\d{2})|(\d{4}))$" "09/32/2001" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(.[0-9][0-9])?$" G "<0>$3,023,123.34" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(.[0-9][0-9])?$" G "<0>9,876,453" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(.[0-9][0-9])?$" G "<0>123456.78" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(.[0-9][0-9])?$" "4,33,234.34" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(.[0-9][0-9])?$" "$1.234" +"^\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(.[0-9][0-9])?$" "abc" +"^\d{5}$|^\d{5}-\d{4}$" G "<0>55555-5555" 
+"^\d{5}$|^\d{5}-\d{4}$" G "<0>34564-3342" +"^\d{5}$|^\d{5}-\d{4}$" G "<0>90210" +"^\d{5}$|^\d{5}-\d{4}$" "434454444" +"^\d{5}$|^\d{5}-\d{4}$" "645-32-2345" +"^\d{5}$|^\d{5}-\d{4}$" "abc" +"^\d{3}-\d{2}-\d{4}$" G "<0>333-22-4444" +"^\d{3}-\d{2}-\d{4}$" G "<0>123-45-6789" +"^\d{3}-\d{2}-\d{4}$" "123456789" +"^\d{3}-\d{2}-\d{4}$" "SSN" +"^[2-9]\d{2}-\d{3}-\d{4}$" G "<0>800-555-5555" +"^[2-9]\d{2}-\d{3}-\d{4}$" G "<0>333-444-5555" +"^[2-9]\d{2}-\d{3}-\d{4}$" G "<0>212-666-1234" +"^[2-9]\d{2}-\d{3}-\d{4}$" "000-000-0000" +"^[2-9]\d{2}-\d{3}-\d{4}$" "123-456-7890" +"^[2-9]\d{2}-\d{3}-\d{4}$" "2126661234" +"^\d{5}-\d{4}|\d{5}|[A-Z]\d[A-Z] \d[A-Z]\d$" G "<0>44240" +"^\d{5}-\d{4}|\d{5}|[A-Z]\d[A-Z] \d[A-Z]\d$" G "<0>44240-5555" +"^\d{5}-\d{4}|\d{5}|[A-Z]\d[A-Z] \d[A-Z]\d$" G "<0>G3H 6A3" +"^\d{5}-\d{4}|\d{5}|[A-Z]\d[A-Z] \d[A-Z]\d$" "Ohio" +"^\d{5}-\d{4}|\d{5}|[A-Z]\d[A-Z] \d[A-Z]\d$" "abc" +"^\d{5}-\d{4}|\d{5}|[A-Z]\d[A-Z] \d[A-Z]\d$" "g3h6a3" +"[0-9]{4}\s*[a-zA-Z]{2}" G "<0>1054 WD" +"[0-9]{4}\s*[a-zA-Z]{2}" G "<0>1054WD" +"[0-9]{4}\s*[a-zA-Z]{2}" G "<0>1054 wd" +"[0-9]{4}\s*[a-zA-Z]{2}" "10543" +"(^1300\d{6}$)|(^1800|1900|1902\d{6}$)|(^0[2|3|7|8]{1}[0-9]{8}$)|(^13\d{4}$)|(^04\d{2,3}\d{6}$)" G "<0>0732105432" +"(^1300\d{6}$)|(^1800|1900|1902\d{6}$)|(^0[2|3|7|8]{1}[0-9]{8}$)|(^13\d{4}$)|(^04\d{2,3}\d{6}$)" G "<0>1300333444" +"(^1300\d{6}$)|(^1800|1900|1902\d{6}$)|(^0[2|3|7|8]{1}[0-9]{8}$)|(^13\d{4}$)|(^04\d{2,3}\d{6}$)" G "<0>131313" +"(^1300\d{6}$)|(^1800|1900|1902\d{6}$)|(^0[2|3|7|8]{1}[0-9]{8}$)|(^13\d{4}$)|(^04\d{2,3}\d{6}$)" "32105432" +"(^1300\d{6}$)|(^1800|1900|1902\d{6}$)|(^0[2|3|7|8]{1}[0-9]{8}$)|(^13\d{4}$)|(^04\d{2,3}\d{6}$)" "13000456" +"^((https?|ftp)\://((\[?(\d{1,3}\.){3}\d{1,3}\]?)|(([\-a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}))(\:\d+)?(/[\-a-zA-Z0-9._?,'+\&%$#=~\\]+)*/?)$" G "<0>http://207.68.172.254/home.ashx" +"^((https?|ftp)\://((\[?(\d{1,3}\.){3}\d{1,3}\]?)|(([\-a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}))(\:\d+)?(/[\-a-zA-Z0-9._?,'+\&%$#=~\\]+)*/?)$" G "<0>ftp://ftp.netscape.com/" +"^((https?|ftp)\://((\[?(\d{1,3}\.){3}\d{1,3}\]?)|(([\-a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}))(\:\d+)?(/[\-a-zA-Z0-9._?,'+\&%$#=~\\]+)*/?)$" G "<0>https://www.brinkster.com/login.asp" +"^((https?|ftp)\://((\[?(\d{1,3}\.){3}\d{1,3}\]?)|(([\-a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}))(\:\d+)?(/[\-a-zA-Z0-9._?,'+\&%$#=~\\]+)*/?)$" "htp://mistake.com/" +"^((https?|ftp)\://((\[?(\d{1,3}\.){3}\d{1,3}\]?)|(([\-a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}))(\:\d+)?(/[\-a-zA-Z0-9._?,'+\&%$#=~\\]+)*/?)$" "http://www_address.com/" +"^((https?|ftp)\://((\[?(\d{1,3}\.){3}\d{1,3}\]?)|(([\-a-zA-Z0-9]+\.)+[a-zA-Z]{2,4}))(\:\d+)?(/[\-a-zA-Z0-9._?,'+\&%$#=~\\]+)*/?)$" "ftp://www.files.com/file with spaces.txt" +"([0-9]{4})-([0-9]{1,2})-([0-9]{1,2})" G "<0>2002-11-03" +"([0-9]{4})-([0-9]{1,2})-([0-9]{1,2})" G "<0>2007-17-08" +"([0-9]{4})-([0-9]{1,2})-([0-9]{1,2})" G "<0>9999-99-99" +"([0-9]{4})-([0-9]{1,2})-([0-9]{1,2})" "2002/17/18" +"([0-9]{4})-([0-9]{1,2})-([0-9]{1,2})" "2002.18.45" +"([0-9]{4})-([0-9]{1,2})-([0-9]{1,2})" "18.45.2002" +"^\$?(\d{1,3}(\,\d{3})*|(\d+))(\.\d{0,2})?$" G "<0>$0,234.50" +"^\$?(\d{1,3}(\,\d{3})*|(\d+))(\.\d{0,2})?$" G "<0>0234.5" +"^\$?(\d{1,3}(\,\d{3})*|(\d+))(\.\d{0,2})?$" G "<0>0,234." 
+"^\$?(\d{1,3}(\,\d{3})*|(\d+))(\.\d{0,2})?$" "$1,23,50" +"^\$?(\d{1,3}(\,\d{3})*|(\d+))(\.\d{0,2})?$" "$123.123" +"(^\d{5}-\d{3}|^\d{2}.\d{3}-\d{3}|\d{8})" G "<0>12.345-678" +"(^\d{5}-\d{3}|^\d{2}.\d{3}-\d{3}|\d{8})" G "<0>12345-678" +"(^\d{5}-\d{3}|^\d{2}.\d{3}-\d{3}|\d{8})" G "<0>12345678" +"(^\d{5}-\d{3}|^\d{2}.\d{3}-\d{3}|\d{8})" "12.345678" +"(^\d{5}-\d{3}|^\d{2}.\d{3}-\d{3}|\d{8})" "12345-1" +"(^\d{5}-\d{3}|^\d{2}.\d{3}-\d{3}|\d{8})" "123" +'^([a-zA-Z]\:|\\)\\([^\\]+\\)*[^\/:*?"<>|]+\.htm(l)?$' G "<0>x:\\test\\testing.htm" +'^([a-zA-Z]\:|\\)\\([^\\]+\\)*[^\/:*?"<>|]+\.htm(l)?$' G "<0>x:\\test\\test#$ ing.html" +'^([a-zA-Z]\:|\\)\\([^\\]+\\)*[^\/:*?"<>|]+\.htm(l)?$' G "<0>\\\\test\testing.html" +'^([a-zA-Z]\:|\\)\\([^\\]+\\)*[^\/:*?"<>|]+\.htm(l)?$' "x:\test\test/ing.htm" +'^([a-zA-Z]\:|\\)\\([^\\]+\\)*[^\/:*?"<>|]+\.htm(l)?$' "x:\test\test*.htm" +'^([a-zA-Z]\:|\\)\\([^\\]+\\)*[^\/:*?"<>|]+\.htm(l)?$' "\\test?<.htm" +"^[1-9]{1}[0-9]{3}$" G "<0>1234" +"^[1-9]{1}[0-9]{3}$" "123" +"^[1-9]{1}[0-9]{3}$" "123A" +"^[A-Z]{1}( |-)?[1-9]{1}[0-9]{3}$" G "<0>A-1234" +"^[A-Z]{1}( |-)?[1-9]{1}[0-9]{3}$" G "<0>A 1234" +"^[A-Z]{1}( |-)?[1-9]{1}[0-9]{3}$" G "<0>A1234" +"^[A-Z]{1}( |-)?[1-9]{1}[0-9]{3}$" "AA-1234" +"^[A-Z]{1}( |-)?[1-9]{1}[0-9]{3}$" "A12345" +"^(F-)?[0-9]{5}$" G "<0>12345" +"^(F-)?[0-9]{5}$" G "<0>F-12345" +"^(F-)?[0-9]{5}$" "F12345" +"^(F-)?[0-9]{5}$" "F-123456" +"^(F-)?[0-9]{5}$" "123456" +"^(V-|I-)?[0-9]{4}$" G "<0>1234" +"^(V-|I-)?[0-9]{4}$" G "<0>V-1234" +"^(V-|I-)?[0-9]{4}$" "12345" +"^[1-9]{1}[0-9]{3} ?[A-Z]{2}$" G "<0>1234 AB" +"^[1-9]{1}[0-9]{3} ?[A-Z]{2}$" G "<0>1234AB" +"^[1-9]{1}[0-9]{3} ?[A-Z]{2}$" "123AB" +"^[1-9]{1}[0-9]{3} ?[A-Z]{2}$" "1234AAA" +"^([1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3}$" G "<0>12345" +"^([1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3}$" G "<0>10234" +"^([1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3}$" G "<0>01234" +"^([1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3}$" "00123" +"^(/w|/W|[^<>+?$%\{}\&])+$" G "<0>John Doe Sr." +"^(/w|/W|[^<>+?$%\{}\&])+$" G "<0>100 Elm St., Suite 25" +"^(/w|/W|[^<>+?$%\{}\&])+$" G "<0>Valerie's Gift Shop" +"^(/w|/W|[^<>+?$%\{}\&])+$" "

Hey

" +/<[a-zA-Z][^>]*\son\w+=(\w+|'[^']*'|"[^"]*")[^>]*>/ G '<0>' +/<[a-zA-Z][^>]*\son\w+=(\w+|'[^']*'|"[^"]*")[^>]*>/ '' +"(?!^0*$)(?!^0*\.0*$)^\d{1,5}(\.\d{1,3})?$" G "<0>1" +"(?!^0*$)(?!^0*\.0*$)^\d{1,5}(\.\d{1,3})?$" G "<0>12345.123" +"(?!^0*$)(?!^0*\.0*$)^\d{1,5}(\.\d{1,3})?$" G "<0>0.5" +"(?!^0*$)(?!^0*\.0*$)^\d{1,5}(\.\d{1,3})?$" "0" +"(?!^0*$)(?!^0*\.0*$)^\d{1,5}(\.\d{1,3})?$" "0.0" +"(?!^0*$)(?!^0*\.0*$)^\d{1,5}(\.\d{1,3})?$" "123456.1234" +"^.+@[^\.].*\.[a-z]{2,}$" G "<0>whatever@somewhere.museum" +"^.+@[^\.].*\.[a-z]{2,}$" G "<0>foreignchars@myforeigncharsdomain.nu" +"^.+@[^\.].*\.[a-z]{2,}$" G "<0>me+mysomething@mydomain.com" +"^.+@[^\.].*\.[a-z]{2,}$" "a@b.c" +"^.+@[^\.].*\.[a-z]{2,}$" "me@.my.com" +"^.+@[^\.].*\.[a-z]{2,}$" "a@b.comFOREIGNCHAR" +"^(\d{5}-\d{4}|\d{5})$" G "<0>12345" +"^(\d{5}-\d{4}|\d{5})$" G "<0>12345-1234" +"^(\d{5}-\d{4}|\d{5})$" "12345-12345" +"^(\d{5}-\d{4}|\d{5})$" "123" +"^(\d{5}-\d{4}|\d{5})$" "12345-abcd" +"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$" G "<0>0.0.0.0" +"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$" G "<0>255.255.255.02" +"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$" G "<0>192.168.0.136" +"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$" "256.1.3.4" +"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$" "023.44.33.22" +"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$" "10.57.98.23." +"]*[^/])>" G '<0>' +"]*[^/])>" '' +"" G "<0>" +"" G "<0>" +"" "this is a comment" +"" G "<0>" +"" G "<0>" +"" "this is a comment" +/<\u002f?(\w+)(\s+\w+=(\w+|"[^"]*"|'[^']*'))*>/ G "<0>" +/<\u002f?(\w+)(\s+\w+=(\w+|"[^"]*"|'[^']*'))*>/ G '<0>' +/<\u002f?(\w+)(\s+\w+=(\w+|"[^"]*"|'[^']*'))*>/ G "<0>" +/<\u002f?(\w+)(\s+\w+=(\w+|"[^"]*"|'[^']*'))*>/ "No Tag Here ..." +"(\{\\f\d*)\\([^;]+;)" G "<0>{\\f0\\Some Font names here;" +"(\{\\f\d*)\\([^;]+;)" G "<0>{\\f1\\fswiss\\fcharset0\\fprq2{\\*\\panose 020b0604020202020204}Arial;" +"(\{\\f\d*)\\([^;]+;)" G "{\\f" +"(\{\\f\d*)\\([^;]+;)" "{f0fs20 some text}" +#"" G '<0>space' # TODO: Can't quote this pattern with the test syntax! 
+#"" "this is not a tag" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" G "<0>12/30/2002" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" G "<0>01/12/1998 13:30" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" G "<0>01/28/2002 22:35:00" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" "13/30/2002" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" "01/12/1998 24:30" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" "01/28/2002 22:35:64" +#"((?(^[A-Z0-9-;=]*:))(?(.*)))" G "<0>BEGIN:" #named capture +#"((?(^[A-Z0-9-;=]*:))(?(.*)))" G "<0>TEL;WORK;VOICE:" #named capture +#"((?(^[A-Z0-9-;=]*:))(?(.*)))" G "<0>TEL:" #named capture +#"((?(^[A-Z0-9-;=]*:))(?(.*)))" "begin:" #named capture +#"((?(^[A-Z0-9-;=]*:))(?(.*)))" "TEL;PREF;" #named capture +'^]*)>(.*?(?=<\/a>))<\/a>$' G '<0>
my external link' +'^]*)>(.*?(?=<\/a>))<\/a>$' G ']*)>(.*?(?=<\/a>))<\/a>$' 'my internal link' +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0]\d|[1][0-2])(\:[0-5]\d){1,2})*\s*([aApP][mM]{0,2})?$" G "<0>12/31/2002" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0]\d|[1][0-2])(\:[0-5]\d){1,2})*\s*([aApP][mM]{0,2})?$" G "<0>12/31/2002 08:00" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0]\d|[1][0-2])(\:[0-5]\d){1,2})*\s*([aApP][mM]{0,2})?$" G "<0>12/31/2002 08:00 AM" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0]\d|[1][0-2])(\:[0-5]\d){1,2})*\s*([aApP][mM]{0,2})?$" "12/31/02" +"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0]\d|[1][0-2])(\:[0-5]\d){1,2})*\s*([aApP][mM]{0,2})?$" "12/31/2002 14:00" +"
(?:\s*([^<]+)
\s*)+
" G "<0>
string1
string2
string3
" +"
(?:\s*([^<]+)
\s*)+
" ".." +"^((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1})))$" G "<0>1/2/03" +"^((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1})))$" G "<0>2/30/1999" +"^((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1})))$" G "<0>03/04/19" +"^((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1})))$" "3/4/2020" +"^((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((19)([2-9])(\d{1})|(20)([01])(\d{1})|([8901])(\d{1})))$" "3/4/1919" +']*))*|/?>' G '<0>' +']*))*|/?>' G "<0>" +']*))*|/?>' G "<0>
" +']*))*|/?>' "this is a test..." +"^ *(1[0-2]|[1-9]):[0-5][0-9] *(a|p|A|P)(m|M) *$" G "<0>12:00am" +"^ *(1[0-2]|[1-9]):[0-5][0-9] *(a|p|A|P)(m|M) *$" G "<0>1:00 PM" +"^ *(1[0-2]|[1-9]):[0-5][0-9] *(a|p|A|P)(m|M) *$" G "<0> 12:59 pm" +"^ *(1[0-2]|[1-9]):[0-5][0-9] *(a|p|A|P)(m|M) *$" "0:00" +"^ *(1[0-2]|[1-9]):[0-5][0-9] *(a|p|A|P)(m|M) *$" "0:01 am" +"^ *(1[0-2]|[1-9]):[0-5][0-9] *(a|p|A|P)(m|M) *$" "13:00 pm" +"\({1}[0-9]{3}\){1}\-{1}[0-9]{3}\-{1}[0-9]{4}" G "<0>(111)-111-1111" +"\({1}[0-9]{3}\){1}\-{1}[0-9]{3}\-{1}[0-9]{4}" "11111111111" +"[^abc]" G "<0>def" +"[^abc]" "abc" +"^(([0]?[1-9]|[1][0-2])[\/|\-|\.]([0-2]\d|[3][0-1]|[1-9])[\/|\-|\.]([2][0])?\d{2}\s+((([0][0-9]|[1][0-2]|[0-9])[\:|\-|\.]([0-5]\d)\s*([aApP][mM])?)|(([0-1][0-9]|[2][0-3]|[0-9])[\:|\-|\.]([0-5]\d))))$" G "<0>01/01/2002 04:42" +"^(([0]?[1-9]|[1][0-2])[\/|\-|\.]([0-2]\d|[3][0-1]|[1-9])[\/|\-|\.]([2][0])?\d{2}\s+((([0][0-9]|[1][0-2]|[0-9])[\:|\-|\.]([0-5]\d)\s*([aApP][mM])?)|(([0-1][0-9]|[2][0-3]|[0-9])[\:|\-|\.]([0-5]\d))))$" G "<0>5-12-02 04:42 AM" +"^(([0]?[1-9]|[1][0-2])[\/|\-|\.]([0-2]\d|[3][0-1]|[1-9])[\/|\-|\.]([2][0])?\d{2}\s+((([0][0-9]|[1][0-2]|[0-9])[\:|\-|\.]([0-5]\d)\s*([aApP][mM])?)|(([0-1][0-9]|[2][0-3]|[0-9])[\:|\-|\.]([0-5]\d))))$" G "<0>01.01/02 04-42aM" +"^(([0]?[1-9]|[1][0-2])[\/|\-|\.]([0-2]\d|[3][0-1]|[1-9])[\/|\-|\.]([2][0])?\d{2}\s+((([0][0-9]|[1][0-2]|[0-9])[\:|\-|\.]([0-5]\d)\s*([aApP][mM])?)|(([0-1][0-9]|[2][0-3]|[0-9])[\:|\-|\.]([0-5]\d))))$" "01-12-1999 4:50PM" +"^(([0]?[1-9]|[1][0-2])[\/|\-|\.]([0-2]\d|[3][0-1]|[1-9])[\/|\-|\.]([2][0])?\d{2}\s+((([0][0-9]|[1][0-2]|[0-9])[\:|\-|\.]([0-5]\d)\s*([aApP][mM])?)|(([0-1][0-9]|[2][0-3]|[0-9])[\:|\-|\.]([0-5]\d))))$" "01-12-2002 15:10PM" +"^(([0]?[1-9]|[1][0-2])[\/|\-|\.]([0-2]\d|[3][0-1]|[1-9])[\/|\-|\.]([2][0])?\d{2}\s+((([0][0-9]|[1][0-2]|[0-9])[\:|\-|\.]([0-5]\d)\s*([aApP][mM])?)|(([0-1][0-9]|[2][0-3]|[0-9])[\:|\-|\.]([0-5]\d))))$" "01-12-002 8:20PM" +"^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$" G "<0>11-02-02" +"^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$" G "<0>1-25-2002" +"^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$" G "<0>01/25/2002" +"^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$" "13-02-02" +"^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$" "11.02.02" +"^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$" "11/32/2002" +"(([0-1][0-9])|([2][0-3])):([0-5][0-9]):([0-5][0-9])" G "<0>09:30:00" +"(([0-1][0-9])|([2][0-3])):([0-5][0-9]):([0-5][0-9])" G "<0>17:45:20" +"(([0-1][0-9])|([2][0-3])):([0-5][0-9]):([0-5][0-9])" G "<0>23:59:59" +"(([0-1][0-9])|([2][0-3])):([0-5][0-9]):([0-5][0-9])" "24:00:00" +"(((0[1-9]|[12][0-9]|3[01])([-./])(0[13578]|10|12)([-./])(\d{4}))|(([0][1-9]|[12][0-9]|30)([-./])(0[469]|11)([-./])(\d{4}))|((0[1-9]|1[0-9]|2[0-8])([-./])(02)([-./])(\d{4}))|((29)(\.|-|\/)(02)([-./])([02468][048]00))|((29)([-./])(02)([-./])([13579][26]00))|((29)([-./])(02)([-./])([0-9][0-9][0][48]))|((29)([-./])(02)([-./])([0-9][0-9][2468][048]))|((29)([-./])(02)([-./])([0-9][0-9][13579][26])))" G "<0>29/02/2000" 
+"(((0[1-9]|[12][0-9]|3[01])([-./])(0[13578]|10|12)([-./])(\d{4}))|(([0][1-9]|[12][0-9]|30)([-./])(0[469]|11)([-./])(\d{4}))|((0[1-9]|1[0-9]|2[0-8])([-./])(02)([-./])(\d{4}))|((29)(\.|-|\/)(02)([-./])([02468][048]00))|((29)([-./])(02)([-./])([13579][26]00))|((29)([-./])(02)([-./])([0-9][0-9][0][48]))|((29)([-./])(02)([-./])([0-9][0-9][2468][048]))|((29)([-./])(02)([-./])([0-9][0-9][13579][26])))" G "<0>31/01/2000" +"(((0[1-9]|[12][0-9]|3[01])([-./])(0[13578]|10|12)([-./])(\d{4}))|(([0][1-9]|[12][0-9]|30)([-./])(0[469]|11)([-./])(\d{4}))|((0[1-9]|1[0-9]|2[0-8])([-./])(02)([-./])(\d{4}))|((29)(\.|-|\/)(02)([-./])([02468][048]00))|((29)([-./])(02)([-./])([13579][26]00))|((29)([-./])(02)([-./])([0-9][0-9][0][48]))|((29)([-./])(02)([-./])([0-9][0-9][2468][048]))|((29)([-./])(02)([-./])([0-9][0-9][13579][26])))" G "<0>30-01-2000" +"(((0[1-9]|[12][0-9]|3[01])([-./])(0[13578]|10|12)([-./])(\d{4}))|(([0][1-9]|[12][0-9]|30)([-./])(0[469]|11)([-./])(\d{4}))|((0[1-9]|1[0-9]|2[0-8])([-./])(02)([-./])(\d{4}))|((29)(\.|-|\/)(02)([-./])([02468][048]00))|((29)([-./])(02)([-./])([13579][26]00))|((29)([-./])(02)([-./])([0-9][0-9][0][48]))|((29)([-./])(02)([-./])([0-9][0-9][2468][048]))|((29)([-./])(02)([-./])([0-9][0-9][13579][26])))" "29/02/2002" +"(((0[1-9]|[12][0-9]|3[01])([-./])(0[13578]|10|12)([-./])(\d{4}))|(([0][1-9]|[12][0-9]|30)([-./])(0[469]|11)([-./])(\d{4}))|((0[1-9]|1[0-9]|2[0-8])([-./])(02)([-./])(\d{4}))|((29)(\.|-|\/)(02)([-./])([02468][048]00))|((29)([-./])(02)([-./])([13579][26]00))|((29)([-./])(02)([-./])([0-9][0-9][0][48]))|((29)([-./])(02)([-./])([0-9][0-9][2468][048]))|((29)([-./])(02)([-./])([0-9][0-9][13579][26])))" "32/01/2002" +"(((0[1-9]|[12][0-9]|3[01])([-./])(0[13578]|10|12)([-./])(\d{4}))|(([0][1-9]|[12][0-9]|30)([-./])(0[469]|11)([-./])(\d{4}))|((0[1-9]|1[0-9]|2[0-8])([-./])(02)([-./])(\d{4}))|((29)(\.|-|\/)(02)([-./])([02468][048]00))|((29)([-./])(02)([-./])([13579][26]00))|((29)([-./])(02)([-./])([0-9][0-9][0][48]))|((29)([-./])(02)([-./])([0-9][0-9][2468][048]))|((29)([-./])(02)([-./])([0-9][0-9][13579][26])))" "10/2/2002" +"^0[1-6]{1}(([0-9]{2}){4})|((\s[0-9]{2}){4})|((-[0-9]{2}){4})$" G "<0>01 46 70 89 12" +"^0[1-6]{1}(([0-9]{2}){4})|((\s[0-9]{2}){4})|((-[0-9]{2}){4})$" G "<0>01-46-70-89-12" +"^0[1-6]{1}(([0-9]{2}){4})|((\s[0-9]{2}){4})|((-[0-9]{2}){4})$" G "<0>0146708912" +"^0[1-6]{1}(([0-9]{2}){4})|((\s[0-9]{2}){4})|((-[0-9]{2}){4})$" "01-46708912" +"^0[1-6]{1}(([0-9]{2}){4})|((\s[0-9]{2}){4})|((-[0-9]{2}){4})$" "01 46708912" +"^0[1-6]{1}(([0-9]{2}){4})|((\s[0-9]{2}){4})|((-[0-9]{2}){4})$" "+33235256677" +"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" G "<0>good.gif" +"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" G "<0>go d.GIf" +"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" G "<0>goo_d.jPg" +"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" "junk" +"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" "bad.bad.gif" +"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" "slash\gif." 
+"<[^>\s]*\bauthor\b[^>]*>" G '<0>' +"<[^>\s]*\bauthor\b[^>]*>" G "<0>" +# "<[^>\s]*\bauthor\b[^>]*>" G '<0>' #Debug should work +"<[^> ]*\bauthor\b[^>]*>" G "<0>" +"<[^> ]*\bauthor\b[^>]*>" G '<0>' +"<[^>\s]*\bauthor\b[^>]*>" "" +"<[^>\s]*\bauthor\b[^>]*>" "" +"<[^>\s]*\bauthor\b[^>]*>" "author" +"^(?:(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00)))(\/|-|\.)(?:0?2\1(?:29))$)|(?:(?:1[6-9]|[2-9]\d)?\d{2})(\/|-|\.)(?:(?:(?:0?[13578]|1[02])\2(?:31))|(?:(?:0?[1,3-9]|1[0-2])\2(29|30))|(?:(?:0?[1-9])|(?:1[0-2]))\2(?:0?[1-9]|1\d|2[0-8]))$" G "<0>04/2/29" +"^(?:(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00)))(\/|-|\.)(?:0?2\1(?:29))$)|(?:(?:1[6-9]|[2-9]\d)?\d{2})(\/|-|\.)(?:(?:(?:0?[13578]|1[02])\2(?:31))|(?:(?:0?[1,3-9]|1[0-2])\2(29|30))|(?:(?:0?[1-9])|(?:1[0-2]))\2(?:0?[1-9]|1\d|2[0-8]))$" G "<0>2002-4-30" +"^(?:(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00)))(\/|-|\.)(?:0?2\1(?:29))$)|(?:(?:1[6-9]|[2-9]\d)?\d{2})(\/|-|\.)(?:(?:(?:0?[13578]|1[02])\2(?:31))|(?:(?:0?[1,3-9]|1[0-2])\2(29|30))|(?:(?:0?[1-9])|(?:1[0-2]))\2(?:0?[1-9]|1\d|2[0-8]))$" G "<0>02.10.31" +"^(?:(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00)))(\/|-|\.)(?:0?2\1(?:29))$)|(?:(?:1[6-9]|[2-9]\d)?\d{2})(\/|-|\.)(?:(?:(?:0?[13578]|1[02])\2(?:31))|(?:(?:0?[1,3-9]|1[0-2])\2(29|30))|(?:(?:0?[1-9])|(?:1[0-2]))\2(?:0?[1-9]|1\d|2[0-8]))$" "2003/2/29" +"^(?:(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00)))(\/|-|\.)(?:0?2\1(?:29))$)|(?:(?:1[6-9]|[2-9]\d)?\d{2})(\/|-|\.)(?:(?:(?:0?[13578]|1[02])\2(?:31))|(?:(?:0?[1,3-9]|1[0-2])\2(29|30))|(?:(?:0?[1-9])|(?:1[0-2]))\2(?:0?[1-9]|1\d|2[0-8]))$" "02.4.31" +"^(?:(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00)))(\/|-|\.)(?:0?2\1(?:29))$)|(?:(?:1[6-9]|[2-9]\d)?\d{2})(\/|-|\.)(?:(?:(?:0?[13578]|1[02])\2(?:31))|(?:(?:0?[1,3-9]|1[0-2])\2(29|30))|(?:(?:0?[1-9])|(?:1[0-2]))\2(?:0?[1-9]|1\d|2[0-8]))$" "00/00/00" +'(\d*)\u0027*-*(\d*)/*(\d*)"' G '<0>5\u0027-3/16"' +'(\d*)\u0027*-*(\d*)/*(\d*)"' G '<0>1\u0027-2"' +'(\d*)\u0027*-*(\d*)/*(\d*)"' G '<0>5/16"' +'(\d*)\u0027*-*(\d*)/*(\d*)"' '1 3/16' +"^[1-9]{1}$|^[1-4]{1}[0-9]{1}$|^50$" G "<0>1" +"^[1-9]{1}$|^[1-4]{1}[0-9]{1}$|^50$" G "<0>23" +"^[1-9]{1}$|^[1-4]{1}[0-9]{1}$|^50$" G "<0>50" +"^[1-9]{1}$|^[1-4]{1}[0-9]{1}$|^50$" "0" +"^[1-9]{1}$|^[1-4]{1}[0-9]{1}$|^50$" "111" +"^[1-9]{1}$|^[1-4]{1}[0-9]{1}$|^50$" "xyz" +"^([ \u00c0-\u01ffa-zA-Z'])+$" G "<0>Jon Doe" +"^([ \u00c0-\u01ffa-zA-Z'])+$" G "<0>J\u00f8rn" +"^([ \u00c0-\u01ffa-zA-Z'])+$" G "<0>Mc'Neelan" +"^([ \u00c0-\u01ffa-zA-Z'])+$" "Henry); hacking attempt" +"^((([0]?[1-9]|1[0-2])(:|\.)(00|15|30|45)?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)(00|15|30|45)?))$" G "<0>1:00 PM" +"^((([0]?[1-9]|1[0-2])(:|\.)(00|15|30|45)?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)(00|15|30|45)?))$" G "<0>6:45 am" +"^((([0]?[1-9]|1[0-2])(:|\.)(00|15|30|45)?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)(00|15|30|45)?))$" G "<0>17:30" +"^((([0]?[1-9]|1[0-2])(:|\.)(00|15|30|45)?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)(00|15|30|45)?))$" "4:32 am" +"^((([0]?[1-9]|1[0-2])(:|\.)(00|15|30|45)?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)(00|15|30|45)?))$" "5:30:00 am" +"^((([0]?[1-9]|1[0-2])(:|\.)(00|15|30|45)?( 
)?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)(00|15|30|45)?))$" "17:01" +"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" G "<0>0.050" +"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" G "<0>5.0000" +"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" G "<0>5000" +"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" "0" +"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" "0.0" +"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" ".0" +"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" G "<0>Sacramento" +"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "<0><2>San Francisco" +"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "<0><3>San Luis Obispo" +"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "SanFrancisco" +"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "SanLuisObispo" +"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "San francisco" +"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" G "<0>{e02ff0e4-00ad-090A-c030-0d00a0008ba0}" +"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" G "<0>e02ff0e4-00ad-090A-c030-0d00a0008ba0" +"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" "0xe02ff0e400ad090Ac0300d00a0008ba0" +"^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$" G "<0>{e02ff0e4-00ad-090A-c030-0d00a0008ba0}" +"^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$" G "<0>e02ff0e4-00ad-090A-c030-0d00a0008ba0" +"^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$" "0xe02ff0e400ad090Ac0300d00a0008ba0" +"^([a-zA-Z0-9@*#]{8,15})$" G "<0>@12X*567" +"^([a-zA-Z0-9@*#]{8,15})$" G "<0>1#Zv96g@*Yfasd4" +"^([a-zA-Z0-9@*#]{8,15})$" G "<0>#67jhgt@erd" +"^([a-zA-Z0-9@*#]{8,15})$" "$12X*567" +"^([a-zA-Z0-9@*#]{8,15})$" "1#Zv_96" +"^([a-zA-Z0-9@*#]{8,15})$" "+678jhgt@erd" +'(("|\u0027)[a-z0-9\/\.\?\=\&]*(\.htm|\.asp|\.php|\.jsp)[a-z0-9\/\.\?\=\&]*("|\u0027))|(href=*?[a-z0-9\/\.\?\=\&"\u0027]*)' G '<0>href="produktsida.asp?kategori2=218"' +'(("|\u0027)[a-z0-9\/\.\?\=\&]*(\.htm|\.asp|\.php|\.jsp)[a-z0-9\/\.\?\=\&]*("|\u0027))|(href=*?[a-z0-9\/\.\?\=\&"\u0027]*)' G '<0>href="NuclearTesting.htm"' +'(("|\u0027)[a-z0-9\/\.\?\=\&]*(\.htm|\.asp|\.php|\.jsp)[a-z0-9\/\.\?\=\&]*("|\u0027))|(href=*?[a-z0-9\/\.\?\=\&"\u0027]*)' 'U Suck' +"^(((((0[1-9])|(1\d)|(2[0-8]))-((0[1-9])|(1[0-2])))|((31-((0[13578])|(1[02])))|((29|30)-((0[1,3-9])|(1[0-2])))))-((20[0-9][0-9]))|(29-02-20(([02468][048])|([13579][26]))))$" G "<0>05-01-2002" +"^(((((0[1-9])|(1\d)|(2[0-8]))-((0[1-9])|(1[0-2])))|((31-((0[13578])|(1[02])))|((29|30)-((0[1,3-9])|(1[0-2])))))-((20[0-9][0-9]))|(29-02-20(([02468][048])|([13579][26]))))$" G "<0>29-02-2004" +"^(((((0[1-9])|(1\d)|(2[0-8]))-((0[1-9])|(1[0-2])))|((31-((0[13578])|(1[02])))|((29|30)-((0[1,3-9])|(1[0-2])))))-((20[0-9][0-9]))|(29-02-20(([02468][048])|([13579][26]))))$" G "<0>31-12-2002" +"^(((((0[1-9])|(1\d)|(2[0-8]))-((0[1-9])|(1[0-2])))|((31-((0[13578])|(1[02])))|((29|30)-((0[1,3-9])|(1[0-2])))))-((20[0-9][0-9]))|(29-02-20(([02468][048])|([13579][26]))))$" "1-1-02" 
+"^(((((0[1-9])|(1\d)|(2[0-8]))-((0[1-9])|(1[0-2])))|((31-((0[13578])|(1[02])))|((29|30)-((0[1,3-9])|(1[0-2])))))-((20[0-9][0-9]))|(29-02-20(([02468][048])|([13579][26]))))$" "29-02-2002" +"^(((((0[1-9])|(1\d)|(2[0-8]))-((0[1-9])|(1[0-2])))|((31-((0[13578])|(1[02])))|((29|30)-((0[1,3-9])|(1[0-2])))))-((20[0-9][0-9]))|(29-02-20(([02468][048])|([13579][26]))))$" "31-11-2002" +"^\d*[0-9](|.\d*[0-9]|,\d*[0-9])?$" G "<0>123456.123456" +"^\d*[0-9](|.\d*[0-9]|,\d*[0-9])?$" G "<0>123456,123456" +"^\d*[0-9](|.\d*[0-9]|,\d*[0-9])?$" G "<0>123456" +"^\d*[0-9](|.\d*[0-9]|,\d*[0-9])?$" "123a.123" +"^\d*[0-9](|.\d*[0-9]|,\d*[0-9])?$" "123a,123" +"^\d*[0-9](|.\d*[0-9]|,\d*[0-9])?$" "a" +"^(ac|AC|al|AL|am|AM|ap|AP|ba|BA|ce|CE|df|DF|es|ES|go|GO|ma|MA|mg|MG|ms|MS|mt|MT|pa|PA|pb|PB|pe|PE|pi|PI|pr|PR|rj|RJ|rn|RN|ro|RO|rr|RR|rs|RS|sc|SC|se|SE|sp|SP|to|TO)$" G "<0>AC" +"^(ac|AC|al|AL|am|AM|ap|AP|ba|BA|ce|CE|df|DF|es|ES|go|GO|ma|MA|mg|MG|ms|MS|mt|MT|pa|PA|pb|PB|pe|PE|pi|PI|pr|PR|rj|RJ|rn|RN|ro|RO|rr|RR|rs|RS|sc|SC|se|SE|sp|SP|to|TO)$" G "<0>RJ" +"^(ac|AC|al|AL|am|AM|ap|AP|ba|BA|ce|CE|df|DF|es|ES|go|GO|ma|MA|mg|MG|ms|MS|mt|MT|pa|PA|pb|PB|pe|PE|pi|PI|pr|PR|rj|RJ|rn|RN|ro|RO|rr|RR|rs|RS|sc|SC|se|SE|sp|SP|to|TO)$" G "<0>SP" +"^(ac|AC|al|AL|am|AM|ap|AP|ba|BA|ce|CE|df|DF|es|ES|go|GO|ma|MA|mg|MG|ms|MS|mt|MT|pa|PA|pb|PB|pe|PE|pi|PI|pr|PR|rj|RJ|rn|RN|ro|RO|rr|RR|rs|RS|sc|SC|se|SE|sp|SP|to|TO)$" "XX" +"^(ac|AC|al|AL|am|AM|ap|AP|ba|BA|ce|CE|df|DF|es|ES|go|GO|ma|MA|mg|MG|ms|MS|mt|MT|pa|PA|pb|PB|pe|PE|pi|PI|pr|PR|rj|RJ|rn|RN|ro|RO|rr|RR|rs|RS|sc|SC|se|SE|sp|SP|to|TO)$" "AB" +"^(ac|AC|al|AL|am|AM|ap|AP|ba|BA|ce|CE|df|DF|es|ES|go|GO|ma|MA|mg|MG|ms|MS|mt|MT|pa|PA|pb|PB|pe|PE|pi|PI|pr|PR|rj|RJ|rn|RN|ro|RO|rr|RR|rs|RS|sc|SC|se|SE|sp|SP|to|TO)$" "HJ" +"^[+]?\d*$" G "<0>0123456789" +"^[+]?\d*$" G "<0>1234" +"^[+]?\d*$" G "<0>1" +"^[+]?\d*$" "1.0?&" +"^[+]?\d*$" "a1" +"^[+]?\d*$" "2a-" +#/<[aA][ ]{0,}([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,}>((<(([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,})>([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,})|(([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,})){1,}/ G "<0>this text is italicized" #TODO: Need infinite loop breaking +#/<[aA][ ]{0,}([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,}>((<(([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,})>([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,})|(([a-zA-Z0-9"'_,.:;!?@$\&()%=\u002f ]|[\-]|[ \f]){0,})){1,}/ "

" #TODO: need infinite loop breaking. +"^([0-1]?[0-9]|[2][0-3]):([0-5][0-9])$" G "<0>0:00" +"^([0-1]?[0-9]|[2][0-3]):([0-5][0-9])$" G "<0>23:00" +"^([0-1]?[0-9]|[2][0-3]):([0-5][0-9])$" G "<0>00:59" +"^([0-1]?[0-9]|[2][0-3]):([0-5][0-9])$" "0:0" +"^([0-1]?[0-9]|[2][0-3]):([0-5][0-9])$" "24:00" +"^([0-1]?[0-9]|[2][0-3]):([0-5][0-9])$" "00:60" +"^((0[1-9])|(1[0-2]))\/(\d{2})$" G "<0>11/03" +"^((0[1-9])|(1[0-2]))\/(\d{2})$" G "<0>01/04" +"^((0[1-9])|(1[0-2]))\/(\d{2})$" "13/03" +"^((0[1-9])|(1[0-2]))\/(\d{2})$" "10/2003" +"]*>[\w|\t|\r|\W]*" G '<0>' +"]*>[\w|\t|\r|\W]*" "--" +"]*>[\w|\t|\r|\W]*" "A-Z][a-z]+" +#"]*>[\w|\t|\r|\W]*" G "<0>strFirstName" # Test Case damaged? +#"]*>[\w|\t|\r|\W]*" G "<0>intAgeInYears" # Test Case damaged? +#"]*>[\w|\t|\r|\W]*" G "<0>Where the Wild Things Are" # Test Case damaged? +"]*>[\w|\t|\r|\W]*" "123" +"]*>[\w|\t|\r|\W]*" "abc" +"]*>[\w|\t|\r|\W]*" "this has no caps in it" +"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" G "<0>-0.050" +"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" G "<0>-5.000" +"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" G "<0>-5" +"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" "0" +"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" "0.0" +"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" ".0" +"^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1]))$|^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" G "<0>2002/02/03" +"^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1]))$|^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" G "<0>2002/02/03 12:12:18" +"^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1]))$|^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" "2002/02/36" +"^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1]))$|^([2][0]\d{2}\/([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" "02/03/2002" +"^(\d|,)*\.?\d*$" G "<0>1,000" +"^(\d|,)*\.?\d*$" G "<0>3,000.05" +"^(\d|,)*\.?\d*$" G "<0>5,000,000" +"^(\d|,)*\.?\d*$" "abc" +"^(\d|,)*\.?\d*$" "$100,000" +"^(\d|,)*\.?\d*$" "Forty" +"^\d$" G "<0>1" +"^\d$" G "<0>2" +"^\d$" G "<0>3" +"^\d$" "a" +"^\d$" "324" +"^\d$" "num" +"^[0-9]+$" G "<0>1234567890" +"^[0-9]+$" G "<0>1234567890" +"^[0-9]+$" G "<0>1234567890" +"^[0-9]+$" "http://none" +"^[0-9]+$" "http://none" +"^[0-9]+$" "http://none" +"^.{4,8}$" G "<0>asdf" +"^.{4,8}$" G "<0>1234" +"^.{4,8}$" G "<0>asdf1234" +"^.{4,8}$" "asd" +"^.{4,8}$" "123" +"^.{4,8}$" "asdfe12345" +"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.com" +"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.com.au" +"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.au" +"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "word" +"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "word@" +"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "@word" +"^\d{5}-\d{4}$" G "<0>22222-3333" +"^\d{5}-\d{4}$" G "<0>34545-2367" +"^\d{5}-\d{4}$" G "<0>56334-2343" +"^\d{5}-\d{4}$" "123456789" +"^\d{5}-\d{4}$" "A3B 4C5" +"^\d{5}-\d{4}$" "55335" +"(a|b|c).(a.b)*.b+.c" G "<0>autbfc" +"(a|b|c).(a.b)*.b+.c" "attc" +'"((\\")|[^"(\\")])+"' G '<0>"test"' +'"((\\")|[^"(\\")])+"' G '<0>"escape\"quote"' +'"((\\")|[^"(\\")])+"' G '<0>"\\""' +'"((\\")|[^"(\\")])+"' "test" +'"((\\")|[^"(\\")])+"' '"test' +'"((\\")|[^"(\\")])+"' '""test\\"' +"((0[1-9])|(1[02]))/\d{2}" G "<0>01/00" +"((0[1-9])|(1[02]))/\d{2}" G "<0>12/99" +"((0[1-9])|(1[02]))/\d{2}" "13/00" +"((0[1-9])|(1[02]))/\d{2}" "12/AS" +"^[a-zA-Z]$" G "<0>a" +"^[a-zA-Z]$" G "<0>B" +"^[a-zA-Z]$" G "<0>c" +"^[a-zA-Z]$" "0" +"^[a-zA-Z]$" "&" +"^[a-zA-Z]$" "AbC" +"^[a-zA-Z]+$" G 
"<0>abc" +"^[a-zA-Z]+$" G "<0>ABC" +"^[a-zA-Z]+$" G "<0>aBcDeF" +"^[a-zA-Z]+$" "abc123" +"^[a-zA-Z]+$" "mr." +"^[a-zA-Z]+$" "a word" +"^\s*[a-zA-Z,\p{Zs}]+\s*$" G "<0>Smith, Ed" +"^\s*[a-zA-Z,\p{Zs}]+\s*$" G "<0>Ed Smith" +"^\s*[a-zA-Z,\p{Zs}]+\s*$" G "<0>aBcDeFgH" +"^\s*[a-zA-Z,\p{Zs}]+\s*$" "a123" +"^\s*[a-zA-Z,\p{Zs}]+\s*$" "AB5" +"^\s*[a-zA-Z,\p{Zs}]+\s*$" "Mr. Ed" +"(\w+?@\w+?\u002E.+)" G "<0>bob@vsnl.com" +"(\w+?@\w+?\u002E.+)" "[AABB]" +"^\d+$" G "<0>123" +"^\d+$" G "<0>10" +"^\d+$" G "<0>54" +"^\d+$" "-54" +"^\d+$" "54.234" +"^\d+$" "abc" +"^(\+|-)?\d+$" G "<0>-34" +"^(\+|-)?\d+$" G "<0>34" +"^(\+|-)?\d+$" G "<0>+5" +"^(\+|-)?\d+$" "abc" +"^(\+|-)?\d+$" "3.1415" +"^(\+|-)?\d+$" "-5.3" +"foo" G "<0>foo" +"foo" "bar" +"^[1-5]$" G "<0>1" +"^[1-5]$" G "<0>3" +"^[1-5]$" G "<0>4" +"^[1-5]$" "6" +"^[1-5]$" "23" +"^[1-5]$" "a" +"^[12345]$" G "<0>1" +"^[12345]$" G "<0>2" +"^[12345]$" G "<0>4" +"^[12345]$" "6" +"^[12345]$" "-1" +"^[12345]$" "abc" +"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@aol.com" +"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@wrox.co.uk" +"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@domain.info" +"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "a@b" +"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "notanemail" +"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "joe@@." +"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" G "<0>joe@aol.com" +"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" G "<0>ssmith@aspalliance.com" +"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" G "<0>a@b.cc" +"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" "joe@123aspx.com" +"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" "joe@web.info" +"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" "joe@company.co.uk" +"[\w-]+@([\w-]+\.)+[\w-]+" G "<0>joe@aol.com" +"[\w-]+@([\w-]+\.)+[\w-]+" G "<0>a@b.c" +"[\w-]+@([\w-]+\.)+[\w-]+" "asdf" +"[\w-]+@([\w-]+\.)+[\w-]+" "1234" +"\d{4}-?\d{4}-?\d{4}-?\d{4}" G "<0>1234-1234-1234-1234" +"\d{4}-?\d{4}-?\d{4}-?\d{4}" G "<0>1234123412341234" +"\d{4}-?\d{4}-?\d{4}-?\d{4}" "1234123412345" +"^\d{5}$" G "<0>33333" +"^\d{5}$" G "<0>55555" +"^\d{5}$" G "<0>23445" +"^\d{5}$" "abcd" +"^\d{5}$" "1324" +"^\d{5}$" "as;lkjdf" +"(\w+)\s+\1" G "<0>hubba hubba" +"(\w+)\s+\1" G "<0>mandate dated" +"(\w+)\s+\1" G "<0>an annual" +"(\w+)\s+\1" "may day" +"(\w+)\s+\1" "gogo" +"(\w+)\s+\1" "1212" +"^[a-zA-Z0-9\-\.]+\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>3SquareBand.com" +"^[a-zA-Z0-9\-\.]+\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>asp.net" +"^[a-zA-Z0-9\-\.]+\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>army.mil" +"^[a-zA-Z0-9\-\.]+\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" "$SquareBand.com" +"^[a-zA-Z0-9\-\.]+\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" "asp/dot.net" +"^[a-zA-Z0-9\-\.]+\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" "army.military" + diff --git a/go/mysql/icuregex/testdata/regextst_extended.txt b/go/mysql/icuregex/testdata/regextst_extended.txt new file mode 100644 index 00000000000..841e5e46092 --- /dev/null +++ b/go/mysql/icuregex/testdata/regextst_extended.txt @@ -0,0 +1,126 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (c) 2001-2015 International Business Machines +# Corporation and others. All Rights Reserved. +# +# file: +# +# ICU regular expression test cases. +# +# format: one test case per line, +# = [# comment] +# = "" +# = "" +# the quotes on the pattern and match string can be " or ' or / +# = text, with the start and end of each +# capture group tagged with .... The overall match, +# if any, is group 0, as in <0>matched text +# A region can be specified with ... 
tags. +# Standard ICU unescape will be applied, allowing \u, \U, etc. to appear. +# +# = any combination of +# i case insensitive match +# x free spacing and comments +# s dot-matches-all mode +# m multi-line mode. +# ($ and ^ match at embedded new-lines) +# D Unix Lines mode (only recognize 0x0a as new-line) +# Q UREGEX_LITERAL flag. Entire pattern is literal string. +# v If icu configured without break iteration, this +# regex test pattern should not compile. +# e set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag +# d dump the compiled pattern +# t trace operation of match engine. +# 2-9 a digit between 2 and 9, specifies the number of +# times to execute find(). The expected results are +# for the last find() in the sequence. +# G Only check match / no match. Do not check capture groups. +# E Pattern compilation error expected +# L Use LookingAt() rather than find() +# M Use matches() rather than find(). +# +# a Use non-Anchoring Bounds. +# b Use Transparent Bounds. +# The a and b options only make a difference if +# a region has been specified in the string. +# z|Z hitEnd was expected(z) or not expected (Z). +# With neither, hitEnd is not checked. +# y|Y Require End expected(y) or not expected (Y). +# +# White space must be present between the flags and the match string. +# + +"[:xdigit:]" " <0>4f" +"\P{XDIGIT}+" "4f<0> " + +"[:blank:]" "<0> 4f" +"\P{BLANK}+" "<0>4f " + +"[:print:]" "<0> 4f\x07" +"\P{PRINT}+" " 4f<0>\x07" + +"\p{Age=1.1}" "<0>4f🥱" +"\p{Age=11}" "4f🥱" +"\p{Age=12}" "4f<0>🥱" + +"\p{Name=LATIN SMALL LETTER B}" "Good<0>bye" + +"\p{Numeric_Value=3}" "Good<0>3ye" +"\p{Numeric_Value=14}" "Good<0>⑭ye" + +"\p{Script_Extensions=Greek}" "Good<0>βye" + +"\p{Bidi_Control}" "Good<0>\u200Eye" +"\p{Bidi_Class=LeftToRight}" "<0>Goodbye" +"\p{Bidi_Class=RightToLeft}" "Goodbye" +"\p{Bidi_Class=LeftToRight}" "؈" +"\p{Bidi_Paired_Bracket_Type=Open}" "Good<0>(ye" + +"\p{Soft_Dotted}" "Good<0>iye" + +"\p{Changes_When_Lowercased}" "<0>Goodbye" +"\p{Changes_When_Titlecased}" "<0>goodbye" +"\p{Changes_When_Uppercased}" "G<0>oodbye" +"\p{Changes_When_CaseMapped}" " <0>Goodbye3" +"\p{Cased}" " <0>Goodbye3" +"\p{CaseIgnorable}" "foo<0>.bar" + +"\p{Indic_Syllabic_Category=Avagraha}" "foo<0>\u09BDbar" +"\p{IndicPositionalCategory=Top_And_Left_And_Right}" "foo<0>\u0B4Cbar" +"\p{VerticalOrientation=U}" "foo<0>\uA015bar" + +"\p{Canonical_Combining_Class=Nukta}" "foo<0>\u093Cbar" +"\p{Lead_Canonical_Combining_Class=Above}" "foo<0>\u0300bar" +"\p{Trail_Canonical_Combining_Class=Above}" "foo<0>\u0300bar" + +"\p{Changes_When_Casefolded}" "<0>\uFB03Goodbye" +"\p{Changes_When_Casefolded}" 2 "\uFB03<0>Goodbye" + +"\p{NFC_Inert}" "foo<0>\uFB03bar" +"\p{NFKC_Inert}" "foo<0>\uFB03bar" +"\P{NFD_Inert}" "foo<0>Àbar" +"\P{NFKD_Inert}" "foo<0>Àbar" + +"\p{NFC_Quick_Check=No}" "foo<0>\u0340bar" +"\p{NFKC_Quick_Check=No}" "foo<0>\u0340bar" +"\p{NFD_Quick_Check=No}" "foo<0>\u00C0bar" +"\p{NFKD_Quick_Check=No}" "foo<0>\u00C0bar" + +"\p{Full_Composition_Exclusion}" "foo<0>\u0374bar" + +"\p{Numeric_Type=Decimal}" "foo<0>3bar" +"\p{Joining_Type=Dual_Joining}" "foo<0>\u0626bar" +"\p{Joining_Group=African_Feh}" "foo<0>\u08BBbar" +"\p{General_Category=Close_Punctuation}" "foo[bar" +"\p{General_Category=Close_Punctuation}" "foo<0>]]bar" +"\p{General_Category=Close_Punctuation}" 2 "foo]<0>]bar" + +"\p{Hangul_Syllable_Type=Not_Applicable}" "<0>f" +"\p{Hangul_Syllable_Type=Leading_Jamo}" "foo<0>\u1100bar" + +"\p{Regional_Indicator=Yes}" "foo<0>\U0001F1E6bar" + +# Currently unsupported property classes below. 
They require +# significant additional code to support. +"\p{Changes_When_NFKC_Casefolded}" E "foo<0>\uFB03bar" +"\p{Segment_Starter}" E "<0>\uFB03Goodbye" \ No newline at end of file diff --git a/go/mysql/sql_error.go b/go/mysql/sql_error.go index c400de4ef9a..ac988033e3d 100644 --- a/go/mysql/sql_error.go +++ b/go/mysql/sql_error.go @@ -218,6 +218,28 @@ var stateToMysqlCode = map[vterrors.State]mysqlCode{ vterrors.WrongArguments: {num: ERWrongArguments, state: SSUnknownSQLState}, vterrors.UnknownStmtHandler: {num: ERUnknownStmtHandler, state: SSUnknownSQLState}, vterrors.UnknownTimeZone: {num: ERUnknownTimeZone, state: SSUnknownSQLState}, + vterrors.RegexpStringNotTerminated: {num: ERRegexpStringNotTerminated, state: SSUnknownSQLState}, + vterrors.RegexpBufferOverflow: {num: ERRegexpBufferOverflow, state: SSUnknownSQLState}, + vterrors.RegexpIllegalArgument: {num: ERRegexpIllegalArgument, state: SSUnknownSQLState}, + vterrors.RegexpIndexOutOfBounds: {num: ERRegexpIndexOutOfBounds, state: SSUnknownSQLState}, + vterrors.RegexpInternal: {num: ERRegexpInternal, state: SSUnknownSQLState}, + vterrors.RegexpRuleSyntax: {num: ERRegexpRuleSyntax, state: SSUnknownSQLState}, + vterrors.RegexpBadEscapeSequence: {num: ERRegexpBadEscapeSequence, state: SSUnknownSQLState}, + vterrors.RegexpUnimplemented: {num: ERRegexpUnimplemented, state: SSUnknownSQLState}, + vterrors.RegexpMismatchParen: {num: ERRegexpMismatchParen, state: SSUnknownSQLState}, + vterrors.RegexpBadInterval: {num: ERRegexpBadInterval, state: SSUnknownSQLState}, + vterrors.RegexpMaxLtMin: {num: ERRRegexpMaxLtMin, state: SSUnknownSQLState}, + vterrors.RegexpInvalidBackRef: {num: ERRegexpInvalidBackRef, state: SSUnknownSQLState}, + vterrors.RegexpLookBehindLimit: {num: ERRegexpLookBehindLimit, state: SSUnknownSQLState}, + vterrors.RegexpMissingCloseBracket: {num: ERRegexpMissingCloseBracket, state: SSUnknownSQLState}, + vterrors.RegexpInvalidRange: {num: ERRegexpInvalidRange, state: SSUnknownSQLState}, + vterrors.RegexpStackOverflow: {num: ERRegexpStackOverflow, state: SSUnknownSQLState}, + vterrors.RegexpTimeOut: {num: ERRegexpTimeOut, state: SSUnknownSQLState}, + vterrors.RegexpPatternTooBig: {num: ERRegexpPatternTooBig, state: SSUnknownSQLState}, + vterrors.RegexpInvalidFlag: {num: ERRegexpInvalidFlag, state: SSUnknownSQLState}, + vterrors.RegexpInvalidCaptureGroup: {num: ERRegexpInvalidCaptureGroup, state: SSUnknownSQLState}, + vterrors.CharacterSetMismatch: {num: ERCharacterSetMismatch, state: SSUnknownSQLState}, + vterrors.WrongParametersToNativeFct: {num: ERWrongParametersToNativeFct, state: SSUnknownSQLState}, } func getStateToMySQLState(state vterrors.State) mysqlCode { diff --git a/go/vt/vterrors/state.go b/go/vt/vterrors/state.go index d7ed04e1c7b..609ab6fbd1b 100644 --- a/go/vt/vterrors/state.go +++ b/go/vt/vterrors/state.go @@ -88,6 +88,31 @@ const ( // unknown timezone UnknownTimeZone + // regexp errors + RegexpStringNotTerminated + RegexpBufferOverflow + RegexpIllegalArgument + RegexpIndexOutOfBounds + RegexpInternal + RegexpRuleSyntax + RegexpBadEscapeSequence + RegexpUnimplemented + RegexpMismatchParen + RegexpBadInterval + RegexpMaxLtMin + RegexpInvalidBackRef + RegexpLookBehindLimit + RegexpMissingCloseBracket + RegexpInvalidRange + RegexpStackOverflow + RegexpTimeOut + RegexpPatternTooBig + RegexpInvalidCaptureGroup + RegexpInvalidFlag + + CharacterSetMismatch + WrongParametersToNativeFct + // No state should be added below NumOfStates NumOfStates ) diff --git a/go/vt/vtgate/evalengine/cached_size.go 
b/go/vt/vtgate/evalengine/cached_size.go index c249bf3e86c..ea525e46a25 100644 --- a/go/vt/vtgate/evalengine/cached_size.go +++ b/go/vt/vtgate/evalengine/cached_size.go @@ -1257,6 +1257,54 @@ func (cached *builtinRandomBytes) CachedSize(alloc bool) int64 { size += cached.CallExpr.CachedSize(false) return size } +func (cached *builtinRegexpInstr) CachedSize(alloc bool) int64 { + if cached == nil { + return int64(0) + } + size := int64(0) + if alloc { + size += int64(48) + } + // field CallExpr vitess.io/vitess/go/vt/vtgate/evalengine.CallExpr + size += cached.CallExpr.CachedSize(false) + return size +} +func (cached *builtinRegexpLike) CachedSize(alloc bool) int64 { + if cached == nil { + return int64(0) + } + size := int64(0) + if alloc { + size += int64(48) + } + // field CallExpr vitess.io/vitess/go/vt/vtgate/evalengine.CallExpr + size += cached.CallExpr.CachedSize(false) + return size +} +func (cached *builtinRegexpReplace) CachedSize(alloc bool) int64 { + if cached == nil { + return int64(0) + } + size := int64(0) + if alloc { + size += int64(48) + } + // field CallExpr vitess.io/vitess/go/vt/vtgate/evalengine.CallExpr + size += cached.CallExpr.CachedSize(false) + return size +} +func (cached *builtinRegexpSubstr) CachedSize(alloc bool) int64 { + if cached == nil { + return int64(0) + } + size := int64(0) + if alloc { + size += int64(48) + } + // field CallExpr vitess.io/vitess/go/vt/vtgate/evalengine.CallExpr + size += cached.CallExpr.CachedSize(false) + return size +} func (cached *builtinRepeat) CachedSize(alloc bool) int64 { if cached == nil { return int64(0) diff --git a/go/vt/vtgate/evalengine/compare.go b/go/vt/vtgate/evalengine/compare.go index f2262cf8730..deee5fdb520 100644 --- a/go/vt/vtgate/evalengine/compare.go +++ b/go/vt/vtgate/evalengine/compare.go @@ -137,7 +137,7 @@ func compareStrings(l, r eval) (int, error) { if err != nil { return 0, err } - collation := col.Get() + collation := col.Collation.Get() if collation == nil { panic("unknown collation after coercion") } diff --git a/go/vt/vtgate/evalengine/compiler_asm.go b/go/vt/vtgate/evalengine/compiler_asm.go index 870c32fd767..1267eaf1d1d 100644 --- a/go/vt/vtgate/evalengine/compiler_asm.go +++ b/go/vt/vtgate/evalengine/compiler_asm.go @@ -35,6 +35,8 @@ import ( "github.com/google/uuid" + "vitess.io/vitess/go/mysql/icuregex" + "vitess.io/vitess/go/hack" "vitess.io/vitess/go/mysql/collations" "vitess.io/vitess/go/mysql/collations/charset" @@ -3942,10 +3944,6 @@ func (asm *assembler) Fn_YEARWEEK() { }, "FN YEARWEEK DATE(SP-1)") } -func intervalStackOffset(l, i int) int { - return l - i + 1 -} - func (asm *assembler) Interval_i(l int) { asm.adjustStack(-l) asm.emit(func(env *ExpressionEnv) int { @@ -4285,3 +4283,442 @@ func (asm *assembler) Fn_DATEADD_s(unit datetime.IntervalType, sub bool, col col }, "FN DATEADD TEMPORAL(SP-2), INTERVAL(SP-1)") } + +func (asm *assembler) Fn_REGEXP_LIKE(m *icuregex.Matcher, negate bool, c charset.Charset, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + m.Reset(charset.Expand(nil, input.bytes, c)) + + ok, err := m.Find() + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + if negate { + ok = !ok + } + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalBool(ok) + env.vm.sp -= offset + return 1 + }, "FN REGEXP_LIKE VARCHAR(SP-2), VARCHAR(SP-1)") +} + +func (asm *assembler) Fn_REGEXP_LIKE_slow(negate bool, c collations.Charset, flags icuregex.RegexpFlag, offset int) { + 
asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + var err error + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + pattern := env.vm.stack[env.vm.sp-offset].(*evalBytes) + + if offset > 1 { + fe := env.vm.stack[env.vm.sp-offset+1] + flags, err = regexpFlags(fe, flags, "regexp_like") + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + } + + p, err := compileRegex(pattern, c, flags) + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + + m := icuregex.NewMatcher(p) + m.Reset(charset.Expand(nil, input.bytes, c)) + + ok, err := m.Find() + if err != nil { + env.vm.err = err + env.vm.sp-- + return 1 + } + if negate { + ok = !ok + } + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalBool(ok) + env.vm.sp -= offset + return 1 + }, "FN REGEXP_LIKE_SLOW VARCHAR(SP-2), VARCHAR(SP-1)") +} + +func (asm *assembler) Fn_REGEXP_INSTR(m *icuregex.Matcher, c charset.Charset, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + runes := charset.Expand(nil, input.bytes, c) + + if len(runes) == 0 { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(0) + env.vm.sp -= offset + return 1 + } + + pos := int64(1) + if offset > 1 { + pos, env.vm.err = positionInstr(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), int64(len(runes))) + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + occ := int64(1) + if offset > 2 { + occ = occurrence(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), occ) + } + + returnOpt := int64(0) + if offset > 3 { + returnOpt, env.vm.err = returnOption(env.vm.stack[env.vm.sp-offset+3].(*evalInt64), "regexp_instr") + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + m.Reset(runes[pos-1:]) + + found := false + for i := int64(0); i < occ; i++ { + found, env.vm.err = m.Find() + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + if !found { + break + } + } + if !found { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(0) + } else if returnOpt == 0 { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(int64(m.Start()) + pos) + } else { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(int64(m.End()) + pos) + } + env.vm.sp -= offset + return 1 + }, "FN REGEXP_INSTR VARCHAR(SP-2), VARCHAR(SP-1)") +} + +func (asm *assembler) Fn_REGEXP_INSTR_slow(c collations.Charset, flags icuregex.RegexpFlag, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + pattern := env.vm.stack[env.vm.sp-offset].(*evalBytes) + + if offset > 4 { + fe := env.vm.stack[env.vm.sp-offset+4] + flags, env.vm.err = regexpFlags(fe, flags, "regexp_instr") + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + p, err := compileRegex(pattern, c, flags) + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + + runes := charset.Expand(nil, input.bytes, c) + if len(runes) == 0 { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(0) + env.vm.sp -= offset + return 1 + } + + pos := int64(1) + if offset > 1 { + pos, env.vm.err = positionInstr(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), int64(len(runes))) + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + occ := int64(1) + if offset > 2 { + occ = occurrence(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), occ) + } + + returnOpt := int64(0) + if offset > 3 { + returnOpt, env.vm.err = 
returnOption(env.vm.stack[env.vm.sp-offset+3].(*evalInt64), "regexp_instr") + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + m := icuregex.NewMatcher(p) + m.Reset(runes[pos-1:]) + + found := false + for i := int64(0); i < occ; i++ { + found, env.vm.err = m.Find() + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + if !found { + break + } + } + if !found { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(0) + } else if returnOpt == 0 { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(int64(m.Start()) + pos) + } else { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalInt64(int64(m.End()) + pos) + } + env.vm.sp -= offset + return 1 + }, "FN REGEXP_INSTR_SLOW VARCHAR(SP-2), VARCHAR(SP-1)") +} + +func (asm *assembler) Fn_REGEXP_SUBSTR(m *icuregex.Matcher, merged collations.TypedCollation, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + c := merged.Collation.Get().Charset() + runes := charset.Expand(nil, input.bytes, c) + + pos := int64(1) + if offset > 1 { + limit := int64(len(runes)) + pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), limit, "regexp_substr") + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + if pos-1 == limit { + env.vm.stack[env.vm.sp-offset-1] = nil + env.vm.sp -= offset + return 1 + } + } + + occ := int64(1) + if offset > 2 { + occ = occurrence(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), occ) + } + + m.Reset(runes[pos-1:]) + + found := false + for i := int64(0); i < occ; i++ { + found, env.vm.err = m.Find() + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + if !found { + break + } + } + + if !found { + env.vm.stack[env.vm.sp-offset-1] = nil + } else { + out := runes[int64(m.Start())+pos-1 : int64(m.End())+pos-1] + b := charset.Collapse(nil, out, c) + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalText(b, resultCollation(merged)) + } + env.vm.sp -= offset + return 1 + }, "FN REGEXP_SUBSTR VARCHAR(SP-2), VARCHAR(SP-1)") +} + +func (asm *assembler) Fn_REGEXP_SUBSTR_slow(merged collations.TypedCollation, flags icuregex.RegexpFlag, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + pattern := env.vm.stack[env.vm.sp-offset].(*evalBytes) + c := merged.Collation.Get().Charset() + runes := charset.Expand(nil, input.bytes, c) + + pos := int64(1) + if offset > 1 { + limit := int64(len(runes)) + pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+1].(*evalInt64), limit, "regexp_substr") + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + if pos-1 == limit { + env.vm.stack[env.vm.sp-offset-1] = nil + env.vm.sp -= offset + return 1 + } + } + + occ := int64(1) + if offset > 2 { + occ = occurrence(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), occ) + } + + if offset > 3 { + fe := env.vm.stack[env.vm.sp-offset+3] + flags, env.vm.err = regexpFlags(fe, flags, "regexp_substr") + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + p, err := compileRegex(pattern, c, flags) + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + + m := icuregex.NewMatcher(p) + m.Reset(runes[pos-1:]) + + found := false + for i := int64(0); i < occ; i++ { + found, env.vm.err = m.Find() + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + if !found { + break + } + } + + if !found { + env.vm.stack[env.vm.sp-offset-1] = nil + } else { + out := 
runes[int64(m.Start())+pos-1 : int64(m.End())+pos-1] + b := charset.Collapse(nil, out, c) + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalText(b, resultCollation(merged)) + } + env.vm.sp -= offset + return 1 + }, "FN REGEXP_SUBSTR_SLOW VARCHAR(SP-2), VARCHAR(SP-1)") +} + +func (asm *assembler) Fn_REGEXP_REPLACE(m *icuregex.Matcher, merged collations.TypedCollation, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + repl := env.vm.stack[env.vm.sp-offset+1].(*evalBytes) + + c := merged.Collation.Get().Charset() + inputRunes := charset.Expand(nil, input.bytes, c) + replRunes := charset.Expand(nil, repl.bytes, c) + + pos := int64(1) + if offset > 2 { + limit := int64(len(inputRunes)) + pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), limit, "regexp_replace") + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + if pos-1 == limit { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalRaw(input.bytes, sqltypes.Text, resultCollation(merged)) + env.vm.sp -= offset + return 1 + } + } + + occ := int64(0) + if offset > 3 { + occ = occurrence(env.vm.stack[env.vm.sp-offset+3].(*evalInt64), occ) + } + + m.Reset(inputRunes[pos-1:]) + + b, replaced, err := regexpReplace(m, inputRunes, replRunes, pos, occ, merged.Collation.Get().Charset()) + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + if !replaced { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalRaw(input.bytes, sqltypes.Text, resultCollation(merged)) + } else { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalRaw(b, sqltypes.Text, resultCollation(merged)) + } + env.vm.sp -= offset + return 1 + }, "FN REGEXP_REPLACE VARCHAR(SP-2), VARCHAR(SP-1)") +} + +func (asm *assembler) Fn_REGEXP_REPLACE_slow(merged collations.TypedCollation, flags icuregex.RegexpFlag, offset int) { + asm.adjustStack(-offset) + asm.emit(func(env *ExpressionEnv) int { + input := env.vm.stack[env.vm.sp-offset-1].(*evalBytes) + pattern := env.vm.stack[env.vm.sp-offset].(*evalBytes) + repl := env.vm.stack[env.vm.sp-offset+1].(*evalBytes) + + c := merged.Collation.Get().Charset() + inputRunes := charset.Expand(nil, input.bytes, c) + replRunes := charset.Expand(nil, repl.bytes, c) + + pos := int64(1) + if offset > 2 { + limit := int64(len(inputRunes)) + pos, env.vm.err = position(env.vm.stack[env.vm.sp-offset+2].(*evalInt64), limit, "regexp_replace") + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + if pos-1 == limit { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalRaw(input.bytes, sqltypes.Text, resultCollation(merged)) + env.vm.sp -= offset + return 1 + } + } + + occ := int64(0) + if offset > 3 { + occ = occurrence(env.vm.stack[env.vm.sp-offset+3].(*evalInt64), 0) + } + + if offset > 4 { + fe := env.vm.stack[env.vm.sp-offset+4] + flags, env.vm.err = regexpFlags(fe, flags, "regexp_replace") + if env.vm.err != nil { + env.vm.sp -= offset + return 1 + } + } + + p, err := compileRegex(pattern, c, flags) + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + + m := icuregex.NewMatcher(p) + m.Reset(inputRunes[pos-1:]) + + b, replaced, err := regexpReplace(m, inputRunes, replRunes, pos, occ, merged.Collation.Get().Charset()) + if err != nil { + env.vm.err = err + env.vm.sp -= offset + return 1 + } + if !replaced { + env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalRaw(input.bytes, sqltypes.Text, resultCollation(merged)) + } else { + 
env.vm.stack[env.vm.sp-offset-1] = env.vm.arena.newEvalRaw(b, sqltypes.Text, resultCollation(merged)) + } + env.vm.sp -= offset + return 1 + }, "FN REGEXP_REPLACE_SLOW VARCHAR(SP-2), VARCHAR(SP-1)") +} diff --git a/go/vt/vtgate/evalengine/compiler_test.go b/go/vt/vtgate/evalengine/compiler_test.go index 92ef9d3d465..1b5ace371c9 100644 --- a/go/vt/vtgate/evalengine/compiler_test.go +++ b/go/vt/vtgate/evalengine/compiler_test.go @@ -444,6 +444,10 @@ func TestCompilerSingle(t *testing.T) { expression: `INTERVAL(0, 0, 0, -1, NULL, NULL, 1)`, result: `INT64(5)`, }, + { + expression: `REGEXP_REPLACE(1234, 12, 6, 1)`, + result: `TEXT("634")`, + }, } for _, tc := range testCases { diff --git a/go/vt/vtgate/evalengine/expr_collate.go b/go/vt/vtgate/evalengine/expr_collate.go index 16fe8351880..2ba2e3dba61 100644 --- a/go/vt/vtgate/evalengine/expr_collate.go +++ b/go/vt/vtgate/evalengine/expr_collate.go @@ -54,6 +54,12 @@ var collationUtf8mb3 = collations.TypedCollation{ Repertoire: collations.RepertoireUnicode, } +var collationRegexpFallback = collations.TypedCollation{ + Collation: collations.CollationLatin1Swedish, + Coercibility: collations.CoerceCoercible, + Repertoire: collations.RepertoireASCII, +} + type ( CollateExpr struct { UnaryExpr @@ -152,16 +158,16 @@ func mergeCollations(c1, c2 collations.TypedCollation, t1, t2 sqltypes.Type) (co }) } -func mergeAndCoerceCollations(left, right eval) (eval, eval, collations.ID, error) { +func mergeAndCoerceCollations(left, right eval) (eval, eval, collations.TypedCollation, error) { lt := left.SQLType() rt := right.SQLType() mc, coerceLeft, coerceRight, err := mergeCollations(evalCollation(left), evalCollation(right), lt, rt) if err != nil { - return nil, nil, 0, err + return nil, nil, collations.TypedCollation{}, err } if coerceLeft == nil && coerceRight == nil { - return left, right, mc.Collation, nil + return left, right, mc, nil } left1 := newEvalRaw(lt, left.(*evalBytes).bytes, mc) @@ -170,16 +176,16 @@ func mergeAndCoerceCollations(left, right eval) (eval, eval, collations.ID, erro if coerceLeft != nil { left1.bytes, err = coerceLeft(nil, left1.bytes) if err != nil { - return nil, nil, 0, err + return nil, nil, collations.TypedCollation{}, err } } if coerceRight != nil { right1.bytes, err = coerceRight(nil, right1.bytes) if err != nil { - return nil, nil, 0, err + return nil, nil, collations.TypedCollation{}, err } } - return left1, right1, mc.Collation, nil + return left1, right1, mc, nil } type collationAggregation struct { diff --git a/go/vt/vtgate/evalengine/expr_compare.go b/go/vt/vtgate/evalengine/expr_compare.go index cef7493e026..3aca0cc1151 100644 --- a/go/vt/vtgate/evalengine/expr_compare.go +++ b/go/vt/vtgate/evalengine/expr_compare.go @@ -558,7 +558,7 @@ func (l *LikeExpr) eval(env *ExpressionEnv) (eval, error) { return nil, err } - var col collations.ID + var col collations.TypedCollation left, right, col, err = mergeAndCoerceCollations(left, right) if err != nil { return nil, err @@ -567,11 +567,11 @@ func (l *LikeExpr) eval(env *ExpressionEnv) (eval, error) { var matched bool switch { case typeIsTextual(left.SQLType()) && typeIsTextual(right.SQLType()): - matched = l.matchWildcard(left.(*evalBytes).bytes, right.(*evalBytes).bytes, col) + matched = l.matchWildcard(left.(*evalBytes).bytes, right.(*evalBytes).bytes, col.Collation) case typeIsTextual(right.SQLType()): - matched = l.matchWildcard(left.ToRawBytes(), right.(*evalBytes).bytes, col) + matched = l.matchWildcard(left.ToRawBytes(), right.(*evalBytes).bytes, col.Collation) 
case typeIsTextual(left.SQLType()): - matched = l.matchWildcard(left.(*evalBytes).bytes, right.ToRawBytes(), col) + matched = l.matchWildcard(left.(*evalBytes).bytes, right.ToRawBytes(), col.Collation) default: matched = l.matchWildcard(left.ToRawBytes(), right.ToRawBytes(), collations.CollationBinaryID) } diff --git a/go/vt/vtgate/evalengine/fn_regexp.go b/go/vt/vtgate/evalengine/fn_regexp.go new file mode 100644 index 00000000000..5886a5c3765 --- /dev/null +++ b/go/vt/vtgate/evalengine/fn_regexp.go @@ -0,0 +1,1062 @@ +/* +Copyright 2023 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package evalengine + +import ( + "errors" + "strings" + + "vitess.io/vitess/go/mysql/collations" + "vitess.io/vitess/go/mysql/collations/charset" + "vitess.io/vitess/go/mysql/icuregex" + icuerrors "vitess.io/vitess/go/mysql/icuregex/errors" + "vitess.io/vitess/go/sqltypes" + querypb "vitess.io/vitess/go/vt/proto/query" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" + "vitess.io/vitess/go/vt/vterrors" +) + +func regexpFlags(m eval, flags icuregex.RegexpFlag, f string) (icuregex.RegexpFlag, error) { + switch m := m.(type) { + case *evalBytes: + for _, b := range m.bytes { + switch b { + case 'c': + flags &= ^icuregex.CaseInsensitive + case 'i': + flags |= icuregex.CaseInsensitive + case 'm': + flags |= icuregex.Multiline + case 'n': + flags |= icuregex.DotAll + case 'u': + flags |= icuregex.UnixLines + default: + return flags, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongArguments, "Incorrect arguments to %s.", f) + } + } + default: + return flags, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongArguments, "Incorrect arguments to %s.", f) + } + + return flags, nil +} + +func occurrence(e *evalInt64, min int64) int64 { + if e.i < min { + return min + } + return e.i +} + +func returnOption(val *evalInt64, f string) (int64, error) { + switch val.i { + case 0, 1: + // Valid return options. 
+ return val.i, nil + } + return 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongArguments, "Incorrect arguments to %s: return_option must be 1 or 0.", f) +} + +func positionInstr(val *evalInt64, limit int64) (int64, error) { + pos := val.i + if pos < 1 || pos > limit { + return 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpIndexOutOfBounds, "Index out of bounds in regular expression search.") + } + return pos, nil +} + +func position(val *evalInt64, limit int64, f string) (int64, error) { + pos := val.i + if pos < 1 { + return 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.WrongParametersToNativeFct, "Incorrect parameters in the call to native function '%s'", f) + } + if pos-1 > limit { + return 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpIndexOutOfBounds, "Index out of bounds in regular expression search.") + } + return pos, nil +} + +func evalRegexpCollation(input, pat eval, f string) (eval, eval, collations.TypedCollation, icuregex.RegexpFlag, error) { + var typedCol collations.TypedCollation + var err error + + if inputBytes, ok := input.(*evalBytes); ok { + if patBytes, ok := pat.(*evalBytes); ok { + inputCol := inputBytes.col.Collation + patCol := patBytes.col.Collation + if (inputCol == collations.CollationBinaryID && patCol != collations.CollationBinaryID) || + (inputCol != collations.CollationBinaryID && patCol == collations.CollationBinaryID) { + inputColName := inputCol.Get().Name() + patColName := patCol.Get().Name() + return nil, nil, typedCol, 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.CharacterSetMismatch, "Character set '%s' cannot be used in conjunction with '%s' in call to %s.", inputColName, patColName, f) + } + } + } + + input, pat, typedCol, err = mergeAndCoerceCollations(input, pat) + if err != nil { + return nil, nil, collations.TypedCollation{}, 0, err + } + + var flags icuregex.RegexpFlag + var collation = typedCol.Collation.Get() + if strings.Contains(collation.Name(), "_ci") { + flags |= icuregex.CaseInsensitive + } + + return input, pat, typedCol, flags, nil +} + +func compileRegexpCollation(input, pat ctype, f string) (collations.TypedCollation, icuregex.RegexpFlag, error) { + var merged collations.TypedCollation + var err error + + if input.isTextual() && pat.isTextual() { + inputCol := input.Col.Collation + patCol := pat.Col.Collation + if (inputCol == collations.CollationBinaryID && patCol != collations.CollationBinaryID) || + (inputCol != collations.CollationBinaryID && patCol == collations.CollationBinaryID) { + inputColName := inputCol.Get().Name() + patColName := patCol.Get().Name() + return input.Col, 0, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.CharacterSetMismatch, "Character set '%s' cannot be used in conjunction with '%s' in call to %s.", inputColName, patColName, f) + } + } + + if input.Col.Collation != pat.Col.Collation { + merged, _, _, err = mergeCollations(input.Col, pat.Col, input.Type, pat.Type) + } else { + merged = input.Col + } + if err != nil { + return input.Col, 0, err + } + + var flags icuregex.RegexpFlag + var collation = merged.Collation.Get() + if strings.Contains(collation.Name(), "_ci") { + flags |= icuregex.CaseInsensitive + } + return merged, flags, nil +} + +func compileRegex(pat eval, c collations.Charset, flags icuregex.RegexpFlag) (*icuregex.Pattern, error) { + patRunes := charset.Expand(nil, pat.ToRawBytes(), c) + + if len(patRunes) == 0 { + return nil, vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, 
vterrors.RegexpIllegalArgument, "Illegal argument to a regular expression.") + } + + regexp, err := icuregex.Compile(patRunes, flags) + if err == nil { + return regexp, nil + } + + var compileErr *icuregex.CompileError + if errors.Is(err, icuerrors.ErrUnsupported) { + err = vterrors.NewErrorf(vtrpcpb.Code_UNIMPLEMENTED, vterrors.RegexpUnimplemented, err.Error()) + } else if errors.Is(err, icuerrors.ErrIllegalArgument) { + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpIllegalArgument, err.Error()) + } else if errors.As(err, &compileErr) { + switch compileErr.Code { + case icuregex.InternalError: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpInternal, compileErr.Error()) + case icuregex.RuleSyntax: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpRuleSyntax, compileErr.Error()) + case icuregex.BadEscapeSequence: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpBadEscapeSequence, compileErr.Error()) + case icuregex.PropertySyntax: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpRuleSyntax, compileErr.Error()) + case icuregex.Unimplemented: + err = vterrors.NewErrorf(vtrpcpb.Code_UNIMPLEMENTED, vterrors.RegexpUnimplemented, compileErr.Error()) + case icuregex.MismatchedParen: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpMismatchParen, compileErr.Error()) + case icuregex.BadInterval: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpBadInterval, compileErr.Error()) + case icuregex.MaxLtMin: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpMaxLtMin, compileErr.Error()) + case icuregex.InvalidBackRef: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpInvalidBackRef, compileErr.Error()) + case icuregex.InvalidFlag: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpInvalidFlag, compileErr.Error()) + case icuregex.LookBehindLimit: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpLookBehindLimit, compileErr.Error()) + case icuregex.MissingCloseBracket: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpMissingCloseBracket, compileErr.Error()) + case icuregex.InvalidRange: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpInvalidRange, compileErr.Error()) + case icuregex.PatternTooBig: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpPatternTooBig, compileErr.Error()) + case icuregex.InvalidCaptureGroupName: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpInvalidCaptureGroup, compileErr.Error()) + default: + err = vterrors.NewErrorf(vtrpcpb.Code_INVALID_ARGUMENT, vterrors.RegexpInternal, compileErr.Error()) + } + } + + return nil, err +} + +func compileConstantRegex(c *compiler, args TupleExpr, pat, mt int, cs collations.TypedCollation, flags icuregex.RegexpFlag, f string) (*icuregex.Pattern, error) { + pattern := args[pat] + if !pattern.constant() { + return nil, c.unsupported(pattern) + } + var err error + staticEnv := EmptyExpressionEnv() + pattern, err = simplifyExpr(staticEnv, pattern) + if err != nil { + return nil, err + } + + if len(args) > mt { + fl := args[mt] + if !fl.constant() { + return nil, c.unsupported(fl) + } + fl, err = simplifyExpr(staticEnv, fl) + if err != nil { + return nil, err + } + flags, err = regexpFlags(fl.(*Literal).inner, flags, f) + if err != nil { + return nil, err + } + } + + if pattern.(*Literal).inner == nil 
{ + return nil, c.unsupported(pattern) + } + + innerPat, err := evalToVarchar(pattern.(*Literal).inner, cs.Collation, true) + if err != nil { + return nil, err + } + + return compileRegex(innerPat, cs.Collation.Get().Charset(), flags) +} + +// resultCollation returns the collation to use for the result of a regexp. +// This falls back to latin1_swedish if the input collation is binary. This +// seems to be a side effect of how MySQL also works. Probably due to how it +// is using ICU and converting there. +func resultCollation(in collations.TypedCollation) collations.TypedCollation { + if in.Collation == collationBinary.Collation { + return collationRegexpFallback + } + return in +} + +type builtinRegexpLike struct { + CallExpr + Negate bool +} + +func (r *builtinRegexpLike) eval(env *ExpressionEnv) (eval, error) { + input, err := r.Arguments[0].eval(env) + if err != nil || input == nil { + return nil, err + } + + pat, err := r.Arguments[1].eval(env) + if err != nil || pat == nil { + return nil, err + } + + input, pat, typedCol, flags, err := evalRegexpCollation(input, pat, "regexp_like") + if err != nil { + return nil, err + } + collation := typedCol.Collation.Get() + + if len(r.Arguments) > 2 { + m, err := r.Arguments[2].eval(env) + if err != nil || m == nil { + return nil, err + } + flags, err = regexpFlags(m, flags, "regexp_like") + if err != nil { + return nil, err + } + } + + regexp, err := compileRegex(pat, collation.Charset(), flags) + if err != nil { + return nil, err + } + + inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) + m := icuregex.NewMatcher(regexp) + m.Reset(inputRunes) + + ok, err := m.Find() + if err != nil { + return nil, err + } + if r.Negate { + ok = !ok + } + return newEvalBool(ok), nil +} + +func (r *builtinRegexpLike) typeof(env *ExpressionEnv, fields []*querypb.Field) (sqltypes.Type, typeFlag) { + _, f1 := r.Arguments[0].typeof(env, fields) + _, f2 := r.Arguments[1].typeof(env, fields) + var f3 typeFlag + if len(r.Arguments) > 2 { + _, f3 = r.Arguments[2].typeof(env, fields) + } + return sqltypes.Int64, f1 | f2 | f3 | flagIsBoolean +} + +func (r *builtinRegexpLike) compileSlow(c *compiler, input, pat, fl ctype, merged collations.TypedCollation, flags icuregex.RegexpFlag, skips ...*jump) (ctype, error) { + if !pat.isTextual() || pat.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments)-1, sqltypes.VarChar, merged.Collation) + } + + c.asm.Fn_REGEXP_LIKE_slow(r.Negate, merged.Collation.Get().Charset(), flags, len(r.Arguments)-1) + c.asm.jumpDestination(skips...) 
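The match_type handling in regexpFlags earlier in this file is shared by all four REGEXP_* functions: the flag characters are folded in left to right, so a later 'c' cancels an earlier 'i' and vice versa. A minimal stand-alone sketch of that folding, restricted to 'c'/'i' and using a hypothetical bitmask type in place of the icuregex flag constants:

package main

import "fmt"

// flag is a hypothetical stand-in for icuregex.RegexpFlag.
type flag uint32

const caseInsensitive flag = 1 << 0

// applyMatchType mirrors the left-to-right folding done by regexpFlags,
// restricted here to the 'c' and 'i' characters.
func applyMatchType(f flag, matchType string) (flag, error) {
	for _, ch := range matchType {
		switch ch {
		case 'c':
			f &^= caseInsensitive
		case 'i':
			f |= caseInsensitive
		default:
			return f, fmt.Errorf("incorrect arguments: unknown match_type %q", ch)
		}
	}
	return f, nil
}

func main() {
	f, _ := applyMatchType(0, "ic")
	fmt.Println(f&caseInsensitive != 0) // false: the later 'c' wins over 'i'
}

The real regexpFlags additionally accepts 'm', 'n' and 'u' and reports MySQL's "Incorrect arguments" error for anything else.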
+ return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | fl.Flag | flagIsBoolean}, nil +} + +func (r *builtinRegexpLike) compile(c *compiler) (ctype, error) { + input, err := r.Arguments[0].compile(c) + if err != nil { + return ctype{}, err + } + var skips []*jump + skips = append(skips, c.compileNullCheckArg(input, 0)) + + pat, err := r.Arguments[1].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(pat, 1)) + + var f ctype + + if len(r.Arguments) > 2 { + f, err = r.Arguments[2].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(f, 2)) + } + + merged, flags, err := compileRegexpCollation(input, pat, "regexp_like") + if err != nil { + return ctype{}, err + } + + if !input.isTextual() || input.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments), sqltypes.VarChar, merged.Collation) + } + + // We optimize for the case where the pattern is a constant. If not, + // we fall back to the slow path. + p, err := compileConstantRegex(c, r.Arguments, 1, 2, merged, flags, "regexp_like") + if err != nil { + return r.compileSlow(c, input, pat, f, merged, flags, skips...) + } + + c.asm.Fn_REGEXP_LIKE(icuregex.NewMatcher(p), r.Negate, merged.Collation.Get().Charset(), len(r.Arguments)-1) + c.asm.jumpDestination(skips...) + + return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | f.Flag | flagIsBoolean}, nil +} + +var _ Expr = (*builtinRegexpLike)(nil) + +type builtinRegexpInstr struct { + CallExpr +} + +func (r *builtinRegexpInstr) eval(env *ExpressionEnv) (eval, error) { + input, err := r.Arguments[0].eval(env) + if err != nil || input == nil { + return nil, err + } + + pat, err := r.Arguments[1].eval(env) + if err != nil || pat == nil { + return nil, err + } + + input, pat, typedCol, flags, err := evalRegexpCollation(input, pat, "regexp_instr") + if err != nil { + return nil, err + } + + var posExpr eval + if len(r.Arguments) > 2 { + posExpr, err = r.Arguments[2].eval(env) + if err != nil || posExpr == nil { + return nil, err + } + } + + var occExpr eval + if len(r.Arguments) > 3 { + occExpr, err = r.Arguments[3].eval(env) + if err != nil || occExpr == nil { + return nil, err + } + } + + var retExpr eval + if len(r.Arguments) > 4 { + retExpr, err = r.Arguments[4].eval(env) + if err != nil || retExpr == nil { + return nil, err + } + } + + var mtExpr eval + if len(r.Arguments) > 5 { + mtExpr, err = r.Arguments[5].eval(env) + if err != nil || mtExpr == nil { + return nil, err + } + } + + collation := typedCol.Collation.Get() + + pos := int64(1) + occ := int64(1) + returnOpt := int64(0) + + if mtExpr != nil { + flags, err = regexpFlags(mtExpr, flags, "regexp_instr") + if err != nil { + return nil, err + } + } + + regexp, err := compileRegex(pat, collation.Charset(), flags) + if err != nil { + return nil, err + } + + inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) + if len(inputRunes) == 0 { + return newEvalInt64(0), nil + } + + if posExpr != nil { + pos, err = positionInstr(evalToInt64(posExpr), int64(len(inputRunes))) + if err != nil { + return nil, err + } + } + + if occExpr != nil { + occ = occurrence(evalToInt64(occExpr), occ) + } + + if retExpr != nil { + returnOpt, err = returnOption(evalToInt64(retExpr), "regexp_instr") + if err != nil { + return nil, err + } + } + + m := icuregex.NewMatcher(regexp) + m.Reset(inputRunes[pos-1:]) + + found := false + for i := int64(0); i < occ; i++ { + found, 
err = m.Find() + if err != nil { + return nil, err + } + if !found { + break + } + } + if !found { + return newEvalInt64(0), nil + } + if returnOpt == 0 { + return newEvalInt64(int64(m.Start()) + pos), nil + } + return newEvalInt64(int64(m.End()) + pos), nil +} + +func (r *builtinRegexpInstr) typeof(env *ExpressionEnv, fields []*querypb.Field) (sqltypes.Type, typeFlag) { + _, f1 := r.Arguments[0].typeof(env, fields) + _, f2 := r.Arguments[1].typeof(env, fields) + var f3, f4, f5, f6 typeFlag + if len(r.Arguments) > 2 { + _, f3 = r.Arguments[2].typeof(env, fields) + } + if len(r.Arguments) > 3 { + _, f4 = r.Arguments[3].typeof(env, fields) + } + if len(r.Arguments) > 4 { + _, f5 = r.Arguments[4].typeof(env, fields) + } + if len(r.Arguments) > 5 { + _, f6 = r.Arguments[5].typeof(env, fields) + } + return sqltypes.Int64, f1 | f2 | f3 | f4 | f5 | f6 +} + +func (r *builtinRegexpInstr) compileSlow(c *compiler, input, pat, pos, occ, returnOption, matchType ctype, merged collations.TypedCollation, flags icuregex.RegexpFlag, skips ...*jump) (ctype, error) { + if !pat.isTextual() || pat.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments)-1, sqltypes.VarChar, merged.Collation) + } + + c.asm.Fn_REGEXP_INSTR_slow(merged.Collation.Get().Charset(), flags, len(r.Arguments)-1) + c.asm.jumpDestination(skips...) + return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | pos.Flag | occ.Flag | returnOption.Flag | matchType.Flag}, nil +} + +func (r *builtinRegexpInstr) compile(c *compiler) (ctype, error) { + input, err := r.Arguments[0].compile(c) + if err != nil { + return ctype{}, err + } + var skips []*jump + skips = append(skips, c.compileNullCheckArg(input, 0)) + + pat, err := r.Arguments[1].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(pat, 1)) + + var pos ctype + if len(r.Arguments) > 2 { + pos, err = r.Arguments[2].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(pos, 2)) + _ = c.compileToInt64(pos, 1) + } + + var occ ctype + if len(r.Arguments) > 3 { + occ, err = r.Arguments[3].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(occ, 3)) + _ = c.compileToInt64(occ, 1) + } + + var returnOpt ctype + if len(r.Arguments) > 4 { + returnOpt, err = r.Arguments[4].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(returnOpt, 4)) + _ = c.compileToInt64(returnOpt, 1) + } + + var matchType ctype + if len(r.Arguments) > 5 { + matchType, err = r.Arguments[5].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(matchType, 5)) + switch { + case matchType.isTextual(): + default: + c.asm.Convert_xb(1, sqltypes.VarBinary, 0, false) + } + } + + merged, flags, err := compileRegexpCollation(input, pat, "regexp_instr") + if err != nil { + return ctype{}, err + } + + if !input.isTextual() || input.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments), sqltypes.VarChar, merged.Collation) + } + + // We optimize for the case where the pattern is a constant. If not, + // we fall back to the slow path. + p, err := compileConstantRegex(c, r.Arguments, 1, 5, merged, flags, "regexp_instr") + if err != nil { + return r.compileSlow(c, input, pat, pos, occ, returnOpt, matchType, merged, flags, skips...) 
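The "we optimize for the case where the pattern is a constant" comment above marks the central performance decision in these compile methods: when the pattern (and match_type) are constant expressions, compileConstantRegex compiles them once and the emitted instruction carries a ready icuregex.Matcher, while non-constant patterns fall back to the *_slow variants that recompile the pattern for every evaluated row. A rough stand-alone sketch of why that matters, using the standard library's regexp package purely as a stand-in for icuregex:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	rows := []string{"Michael!", "abc def ghi", "dog cat dog"}

	// Fast path: a constant pattern is compiled once, outside the per-row loop.
	re := regexp.MustCompile(`^[a-z ]+$`)
	for _, row := range rows {
		fmt.Println(row, re.MatchString(row))
	}

	// Slow path: the pattern is only known per row, so it has to be compiled
	// (and error-checked) inside the loop, once for every evaluation.
	patterns := []string{`\w+!`, `def`, `cat|dog`}
	for i, row := range rows {
		re, err := regexp.Compile(patterns[i])
		if err != nil {
			fmt.Println("bad pattern:", err)
			continue
		}
		fmt.Println(row, re.MatchString(row))
	}
}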
+ } + + c.asm.Fn_REGEXP_INSTR(icuregex.NewMatcher(p), merged.Collation.Get().Charset(), len(r.Arguments)-1) + c.asm.jumpDestination(skips...) + + return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | flagIsBoolean}, nil +} + +var _ Expr = (*builtinRegexpInstr)(nil) + +type builtinRegexpSubstr struct { + CallExpr +} + +func (r *builtinRegexpSubstr) eval(env *ExpressionEnv) (eval, error) { + input, err := r.Arguments[0].eval(env) + if err != nil || input == nil { + return nil, err + } + + pat, err := r.Arguments[1].eval(env) + if err != nil || pat == nil { + return nil, err + } + + input, pat, typedCol, flags, err := evalRegexpCollation(input, pat, "regexp_substr") + if err != nil { + return nil, err + } + + var posExpr eval + // For some reason this gets checked before NULL checks of the other values + if len(r.Arguments) > 2 { + posExpr, err = r.Arguments[2].eval(env) + if err != nil || posExpr == nil { + return nil, err + } + } + + var occExpr eval + if len(r.Arguments) > 3 { + occExpr, err = r.Arguments[3].eval(env) + if err != nil || occExpr == nil { + return nil, err + } + } + + var mtExpr eval + if len(r.Arguments) > 4 { + mtExpr, err = r.Arguments[4].eval(env) + if err != nil || mtExpr == nil { + return nil, err + } + } + + collation := typedCol.Collation.Get() + pos := int64(1) + occ := int64(1) + inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) + + if posExpr != nil { + pos, err = position(evalToInt64(posExpr), int64(len(inputRunes)), "regexp_substr") + if err != nil { + return nil, err + } + + } + + if occExpr != nil { + occ = occurrence(evalToInt64(occExpr), occ) + } + + if mtExpr != nil { + flags, err = regexpFlags(mtExpr, flags, "regexp_substr") + if err != nil { + return nil, err + } + } + + regexp, err := compileRegex(pat, collation.Charset(), flags) + if err != nil { + return nil, err + } + + m := icuregex.NewMatcher(regexp) + m.Reset(inputRunes[pos-1:]) + + found := false + for i := int64(0); i < occ; i++ { + found, err = m.Find() + if err != nil { + return nil, err + } + if !found { + break + } + } + if !found { + return nil, nil + } + out := inputRunes[int64(m.Start())+pos-1 : int64(m.End())+pos-1] + b := charset.Collapse(nil, out, collation.Charset()) + return newEvalText(b, resultCollation(typedCol)), nil +} + +func (r *builtinRegexpSubstr) typeof(env *ExpressionEnv, fields []*querypb.Field) (sqltypes.Type, typeFlag) { + _, f1 := r.Arguments[0].typeof(env, fields) + _, f2 := r.Arguments[1].typeof(env, fields) + var f3, f4, f5 typeFlag + if len(r.Arguments) > 2 { + _, f3 = r.Arguments[2].typeof(env, fields) + } + if len(r.Arguments) > 3 { + _, f4 = r.Arguments[3].typeof(env, fields) + } + if len(r.Arguments) > 4 { + _, f5 = r.Arguments[4].typeof(env, fields) + } + return sqltypes.VarChar, f1 | f2 | f3 | f4 | f5 +} + +func (r *builtinRegexpSubstr) compileSlow(c *compiler, input, pat, pos, occ, matchType ctype, merged collations.TypedCollation, flags icuregex.RegexpFlag, skips ...*jump) (ctype, error) { + if !pat.isTextual() || pat.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments)-1, sqltypes.VarChar, merged.Collation) + } + + c.asm.Fn_REGEXP_SUBSTR_slow(merged, flags, len(r.Arguments)-1) + c.asm.jumpDestination(skips...) 
+ return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | pos.Flag | occ.Flag | matchType.Flag}, nil +} + +func (r *builtinRegexpSubstr) compile(c *compiler) (ctype, error) { + input, err := r.Arguments[0].compile(c) + if err != nil { + return ctype{}, err + } + var skips []*jump + skips = append(skips, c.compileNullCheckArg(input, 0)) + + pat, err := r.Arguments[1].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(pat, 1)) + + var pos ctype + if len(r.Arguments) > 2 { + pos, err = r.Arguments[2].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(pos, 2)) + _ = c.compileToInt64(pos, 1) + } + + var occ ctype + if len(r.Arguments) > 3 { + occ, err = r.Arguments[3].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(occ, 3)) + _ = c.compileToInt64(occ, 1) + } + + var matchType ctype + if len(r.Arguments) > 4 { + matchType, err = r.Arguments[4].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(matchType, 4)) + switch { + case matchType.isTextual(): + default: + c.asm.Convert_xb(1, sqltypes.VarBinary, 0, false) + } + } + + merged, flags, err := compileRegexpCollation(input, pat, "regexp_substr") + if err != nil { + return ctype{}, err + } + + if !input.isTextual() || input.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments), sqltypes.VarChar, merged.Collation) + } + + // We optimize for the case where the pattern is a constant. If not, + // we fall back to the slow path. + p, err := compileConstantRegex(c, r.Arguments, 1, 4, merged, flags, "regexp_substr") + if err != nil { + return r.compileSlow(c, input, pat, pos, occ, matchType, merged, flags, skips...) + } + + c.asm.Fn_REGEXP_SUBSTR(icuregex.NewMatcher(p), merged, len(r.Arguments)-1) + c.asm.jumpDestination(skips...) + + return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | pos.Flag | occ.Flag | matchType.Flag}, nil +} + +var _ Expr = (*builtinRegexpSubstr)(nil) + +type builtinRegexpReplace struct { + CallExpr +} + +func regexpReplace(m *icuregex.Matcher, inputRunes, replRunes []rune, pos, occ int64, c collations.Charset) ([]byte, bool, error) { + var err error + found := false + if occ > 0 { + for i := int64(0); i < occ; i++ { + found, err = m.Find() + if err != nil { + return nil, false, err + } + if !found { + break + } + } + if !found { + return nil, false, nil + } + + out := append(inputRunes[:int64(m.Start())+pos-1], replRunes...) + out = append(out, inputRunes[int64(m.End())+pos-1:]...) + return charset.Collapse(nil, out, c), true, nil + } + + found, err = m.Find() + if err != nil { + return nil, false, err + } + + if !found { + return nil, false, nil + } + + start := int64(m.Start()) + pos - 1 + out := append(inputRunes[:start], replRunes...) + end := int64(m.End()) + pos - 1 + for { + found, err = m.Find() + if err != nil { + return nil, false, err + } + if !found { + break + } + nextStart := int64(m.Start()) + pos - 1 + out = append(out, inputRunes[end:nextStart]...) + out = append(out, replRunes...) + end = int64(m.End()) + pos - 1 + } + + out = append(out, inputRunes[end:]...) 
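Throughout these evaluators the matcher is Reset() on runes[pos-1:], so every Start()/End() offset it reports is relative to that window: the code shifts them back by pos-1 to slice the original runes (REGEXP_SUBSTR, REGEXP_REPLACE) or by pos to produce the 1-based index REGEXP_INSTR returns. A small sketch of that arithmetic on plain rune slices, with the match offsets simply assumed rather than produced by a matcher:

package main

import "fmt"

func main() {
	input := []rune("dog cat dog")
	pos := 5 // 1-based start position, as in REGEXP_INSTR(str, pat, pos)

	// The matcher only sees input[pos-1:] ("cat dog"); assume it reports a
	// match at rune offsets [4, 7) within that window (the second "dog").
	start, end := 4, 7

	// Shift back by pos-1 to slice the original runes again.
	fmt.Println(string(input[start+pos-1 : end+pos-1])) // "dog"

	// REGEXP_INSTR-style answer: 1-based, so Start() + pos.
	fmt.Println(start + pos) // 9
}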
+ return charset.Collapse(nil, out, c), true, nil +} + +func (r *builtinRegexpReplace) eval(env *ExpressionEnv) (eval, error) { + input, err := r.Arguments[0].eval(env) + if err != nil || input == nil { + return nil, err + } + + pat, err := r.Arguments[1].eval(env) + if err != nil || pat == nil { + return nil, err + } + + replArg, err := r.Arguments[2].eval(env) + if err != nil || replArg == nil { + return nil, err + } + + input, pat, typedCol, flags, err := evalRegexpCollation(input, pat, "regexp_replace") + if err != nil { + return nil, err + } + + var posExpr eval + // For some reason this gets checked before NULL checks of the other values + if len(r.Arguments) > 3 { + posExpr, err = r.Arguments[3].eval(env) + if err != nil || posExpr == nil { + return nil, err + } + } + + var occExpr eval + if len(r.Arguments) > 4 { + occExpr, err = r.Arguments[4].eval(env) + if err != nil || occExpr == nil { + return nil, err + } + } + + var mtExpr eval + if len(r.Arguments) > 5 { + mtExpr, err = r.Arguments[5].eval(env) + if err != nil || mtExpr == nil { + return nil, err + } + } + + collation := typedCol.Collation.Get() + + repl, ok := replArg.(*evalBytes) + if !ok { + repl, err = evalToVarchar(replArg, typedCol.Collation, true) + if err != nil { + return nil, err + } + } + pos := int64(1) + occ := int64(0) + inputRunes := charset.Expand(nil, input.ToRawBytes(), collation.Charset()) + replRunes := charset.Expand(nil, repl.ToRawBytes(), repl.col.Collation.Get().Charset()) + + if posExpr != nil { + pos, err = position(evalToInt64(posExpr), int64(len(inputRunes)), "regexp_replace") + if err != nil { + return nil, err + } + } + + if occExpr != nil { + occ = occurrence(evalToInt64(occExpr), occ) + } + + if mtExpr != nil { + flags, err = regexpFlags(mtExpr, flags, "regexp_replace") + if err != nil { + return nil, err + } + } + + regexp, err := compileRegex(pat, collation.Charset(), flags) + if err != nil { + return nil, err + } + + m := icuregex.NewMatcher(regexp) + m.Reset(inputRunes[pos-1:]) + + bytes, replaced, err := regexpReplace(m, inputRunes, replRunes, pos, occ, collation.Charset()) + if err != nil { + return nil, err + } + if !replaced { + return newEvalRaw(sqltypes.Text, input.ToRawBytes(), resultCollation(typedCol)), nil + } + return newEvalRaw(sqltypes.Text, bytes, resultCollation(typedCol)), nil +} + +func (r *builtinRegexpReplace) typeof(env *ExpressionEnv, fields []*querypb.Field) (sqltypes.Type, typeFlag) { + _, f1 := r.Arguments[0].typeof(env, fields) + _, f2 := r.Arguments[1].typeof(env, fields) + _, f3 := r.Arguments[2].typeof(env, fields) + var f4, f5, f6 typeFlag + if len(r.Arguments) > 3 { + _, f4 = r.Arguments[3].typeof(env, fields) + } + if len(r.Arguments) > 4 { + _, f5 = r.Arguments[4].typeof(env, fields) + } + if len(r.Arguments) > 5 { + _, f6 = r.Arguments[5].typeof(env, fields) + } + return sqltypes.Text, f1 | f2 | f3 | f4 | f5 | f6 +} + +func (r *builtinRegexpReplace) compileSlow(c *compiler, input, pat, repl, pos, occ, matchType ctype, merged collations.TypedCollation, flags icuregex.RegexpFlag, skips ...*jump) (ctype, error) { + if !pat.isTextual() || pat.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments)-1, sqltypes.VarChar, merged.Collation) + } + + c.asm.Fn_REGEXP_REPLACE_slow(merged, flags, len(r.Arguments)-1) + c.asm.jumpDestination(skips...) 
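regexpReplace above implements MySQL's two occurrence modes: occ == 0 (the default for REGEXP_REPLACE) replaces every match, while occ > 0 replaces only that single occurrence and leaves the input untouched when there are fewer matches than requested. A stand-alone sketch of those semantics, with the standard library's regexp as a stand-in for icuregex and a hypothetical replaceOccurrence helper:

package main

import (
	"fmt"
	"regexp"
)

// replaceOccurrence mirrors the occ handling in regexpReplace:
// occ == 0 replaces every match, occ == n only the n-th.
func replaceOccurrence(re *regexp.Regexp, s, repl string, occ int) string {
	if occ == 0 {
		return re.ReplaceAllString(s, repl)
	}
	locs := re.FindAllStringIndex(s, -1)
	if occ > len(locs) {
		return s // fewer matches than requested: input stays untouched
	}
	m := locs[occ-1]
	return s[:m[0]] + repl + s[m[1]:]
}

func main() {
	re := regexp.MustCompile(`[a-z]+`)
	fmt.Println(replaceOccurrence(re, "abc def ghi", "X", 0)) // "X X X"
	fmt.Println(replaceOccurrence(re, "abc def ghi", "X", 3)) // "abc def X"
}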
+ return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | repl.Flag | pos.Flag | occ.Flag | matchType.Flag}, nil +} + +func (r *builtinRegexpReplace) compile(c *compiler) (ctype, error) { + input, err := r.Arguments[0].compile(c) + if err != nil { + return ctype{}, err + } + var skips []*jump + skips = append(skips, c.compileNullCheckArg(input, 0)) + + pat, err := r.Arguments[1].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(pat, 1)) + + repl, err := r.Arguments[2].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(repl, 2)) + + var pos ctype + if len(r.Arguments) > 3 { + pos, err = r.Arguments[3].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(pos, 3)) + _ = c.compileToInt64(pos, 1) + } + + var occ ctype + if len(r.Arguments) > 4 { + occ, err = r.Arguments[4].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(occ, 4)) + _ = c.compileToInt64(occ, 1) + } + + var matchType ctype + if len(r.Arguments) > 5 { + matchType, err = r.Arguments[5].compile(c) + if err != nil { + return ctype{}, err + } + skips = append(skips, c.compileNullCheckArg(matchType, 5)) + switch { + case matchType.isTextual(): + default: + c.asm.Convert_xb(1, sqltypes.VarBinary, 0, false) + } + } + + merged, flags, err := compileRegexpCollation(input, pat, "regexp_replace") + if err != nil { + return ctype{}, err + } + + if !input.isTextual() || input.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments), sqltypes.VarChar, merged.Collation) + } + + if !repl.isTextual() || repl.Col.Collation != merged.Collation { + c.asm.Convert_xce(len(r.Arguments)-2, sqltypes.VarChar, merged.Collation) + } + + // We optimize for the case where the pattern is a constant. If not, + // we fall back to the slow path. + p, err := compileConstantRegex(c, r.Arguments, 1, 5, merged, flags, "regexp_replace") + if err != nil { + return r.compileSlow(c, input, pat, repl, pos, occ, matchType, merged, flags, skips...) + } + + c.asm.Fn_REGEXP_REPLACE(icuregex.NewMatcher(p), merged, len(r.Arguments)-1) + c.asm.jumpDestination(skips...) 
+ + return ctype{Type: sqltypes.Int64, Col: collationNumeric, Flag: input.Flag | pat.Flag | repl.Flag | pos.Flag | occ.Flag | matchType.Flag}, nil +} + +var _ Expr = (*builtinRegexpReplace)(nil) diff --git a/go/vt/vtgate/evalengine/integration/fuzz_test.go b/go/vt/vtgate/evalengine/integration/fuzz_test.go index 24cd2733fd4..563bb323244 100644 --- a/go/vt/vtgate/evalengine/integration/fuzz_test.go +++ b/go/vt/vtgate/evalengine/integration/fuzz_test.go @@ -98,6 +98,11 @@ var ( regexp.MustCompile(`Invalid JSON text in argument (\d+) to function (\w+): (.*?)`), regexp.MustCompile(`Illegal mix of collations`), regexp.MustCompile(`Incorrect (DATE|DATETIME) value`), + regexp.MustCompile(`Syntax error in regular expression`), + regexp.MustCompile(`The regular expression contains an unclosed bracket expression`), + regexp.MustCompile(`Illegal argument to a regular expression`), + regexp.MustCompile(`Incorrect arguments to regexp_substr`), + regexp.MustCompile(`Incorrect arguments to regexp_replace`), } ) diff --git a/go/vt/vtgate/evalengine/mysql_test.go b/go/vt/vtgate/evalengine/mysql_test.go index 18802cfb8dc..987ad906b88 100644 --- a/go/vt/vtgate/evalengine/mysql_test.go +++ b/go/vt/vtgate/evalengine/mysql_test.go @@ -147,6 +147,6 @@ func TestMySQLGolden(t *testing.T) { func TestDebug1(t *testing.T) { // Debug - eval, err := testSingle(t, `SELECT DATE_SUB(TIMESTAMP'2025-01-01 00:00:00', INTERVAL '1.999999' year_month)`) + eval, err := testSingle(t, `SELECT _latin1 0xFF regexp _latin1 '[[:lower:]]' COLLATE latin1_bin`) t.Logf("eval=%s err=%v coll=%s", eval.String(), err, eval.Collation().Get().Name()) } diff --git a/go/vt/vtgate/evalengine/testcases/cases.go b/go/vt/vtgate/evalengine/testcases/cases.go index b72c5dae816..d6e692b1a99 100644 --- a/go/vt/vtgate/evalengine/testcases/cases.go +++ b/go/vt/vtgate/evalengine/testcases/cases.go @@ -151,6 +151,10 @@ var Cases = []TestCase{ {Run: FnUUID}, {Run: FnUUIDToBin}, {Run: DateMath}, + {Run: RegexpLike}, + {Run: RegexpInstr}, + {Run: RegexpSubstr}, + {Run: RegexpReplace}, } func JSONPathOperations(yield Query) { @@ -1898,3 +1902,287 @@ func DateMath(yield Query) { } } } + +func RegexpLike(yield Query) { + mysqlDocSamples := []string{ + `'Michael!' REGEXP '.*'`, + `'Michael!' RLIKE '.*'`, + `'Michael!' NOT REGEXP '.*'`, + `'Michael!' 
NOT RLIKE '.*'`, + `'new*\n*line' REGEXP 'new\\*.\\*line'`, + `'a' REGEXP '^[a-d]'`, + `REGEXP_LIKE('CamelCase', 'CAMELCASE')`, + `REGEXP_LIKE('CamelCase', 'CAMELCASE' COLLATE utf8mb4_0900_as_cs)`, + `REGEXP_LIKE('abc', 'ABC'`, + `REGEXP_LIKE('abc', 'ABC', 'c')`, + `REGEXP_LIKE(1234, 12)`, + `REGEXP_LIKE(1234, 12, 'c')`, + `' ' REGEXP '[[:blank:]]'`, + `'\t' REGEXP '[[:blank:]]'`, + `' ' REGEXP '[[:space:]]'`, + `'\t' REGEXP '[[:space:]]'`, + `_latin1 0xFF regexp _latin1 '[[:lower:]]' COLLATE latin1_bin`, + `_koi8r 0xFF regexp _koi8r '[[:lower:]]' COLLATE koi8r_bin`, + `_latin1 0xFF regexp _latin1 '[[:upper:]]' COLLATE latin1_bin`, + `_koi8r 0xFF regexp _koi8r '[[:upper:]]' COLLATE koi8r_bin`, + `_latin1 0xF7 regexp _latin1 '[[:alpha:]]'`, + `_koi8r 0xF7 regexp _koi8r '[[:alpha:]]'`, + `_latin1'a' regexp _latin1'A' collate latin1_general_ci`, + `_latin1'a' regexp _latin1'A' collate latin1_bin`, + + `_latin1 'ÿ' regexp _utf8mb4 'ÿ'`, + `_utf8mb4 'ÿ' regexp _latin1 'ÿ'`, + `convert('ÿ' as char character set latin1) regexp _utf8mb4 'ÿ'`, + `_utf8mb4 'ÿ' regexp convert('ÿ' as char character set latin1)`, + + `'a' regexp '\\p{alphabetic}'`, + `'a' regexp '\\P{alphabetic}'`, + `'👌🏾regexp '\\p{Emoji}\\p{Emoji_modifier}'`, + `'a' regexp '\\p{Lowercase_letter}'`, + `'a' regexp '\\p{Uppercase_letter}'`, + `'A' regexp '\\p{Lowercase_letter}'`, + `'A' regexp '\\p{Uppercase_letter}'`, + `'a' collate utf8mb4_0900_as_cs regexp '\\p{Lowercase_letter}'`, + `'A' collate utf8mb4_0900_as_cs regexp '\\p{Lowercase_letter}'`, + `'a' collate utf8mb4_0900_as_cs regexp '\\p{Uppercase_letter}'`, + `'A' collate utf8mb4_0900_as_cs regexp '\\p{Uppercase_letter}'`, + `0xff REGEXP 0xff`, + `0xff REGEXP 0xfe`, + `cast(time '12:34:58' as json) REGEXP 0xff`, + } + + for _, q := range mysqlDocSamples { + yield(q, nil) + } + + for _, i := range regexInputs { + for _, p := range regexInputs { + yield(fmt.Sprintf("%s REGEXP %s", i, p), nil) + yield(fmt.Sprintf("%s NOT REGEXP %s", i, p), nil) + for _, m := range regexMatchStrings { + yield(fmt.Sprintf("REGEXP_LIKE(%s, %s, %s)", i, p, m), nil) + } + } + } +} + +func RegexpInstr(yield Query) { + mysqlDocSamples := []string{ + `REGEXP_INSTR('Michael!', '.*')`, + `REGEXP_INSTR('new*\n*line', 'new\\*.\\*line')`, + `REGEXP_INSTR('a', '^[a-d]')`, + `REGEXP_INSTR('CamelCase', 'CAMELCASE')`, + `REGEXP_INSTR('CamelCase', 'CAMELCASE' COLLATE utf8mb4_0900_as_cs)`, + `REGEXP_INSTR('abc', 'ABC'`, + `REGEXP_INSTR('abc', 'ABC', 'c')`, + `REGEXP_INSTR('0', '0', 1, 0)`, + `REGEXP_INSTR(' ', '[[:blank:]]')`, + `REGEXP_INSTR('\t', '[[:blank:]]')`, + `REGEXP_INSTR(' ', '[[:space:]]')`, + `REGEXP_INSTR('\t', '[[:space:]]')`, + `REGEXP_INSTR(_latin1 0xFF, _latin1 '[[:lower:]]' COLLATE latin1_bin)`, + `REGEXP_INSTR(_koi8r 0xFF, _koi8r '[[:lower:]]' COLLATE koi8r_bin)`, + `REGEXP_INSTR(_latin1 0xFF, _latin1 '[[:upper:]]' COLLATE latin1_bin)`, + `REGEXP_INSTR(_koi8r 0xFF, _koi8r '[[:upper:]]' COLLATE koi8r_bin)`, + `REGEXP_INSTR(_latin1 0xF7, _latin1 '[[:alpha:]]')`, + `REGEXP_INSTR(_koi8r 0xF7, _koi8r '[[:alpha:]]')`, + `REGEXP_INSTR(_latin1'a', _latin1'A' collate latin1_general_ci)`, + `REGEXP_INSTR(_latin1'a', _latin1'A' collate latin1_bin)`, + `REGEXP_INSTR('a', '\\p{alphabetic}')`, + `REGEXP_INSTR('a', '\\P{alphabetic}')`, + `REGEXP_INSTR('👌🏾, '\\p{Emoji}\\p{Emoji_modifier}')`, + `REGEXP_INSTR('a', '\\p{Lowercase_letter}')`, + `REGEXP_INSTR('a', '\\p{Uppercase_letter}')`, + `REGEXP_INSTR('A', '\\p{Lowercase_letter}')`, + `REGEXP_INSTR('A', '\\p{Uppercase_letter}')`, + `REGEXP_INSTR('a', collate 
utf8mb4_0900_as_cs, '\\p{Lowercase_letter}')`, + `REGEXP_INSTR('A' collate utf8mb4_0900_as_cs, '\\p{Lowercase_letter}')`, + `REGEXP_INSTR('a' collate utf8mb4_0900_as_cs, '\\p{Uppercase_letter}')`, + `REGEXP_INSTR('A' collate utf8mb4_0900_as_cs, '\\p{Uppercase_letter}')`, + `REGEXP_INSTR('dog cat dog', 'dog')`, + `REGEXP_INSTR('dog cat dog', 'dog', 2)`, + `REGEXP_INSTR('dog cat dog', 'dog', 1, 1)`, + `REGEXP_INSTR('dog cat dog', 'dog', 1, 1, 0)`, + `REGEXP_INSTR('dog cat dog', 'dog', 1, 1, 1)`, + `REGEXP_INSTR('dog cat dog', 'DOG', 1, 1, 1, 'i')`, + `REGEXP_INSTR('dog cat dog', 'DOG', 1, 1, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', 'dog', 1, 2)`, + `REGEXP_INSTR('dog cat dog', 'dog', 1, 2, 0)`, + `REGEXP_INSTR('dog cat dog', 'dog', 1, 2, 1)`, + `REGEXP_INSTR('dog cat dog', 'DOG', 1, 2, 1, 'i')`, + `REGEXP_INSTR('dog cat dog', 'DOG', 1, 2, 1, 'c')`, + `REGEXP_INSTR('aa aaa aaaa', 'a{2}')`, + `REGEXP_INSTR('aa aaa aaaa', 'a{4}')`, + `REGEXP_INSTR(1234, 12)`, + `REGEXP_INSTR(1234, 12, 1)`, + `REGEXP_INSTR(1234, 12, 100)`, + `REGEXP_INSTR(1234, 12, 1, 1)`, + `REGEXP_INSTR(1234, 12, 1, 1, 1)`, + `REGEXP_INSTR(1234, 12, 1, 1, 1, 'c')`, + `REGEXP_INSTR('', ' ', 1000)`, + `REGEXP_INSTR(' ', ' ', 1000)`, + `REGEXP_INSTR(NULL, 'DOG', 1, 2, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', NULL, 1, 2, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', 'DOG', NULL, 2, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', 'DOG', 1, NULL, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', 'DOG', 1, 2, NULL, 'c')`, + `REGEXP_INSTR('dog cat dog', 'DOG', 1, 2, 1, NULL)`, + + `REGEXP_INSTR('dog cat dog', NULL, 1, 2, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', _latin1 'DOG', NULL, 2, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', _latin1 'DOG', 1, NULL, 1, 'c')`, + `REGEXP_INSTR('dog cat dog', _latin1 'DOG', 1, 2, NULL, 'c')`, + `REGEXP_INSTR('dog cat dog', _latin1 'DOG', 1, 2, 1, NULL)`, + } + + for _, q := range mysqlDocSamples { + yield(q, nil) + } +} + +func RegexpSubstr(yield Query) { + mysqlDocSamples := []string{ + `REGEXP_SUBSTR('Michael!', '.*')`, + `REGEXP_SUBSTR('new*\n*line', 'new\\*.\\*line')`, + `REGEXP_SUBSTR('a', '^[a-d]')`, + `REGEXP_SUBSTR('CamelCase', 'CAMELCASE')`, + `REGEXP_SUBSTR('CamelCase', 'CAMELCASE' COLLATE utf8mb4_0900_as_cs)`, + `REGEXP_SUBSTR('abc', 'ABC')`, + `REGEXP_SUBSTR(' ', '[[:blank:]]')`, + `REGEXP_SUBSTR('\t', '[[:blank:]]')`, + `REGEXP_SUBSTR(' ', '[[:space:]]')`, + `REGEXP_SUBSTR('\t', '[[:space:]]')`, + `REGEXP_SUBSTR(_latin1'a', _latin1'A' collate latin1_general_ci)`, + `REGEXP_SUBSTR(_latin1'a', _latin1'A' collate latin1_bin)`, + `REGEXP_SUBSTR('a', '\\p{alphabetic}')`, + `REGEXP_SUBSTR('a', '\\P{alphabetic}')`, + `REGEXP_SUBSTR('👌🏾', '\\p{Emoji}\\p{Emoji_modifier}')`, + `REGEXP_SUBSTR('a', '\\p{Lowercase_letter}')`, + `REGEXP_SUBSTR('a', '\\p{Uppercase_letter}')`, + `REGEXP_SUBSTR('A', '\\p{Lowercase_letter}')`, + `REGEXP_SUBSTR('A', '\\p{Uppercase_letter}')`, + `REGEXP_SUBSTR('a' collate utf8mb4_0900_as_cs, '\\p{Lowercase_letter}')`, + `REGEXP_SUBSTR('A' collate utf8mb4_0900_as_cs, '\\p{Lowercase_letter}')`, + `REGEXP_SUBSTR('a' collate utf8mb4_0900_as_cs, '\\p{Uppercase_letter}')`, + `REGEXP_SUBSTR('A' collate utf8mb4_0900_as_cs, '\\p{Uppercase_letter}')`, + `REGEXP_SUBSTR('dog cat dog', 'dog')`, + `REGEXP_SUBSTR('dog cat dog', 'dog', 2)`, + `REGEXP_SUBSTR('dog cat dog', 'dog', 1, 1)`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 1, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 1, 1, 'c')`, + `REGEXP_SUBSTR('dog cat dog', 'dog', 1, 2)`, + `REGEXP_SUBSTR('dog cat 
dog', 'DOG', 1, 2, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 1, 2, 'c')`, + `REGEXP_SUBSTR('aa aaa aaaa', 'a{2}')`, + `REGEXP_SUBSTR('aa aaa aaaa', 'a{4}')`, + `REGEXP_SUBSTR(1234, 12)`, + `REGEXP_SUBSTR(1234, 12, 1)`, + `REGEXP_SUBSTR(1234, 12, 100)`, + `REGEXP_SUBSTR(1234, 12, 1, 1)`, + `REGEXP_SUBSTR(1234, 12, 1, 1, 'c')`, + + `REGEXP_SUBSTR(NULL, 'DOG', 1, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', NULL, 1, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', NULL, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 1, NULL, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 1, 1, NULL)`, + + `REGEXP_SUBSTR(NULL, '[', 1, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', '[', NULL, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', '[', 1, NULL, 'i')`, + `REGEXP_SUBSTR('dog cat dog', '[', 1, 1, NULL)`, + + `REGEXP_SUBSTR('dog cat dog', 'DOG', 0, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', -1, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 100, 1, 'i')`, + `REGEXP_SUBSTR('dog cat dog', 'DOG', 1, 1, 0)`, + + `REGEXP_SUBSTR(' ', ' ', 1)`, + `REGEXP_SUBSTR(' ', ' ', 2)`, + `REGEXP_SUBSTR(' ', ' ', 3)`, + } + + for _, q := range mysqlDocSamples { + yield(q, nil) + } +} + +func RegexpReplace(yield Query) { + mysqlDocSamples := []string{ + `REGEXP_REPLACE('a b c', 'b', 'X')`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 0)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 1)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 2)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 3)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 2, 0)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 2, 1)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 2, 2)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 2, 3)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 3, 0)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 3, 1)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 3, 2)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 3, 3)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 4, 0)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 4, 1)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 4, 2)`, + `REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 4, 3)`, + `REGEXP_REPLACE('a', '\\p{Lowercase_letter}', 'X')`, + `REGEXP_REPLACE('a', '\\p{Uppercase_letter}', 'X')`, + `REGEXP_REPLACE('A', '\\p{Lowercase_letter}', 'X')`, + `REGEXP_REPLACE('A', '\\p{Uppercase_letter}', 'X')`, + `REGEXP_REPLACE(1234, 12, 6)`, + `REGEXP_REPLACE(1234, 12, 6, 1)`, + `REGEXP_REPLACE(1234, 12, 6, 100)`, + `REGEXP_REPLACE(1234, 12, 6, 1, 1)`, + `REGEXP_REPLACE(1234, 12, 6, 1, 1, 'c')`, + + `REGEXP_REPLACE(NULL, 'DOG', 'bar', 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', NULL, 'bar', 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', 'DOG', NULL, 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', 'DOG', 'bar', 1, NULL, 'i')`, + `REGEXP_REPLACE('dog cat dog', 'DOG', 'bar', 1, 1, NULL)`, + `REGEXP_REPLACE('dog cat dog', 'DOG', 'bar', '1', '1', 0)`, + + `REGEXP_REPLACE(NULL, _latin1'DOG', 'bar', 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', NULL, 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', 'bar', 1, NULL, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', 'bar', 1, 1, NULL)`, + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', 'bar', '1', '1', 0)`, + + `REGEXP_REPLACE(NULL, '[', 'bar', 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', '[', NULL, 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', '[', 'bar', 1, NULL, 'i')`, + `REGEXP_REPLACE('dog cat dog', '[', 'bar', 1, 1, NULL)`, + + `REGEXP_REPLACE(NULL, _latin1'[', 'bar', 1, 1, 
'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'[', NULL, 1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'[', 'bar', 1, NULL, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'[', 'bar', 1, 1, NULL)`, + + `REGEXP_REPLACE('dog cat dog', 'DOG', 'bar', 0, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', 'DOG', 'bar', -1, 1, 'i')`, + `REGEXP_REPLACE('', 'DOG', 'bar', -1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', 'DOG', 'bar', 100, 1, 'i')`, + `REGEXP_REPLACE('', 'DOG', 'bar', 100, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', 'DOG', 'bar', 1, 1, 0)`, + + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', 'bar', 0, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', 'bar', -1, 1, 'i')`, + `REGEXP_REPLACE('', _latin1'DOG', 'bar', -1, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', 'bar', 100, 1, 'i')`, + `REGEXP_REPLACE('', _latin1'DOG', 'bar', 100, 1, 'i')`, + `REGEXP_REPLACE('dog cat dog', _latin1'DOG', 'bar', 1, 1, 0)`, + + `REGEXP_REPLACE(' ', ' ', 'x', 1)`, + `REGEXP_REPLACE(' ', ' ', 'x', 2)`, + `REGEXP_REPLACE(' ', ' ', 'x', 3)`, + + `REGEXP_REPLACE(' ', _latin1' ', 'x', 1)`, + `REGEXP_REPLACE(' ', _latin1' ', 'x', 2)`, + `REGEXP_REPLACE(' ', _latin1' ', 'x', 3)`, + } + + for _, q := range mysqlDocSamples { + yield(q, nil) + } +} diff --git a/go/vt/vtgate/evalengine/testcases/inputs.go b/go/vt/vtgate/evalengine/testcases/inputs.go index 47f50b677c5..5785375955f 100644 --- a/go/vt/vtgate/evalengine/testcases/inputs.go +++ b/go/vt/vtgate/evalengine/testcases/inputs.go @@ -133,6 +133,41 @@ var inputConversions = []string{ "cast(time '12:34:56' as json)", "cast(time '12:34:58' as json)", "cast(time '5 12:34:58' as json)", } +var regexInputs = []string{ + "0", "1", "' 0 '", `'\t1foo\t'`, + `'foobar'`, `_utf8 'foobar'`, `''`, `_binary 'foobar'`, + `0x0`, `0x1`, `0xff`, + "NULL", "true", "false", + "0xFF666F6F626172FF", + "time '10:04:58'", "date '2000-01-01'", + "timestamp '2000-01-01 10:34:58'", + "cast(0 as json)", "cast(1 as json)", + "cast(true as json)", "cast(false as json)", + // JSON numbers + "cast(2 as json)", "cast(1.1 as json)", "cast(-1.1 as json)", + // JSON strings + "cast('\"foo\"' as json)", + // JSON binary values + "cast(_binary' \"foo\"' as json)", + "cast(0xFF666F6F626172FF as json)", + "cast(0b01 as json)", + // JSON arrays + "cast('[\"a\"]' as json)", + // JSON objects + "cast('{\"a\": 1, \"b\": 2}' as json)", +} + +var regexMatchStrings = []string{ + "NULL", + "'c'", "'i'", "'m'", "'n'", "'u'", "'cimnu'", "'cimnuunmic'", +} + +var regexCounters = []string{ + "NULL", + "0", "1", "5", "100000", + "'2'", "0.4", "0.5", "0x1", +} + const inputPi = "314159265358979323846264338327950288419716939937510582097494459" var inputStrings = []string{ diff --git a/go/vt/vtgate/evalengine/translate.go b/go/vt/vtgate/evalengine/translate.go index 7690201f2a3..8cc6df7bd02 100644 --- a/go/vt/vtgate/evalengine/translate.go +++ b/go/vt/vtgate/evalengine/translate.go @@ -75,6 +75,14 @@ func (ast *astCompiler) translateComparisonExpr2(op sqlparser.ComparisonExprOper return &LikeExpr{BinaryExpr: binaryExpr}, nil case sqlparser.NotLikeOp: return &LikeExpr{BinaryExpr: binaryExpr, Negate: true}, nil + case sqlparser.RegexpOp, sqlparser.NotRegexpOp: + return &builtinRegexpLike{ + CallExpr: CallExpr{ + Arguments: []Expr{left, right}, + Method: "REGEXP_LIKE", + }, + Negate: op == sqlparser.NotRegexpOp, + }, nil default: return nil, vterrors.Errorf(vtrpcpb.Code_UNIMPLEMENTED, op.ToString()) } diff --git a/go/vt/vtgate/evalengine/translate_builtin.go b/go/vt/vtgate/evalengine/translate_builtin.go 
index fb6f988af7d..49784973180 100644 --- a/go/vt/vtgate/evalengine/translate_builtin.go +++ b/go/vt/vtgate/evalengine/translate_builtin.go @@ -765,6 +765,167 @@ func (ast *astCompiler) translateCallable(call sqlparser.Callable) (Expr, error) collate: ast.cfg.Collation, }, nil + case *sqlparser.RegexpLikeExpr: + input, err := ast.translateExpr(call.Expr) + if err != nil { + return nil, err + } + + pattern, err := ast.translateExpr(call.Pattern) + if err != nil { + return nil, err + } + + args := []Expr{input, pattern} + + if call.MatchType != nil { + matchType, err := ast.translateExpr(call.MatchType) + if err != nil { + return nil, err + } + args = append(args, matchType) + } + + return &builtinRegexpLike{ + CallExpr: CallExpr{Arguments: args, Method: "REGEXP_LIKE"}, + Negate: false, + }, nil + + case *sqlparser.RegexpInstrExpr: + input, err := ast.translateExpr(call.Expr) + if err != nil { + return nil, err + } + + pattern, err := ast.translateExpr(call.Pattern) + if err != nil { + return nil, err + } + + args := []Expr{input, pattern} + + if call.Position != nil { + position, err := ast.translateExpr(call.Position) + if err != nil { + return nil, err + } + args = append(args, position) + } + + if call.Occurrence != nil { + occurrence, err := ast.translateExpr(call.Occurrence) + if err != nil { + return nil, err + } + args = append(args, occurrence) + } + + if call.ReturnOption != nil { + returnOption, err := ast.translateExpr(call.ReturnOption) + if err != nil { + return nil, err + } + args = append(args, returnOption) + } + + if call.MatchType != nil { + matchType, err := ast.translateExpr(call.MatchType) + if err != nil { + return nil, err + } + args = append(args, matchType) + } + + return &builtinRegexpInstr{ + CallExpr: CallExpr{Arguments: args, Method: "REGEXP_INSTR"}, + }, nil + + case *sqlparser.RegexpSubstrExpr: + input, err := ast.translateExpr(call.Expr) + if err != nil { + return nil, err + } + + pattern, err := ast.translateExpr(call.Pattern) + if err != nil { + return nil, err + } + + args := []Expr{input, pattern} + + if call.Position != nil { + position, err := ast.translateExpr(call.Position) + if err != nil { + return nil, err + } + args = append(args, position) + } + + if call.Occurrence != nil { + occurrence, err := ast.translateExpr(call.Occurrence) + if err != nil { + return nil, err + } + args = append(args, occurrence) + } + + if call.MatchType != nil { + matchType, err := ast.translateExpr(call.MatchType) + if err != nil { + return nil, err + } + args = append(args, matchType) + } + + return &builtinRegexpSubstr{ + CallExpr: CallExpr{Arguments: args, Method: "REGEXP_SUBSTR"}, + }, nil + + case *sqlparser.RegexpReplaceExpr: + input, err := ast.translateExpr(call.Expr) + if err != nil { + return nil, err + } + + pattern, err := ast.translateExpr(call.Pattern) + if err != nil { + return nil, err + } + + repl, err := ast.translateExpr(call.Repl) + if err != nil { + return nil, err + } + + args := []Expr{input, pattern, repl} + + if call.Position != nil { + position, err := ast.translateExpr(call.Position) + if err != nil { + return nil, err + } + args = append(args, position) + } + + if call.Occurrence != nil { + occurrence, err := ast.translateExpr(call.Occurrence) + if err != nil { + return nil, err + } + args = append(args, occurrence) + } + + if call.MatchType != nil { + matchType, err := ast.translateExpr(call.MatchType) + if err != nil { + return nil, err + } + args = append(args, matchType) + } + + return &builtinRegexpReplace{ + CallExpr: CallExpr{Arguments: 
args, Method: "REGEXP_REPLACE"}, + }, nil default: return nil, translateExprNotSupported(call) } diff --git a/go/vt/vttablet/tabletmanager/vreplication/utils.go b/go/vt/vttablet/tabletmanager/vreplication/utils.go index 02bcbb235be..1e26687e147 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/utils.go +++ b/go/vt/vttablet/tabletmanager/vreplication/utils.go @@ -155,6 +155,26 @@ func isUnrecoverableError(err error) bool { mysql.ERInvalidJSONTextInParams, mysql.ERJSONDocumentTooDeep, mysql.ERJSONValueTooBig, + mysql.ERRegexpError, + mysql.ERRegexpStringNotTerminated, + mysql.ERRegexpIllegalArgument, + mysql.ERRegexpIndexOutOfBounds, + mysql.ERRegexpInternal, + mysql.ERRegexpRuleSyntax, + mysql.ERRegexpBadEscapeSequence, + mysql.ERRegexpUnimplemented, + mysql.ERRegexpMismatchParen, + mysql.ERRegexpBadInterval, + mysql.ERRRegexpMaxLtMin, + mysql.ERRegexpInvalidBackRef, + mysql.ERRegexpLookBehindLimit, + mysql.ERRegexpMissingCloseBracket, + mysql.ERRegexpInvalidRange, + mysql.ERRegexpStackOverflow, + mysql.ERRegexpTimeOut, + mysql.ERRegexpPatternTooBig, + mysql.ERRegexpInvalidCaptureGroup, + mysql.ERRegexpInvalidFlag, mysql.ERNoDefault, mysql.ERNoDefaultForField, mysql.ERNonUniq,