From 229d57b784a4906237a6b8c6d5963ec1232a503e Mon Sep 17 00:00:00 2001 From: Tom Lord Date: Sun, 1 Mar 2015 20:33:13 +0000 Subject: [PATCH 1/8] Initial sketch of refactored chargroup parser All character set parsing code pulled out into new class Still a bit buggy/incomplete, but the general code structure is laid out. (E.g. does not yet work for character ranges) --- lib/regexp-examples/chargroup_parser.rb | 135 +++++++++++++++--------- lib/regexp-examples/parser.rb | 29 +---- 2 files changed, 91 insertions(+), 73 deletions(-) diff --git a/lib/regexp-examples/chargroup_parser.rb b/lib/regexp-examples/chargroup_parser.rb index 448226f..aa7886e 100644 --- a/lib/regexp-examples/chargroup_parser.rb +++ b/lib/regexp-examples/chargroup_parser.rb @@ -1,69 +1,106 @@ module RegexpExamples - # Given an array of chars from inside a character set, - # Interprets all backslashes, ranges and negations - # TODO: This needs a bit of a rewrite because: - # A) It's ugly - # B) It doesn't take into account nested character groups, or set intersection - # To achieve this, the algorithm needs to be recursive, like the main Parser. + # A "sub-parser", for char groups in a regular expression + # Some examples of what this class needs to parse: + # [abc] - plain characters + # [a-z] - ranges + # [\n\b\d] - escaped characters (which may represent character sets) + # [^abc] - negated group + # [[a][bc]] - sub-groups (should match "a", "b" or "c") + # [[:lower:]] - POSIX group + # [[a-f]&&[d-z]] - set intersection (should match "d", "e" or "f") + # [[^:alpha:]&&[\n]a-c] - all of the above!!!!
(should match "\n") class ChargroupParser - def initialize(chars) - @chars = chars - if @chars[0] == "^" - @negative = true - @chars = @chars[1..-1] - else - @negative = false + attr_reader :regexp_string + def initialize(regexp_string) + @regexp_string = regexp_string + @current_position = 0 + end + + def parse(is_sub_group: false) + @charset = [] + @negative = false + parse_first_chars(is_sub_group) + until next_char == "]" do + case next_char + when "\\" + @current_position += 1 + parse_after_backslash + when "[" + @current_position += 1 + sub_group_parser = self.class.new(rest_of_string) + sub_group_parser.parse(is_sub_group: true) + @charset.concat sub_group_parser.result + @current_position += sub_group_parser.length + when "-" + if regexp_string[@current_position + 1] == "]" + @charset << "-" + @current_position += 1 + else + # TODO!!! + # Add range from previous char -> next char + end + when "&" + if regexp_string[@current_position + 1] == "&" + # TODO!!! + # Set intersection... + else + @charset << "&" + @current_position += 1 + end + else + @charset << next_char + @current_position += 1 + end end - init_backslash_chars - init_ranges + @charset.uniq! + @current_position += 1 # To account for final "]" + end + + def length + @current_position end def result - @negative ? (CharSets::Any - @chars) : @chars + @negative ? (CharSets::Any - @charset) : @charset end private - def init_backslash_chars - @chars.each_with_index do |char, i| - if char == "\\" - if BackslashCharMap.keys.include?(@chars[i+1]) - @chars[i..i+1] = move_backslash_to_front( BackslashCharMap[@chars[i+1]] ) - elsif @chars[i+1] == 'b' - @chars[i..i+1] = "\b" - elsif @chars[i+1] == "\\" - @chars.delete_at(i+1) - else - @chars.delete_at(i) - end + def parse_first_chars(is_sub_group) + if next_char == '^' + @negative = true + @current_position += 1 + end + + case rest_of_string + when /\A[-\]]/ # e.g. 
/[]]/ (match "]") or /[-]/ (match "-") + @charset << next_char + @current_position += 1 + when /\A:([^:]+):\]/ # e.g. [[:alpha:]] - POSIX group + if is_sub_group + @charset.concat POSIXCharMap[$1] + @current_position += ($1.length + 2) end end end - def init_ranges - # remove hyphen ("-") from front/back, if present - hyphen = nil - hyphen = @chars.shift if @chars.first == "-" - hyphen ||= @chars.pop if @chars.last == "-" - # Replace all instances of e.g. ["a", "-", "z"] with ["a", "b", ..., "z"] - while i = @chars.index("-") - # Prevent infinite loops from expanding [",", "-", "."] to itself - # (Since ",".ord = 44, "-".ord = 45, ".".ord = 46) - if (@chars[i-1] == ',' && @chars[i+1] == '.') - hyphen = @chars.delete_at(i) - else - @chars[i-1..i+1] = (@chars[i-1]..@chars[i+1]).to_a - end + def parse_after_backslash + case next_char + when *BackslashCharMap.keys + @charset.concat BackslashCharMap[next_char] + when 'b' + @charset << "\b" + else + @charset << next_char end - # restore hyphen, if stripped out earlier - @chars.unshift(hyphen) if hyphen end - def move_backslash_to_front(chars) - if index = chars.index { |char| char == '\\' } - chars.unshift chars.delete_at(index) - end - chars + def rest_of_string + regexp_string[@current_position..-1] + end + + def next_char + regexp_string[@current_position] end end end diff --git a/lib/regexp-examples/parser.rb b/lib/regexp-examples/parser.rb index ccb6fc4..7540da4 100644 --- a/lib/regexp-examples/parser.rb +++ b/lib/regexp-examples/parser.rb @@ -223,30 +223,11 @@ def parse_multi_end_group end def parse_char_group - # TODO: Extract all this logic into ChargroupParser - if rest_of_string =~ /\A\[\[:(\^?)([^:]+):\]\]/ - @current_position += (6 + $1.length + $2.length) - chars = $1.empty? ? 
POSIXCharMap[$2] : CharSets::Any - POSIXCharMap[$2] - return CharGroup.new(chars, @ignorecase) - end - chars = [] - @current_position += 1 - if next_char == ']' - # Beware of the sneaky edge case: - # /[]]/ (match "]") - chars << ']' - @current_position += 1 - end - until next_char == ']' \ - && !regexp_string[0..@current_position-1].match(/[^\\](\\{2})*\\\z/) - # Beware of having an ODD number of "\" before the "]", e.g. - # /[\]]/ (match "]") - # /[\\]/ (match "\") - # /[\\\]]/ (match "\" or "]") - chars << next_char - @current_position += 1 - end - parsed_chars = ChargroupParser.new(chars).result + @current_position += 1 # Skip past opening "[" + chargroup_parser = ChargroupParser.new(rest_of_string) + chargroup_parser.parse + parsed_chars = chargroup_parser.result + @current_position += chargroup_parser.length CharGroup.new(parsed_chars, @ignorecase) end From eff1955a93cfbe88a31dc9328e59bec199647c5c Mon Sep 17 00:00:00 2001 From: Tom Lord Date: Sun, 1 Mar 2015 20:40:20 +0000 Subject: [PATCH 2/8] Added some "wrong" code, just to stop tests freezing --- lib/regexp-examples/chargroup_parser.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/regexp-examples/chargroup_parser.rb b/lib/regexp-examples/chargroup_parser.rb index aa7886e..383ffac 100644 --- a/lib/regexp-examples/chargroup_parser.rb +++ b/lib/regexp-examples/chargroup_parser.rb @@ -38,6 +38,8 @@ def parse(is_sub_group: false) else # TODO!!! # Add range from previous char -> next char + @charset << "-" + @current_position += 1 end when "&" if regexp_string[@current_position + 1] == "&" From 10ba5c9d82cbc7bd769a9eaa6373e42c857178ca Mon Sep 17 00:00:00 2001 From: Tom Lord Date: Mon, 2 Mar 2015 10:31:41 +0000 Subject: [PATCH 3/8] Fixed all existing tests Now we're finally ready to add new functionality! 
--- lib/regexp-examples/chargroup_parser.rb | 35 +++++++++++++++---------- lib/regexp-examples/parser.rb | 2 +- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/lib/regexp-examples/chargroup_parser.rb b/lib/regexp-examples/chargroup_parser.rb index 383ffac..04de664 100644 --- a/lib/regexp-examples/chargroup_parser.rb +++ b/lib/regexp-examples/chargroup_parser.rb @@ -22,9 +22,6 @@ def parse(is_sub_group: false) parse_first_chars(is_sub_group) until next_char == "]" do case next_char - when "\\" - @current_position += 1 - parse_after_backslash when "[" @current_position += 1 sub_group_parser = self.class.new(rest_of_string) @@ -32,13 +29,12 @@ def parse(is_sub_group: false) @charset.concat sub_group_parser.result @current_position += sub_group_parser.length when "-" - if regexp_string[@current_position + 1] == "]" + if regexp_string[@current_position + 1] == "]" # e.g. /[abc-]/ -- not a range! @charset << "-" @current_position += 1 else - # TODO!!! - # Add range from previous char -> next char - @charset << "-" + @current_position += 1 + @charset.concat (@charset.last .. parse_checking_backlash.first).to_a @current_position += 1 end when "&" @@ -50,7 +46,7 @@ def parse(is_sub_group: false) @current_position += 1 end else - @charset << next_char + @charset.concat parse_checking_backlash @current_position += 1 end end @@ -78,22 +74,33 @@ def parse_first_chars(is_sub_group) when /\A[-\]]/ # e.g. /[]]/ (match "]") or /[-]/ (match "-") @charset << next_char @current_position += 1 - when /\A:([^:]+):\]/ # e.g. [[:alpha:]] - POSIX group + when /\A:(\^?)([^:]+):\]/ # e.g. [[:alpha:]] - POSIX group if is_sub_group - @charset.concat POSIXCharMap[$1] - @current_position += ($1.length + 2) + chars = $1.empty? ? 
POSIXCharMap[$2] : (CharSets::Any - POSIXCharMap[$2]) + @charset.concat chars + @current_position += ($1.length + $2.length + 2) end end end + # Always returns an Array, for consistency + def parse_checking_backlash + if next_char == "\\" + @current_position += 1 + parse_after_backslash + else + [next_char] + end + end + def parse_after_backslash case next_char when *BackslashCharMap.keys - @charset.concat BackslashCharMap[next_char] + BackslashCharMap[next_char] when 'b' - @charset << "\b" + ["\b"] else - @charset << next_char + [next_char] end end diff --git a/lib/regexp-examples/parser.rb b/lib/regexp-examples/parser.rb index 7540da4..776c0f2 100644 --- a/lib/regexp-examples/parser.rb +++ b/lib/regexp-examples/parser.rb @@ -227,7 +227,7 @@ def parse_char_group chargroup_parser = ChargroupParser.new(rest_of_string) chargroup_parser.parse parsed_chars = chargroup_parser.result - @current_position += chargroup_parser.length + @current_position += (chargroup_parser.length - 1) # Step back to closing "]" CharGroup.new(parsed_chars, @ignorecase) end From 8880ac6213180b7c923978bf5816df176b7c18f1 Mon Sep 17 00:00:00 2001 From: Tom Lord Date: Mon, 2 Mar 2015 10:43:44 +0000 Subject: [PATCH 4/8] Implemented set intersection --- lib/regexp-examples/chargroup_parser.rb | 7 +++++-- spec/regexp-examples_spec.rb | 9 +++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/regexp-examples/chargroup_parser.rb b/lib/regexp-examples/chargroup_parser.rb index 04de664..6952515 100644 --- a/lib/regexp-examples/chargroup_parser.rb +++ b/lib/regexp-examples/chargroup_parser.rb @@ -39,8 +39,11 @@ def parse(is_sub_group: false) end when "&" if regexp_string[@current_position + 1] == "&" - # TODO!!! - # Set intersection... 
+ @current_position += 2 + sub_group_parser = self.class.new(rest_of_string) + sub_group_parser.parse(is_sub_group: is_sub_group) + @charset &= sub_group_parser.result + @current_position += (sub_group_parser.length - 1) else @charset << "&" @current_position += 1 diff --git a/spec/regexp-examples_spec.rb b/spec/regexp-examples_spec.rb index 15776cf..a0f4d2d 100644 --- a/spec/regexp-examples_spec.rb +++ b/spec/regexp-examples_spec.rb @@ -69,7 +69,6 @@ def self.examples_are_empty(*regexps) context "for complex char groups (square brackets)" do examples_exist_and_match( - /[abc]/, /[a-c]/, /[abc-e]/, @@ -82,7 +81,13 @@ def self.examples_are_empty(*regexps) /[\n-\r]/, /[\-]/, /[%-+]/, # This regex is "supposed to" match some surprising things!!! - /['-.]/ # Test to ensure no "infinite loop" on character set expansion + /['-.]/, # Test to ensure no "infinite loop" on character set expansion + /[[abc]]/, # Nested groups + /[[[[abc]]]]/, + /[[a][b][c]]/, + /[[a-h]&&[f-z]]/, # Set intersection + /[[a-h]&&ab[c]]/, # Set intersection + /[[a-h]&[f-z]]/, # NOT set intersection ) end From e3eb54bd9c917888f90cc1caeaf63562af70d8d4 Mon Sep 17 00:00:00 2001 From: Tom Lord Date: Mon, 2 Mar 2015 10:47:06 +0000 Subject: [PATCH 5/8] Subgroup logic cleanup --- lib/regexp-examples/chargroup_parser.rb | 18 +++++++++--------- lib/regexp-examples/parser.rb | 1 - 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/lib/regexp-examples/chargroup_parser.rb b/lib/regexp-examples/chargroup_parser.rb index 6952515..f5ca3e6 100644 --- a/lib/regexp-examples/chargroup_parser.rb +++ b/lib/regexp-examples/chargroup_parser.rb @@ -11,21 +11,22 @@ module RegexpExamples # [[^:alpha:]&&[\n]a-c] - all of the above!!!! 
(should match "\n") class ChargroupParser attr_reader :regexp_string - def initialize(regexp_string) + def initialize(regexp_string, is_sub_group: false) @regexp_string = regexp_string + @is_sub_group = is_sub_group @current_position = 0 + parse end - def parse(is_sub_group: false) + def parse @charset = [] @negative = false - parse_first_chars(is_sub_group) + parse_first_chars until next_char == "]" do case next_char when "[" @current_position += 1 - sub_group_parser = self.class.new(rest_of_string) - sub_group_parser.parse(is_sub_group: true) + sub_group_parser = self.class.new(rest_of_string, is_sub_group: true) @charset.concat sub_group_parser.result @current_position += sub_group_parser.length when "-" @@ -40,8 +41,7 @@ def parse(is_sub_group: false) when "&" if regexp_string[@current_position + 1] == "&" @current_position += 2 - sub_group_parser = self.class.new(rest_of_string) - sub_group_parser.parse(is_sub_group: is_sub_group) + sub_group_parser = self.class.new(rest_of_string, is_sub_group: @is_sub_group) @charset &= sub_group_parser.result @current_position += (sub_group_parser.length - 1) else @@ -67,7 +67,7 @@ def result end private - def parse_first_chars(is_sub_group) + def parse_first_chars if next_char == '^' @negative = true @current_position += 1 @@ -78,7 +78,7 @@ def parse_first_chars(is_sub_group) @charset << next_char @current_position += 1 when /\A:(\^?)([^:]+):\]/ # e.g. [[:alpha:]] - POSIX group - if is_sub_group + if @is_sub_group chars = $1.empty? ? 
POSIXCharMap[$2] : (CharSets::Any - POSIXCharMap[$2]) @charset.concat chars @current_position += ($1.length + $2.length + 2) diff --git a/lib/regexp-examples/parser.rb b/lib/regexp-examples/parser.rb index 776c0f2..f3ff6e6 100644 --- a/lib/regexp-examples/parser.rb +++ b/lib/regexp-examples/parser.rb @@ -225,7 +225,6 @@ def parse_multi_end_group def parse_char_group @current_position += 1 # Skip past opening "[" chargroup_parser = ChargroupParser.new(rest_of_string) - chargroup_parser.parse parsed_chars = chargroup_parser.result @current_position += (chargroup_parser.length - 1) # Step back to closing "]" CharGroup.new(parsed_chars, @ignorecase) From a1daed14a4baf9bc1ea1ea27d886c67cbba6166a Mon Sep 17 00:00:00 2001 From: Tom Lord Date: Mon, 2 Mar 2015 11:18:19 +0000 Subject: [PATCH 6/8] Updated supported syntax, bugs and TODOs --- README.md | 56 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index a9902e8..026c421 100644 --- a/README.md +++ b/README.md @@ -26,12 +26,33 @@ For more detail on this, see [configuration options](#configuration-options). /what about (backreferences\?) \1/.examples #=> ['what about backreferences? backreferences?'] ``` +## Installation + +Add this line to your application's Gemfile: + +```ruby +gem 'regexp-examples' +``` + +And then execute: + + $ bundle + +Or install it yourself as: + + $ gem install regexp-examples + ## Supported syntax * All forms of repeaters (quantifiers), e.g. `/a*/`, `/a+/`, `/a?/`, `/a{1,4}/`, `/a{3,}/`, `/a{,2}/` * Reluctant and possissive repeaters work fine, too - e.g. `/a*?/`, `/a*+/` * Boolean "Or" groups, e.g. `/a|b|c/` -* Character sets (inluding ranges and negation!), e.g. `/[abc]/`, `/[A-Z0-9]/`, `/[^a-z]/`, `/[\w\s\b]/` +* Character sets e.g. `/[abc]/` - including: + * Ranges, e.g.`/[A-Z0-9]/` + * Negation, e.g. `/[^a-z]/` + * Escaped characters, e.g. `/[\w\s\b]/` + * POSIX bracket expressions, e.g. 
`/[[:alnum:]]/`, `/[[:^space:]]/` + * Set intersection, e.g. `/[[a-h]&&[f-z]]/` * Escaped characters, e.g. `/\n/`, `/\w/`, `/\D/` (and so on...) * Capture groups, e.g. `/(group)/` * Including named groups, e.g. `/(?group)/` @@ -43,7 +64,6 @@ For more detail on this, see [configuration options](#configuration-options). * Escape sequences, e.g. `/\x42/`, `/\x5word/`, `/#{"\x80".force_encoding("ASCII-8BIT")}/` * Unicode characters, e.g. `/\u0123/`, `/\uabcd/`, `/\u{789}/` * Octal characters, e.g. `/\10/`, `/\177/` -* POSIX bracket expressions (including negation), e.g. `/[[:alnum:]]/`, `/[[:^space:]]/` * Named properties, e.g. `/\p{L}/` ("Letter"), `/\p{Arabic}/` ("Arabic character"), `/\p{^Ll}/` ("Not a lowercase letter") * **Arbitrarily complex combinations of all the above!** @@ -55,15 +75,12 @@ For more detail on this, see [configuration options](#configuration-options). ## Bugs and Not-Yet-Supported syntax -* Nested character classes, and the use of set intersection ([See here](http://www.ruby-doc.org/core-2.2.0/Regexp.html#class-Regexp-label-Character+Classes) for the official documentation on this.) For example: - * `/[[abc]de]/.examples` (which _should_ return `["a", "b", "c", "d", "e"]`) - * `/[[a-d]&&[c-f]]/.examples` (which _should_ return: `["c", "d"]`) +* There are some (rare) edge cases where backreferences do not work properly, e.g. `/(a*)a* \1/.examples` - which includes "aaaa aa". This is because each repeater is not context-aware, so the "greediness" logic is flawed. (E.g. in this case, the second `a*` should always evaluate to an empty string, because the previous `a*` was greedy!) However, patterns like this are highly unusual... +* Some named properties, e.g.
`/\p{Arabic}/`, list non-matching examples for ruby 2.0/2.1. There are no known issues in ruby 2.2 - -There are loads more (increasingly obscure) unsupported bits of syntax, which I cannot be bothered to write out here. Full documentation on all the various other obscurities in the ruby (version 2.x) regexp parser can be found [here](https://raw.githubusercontent.com/k-takata/Onigmo/master/doc/RE). +There are also some various (increasingly obscure) unsupported bits of syntax, which I cannot be bothered to write out fully here. Full documentation on all the intricate obscurities in the ruby (version 2.x) regexp parser can be found [here](https://raw.githubusercontent.com/k-takata/Onigmo/master/doc/RE). To name a couple: +* Conditional capture groups, e.g. `/(group1)? (?(1)yes|no)/.examples` (which *should* return: `["group1 yes", " no"]`) +* Back reference by relalitve group number, e.g. `/(a)(b)(c)(d) \k<-2>` (which *should* return: `["abcd c"]`) ## Impossible features ("illegal syntax") @@ -117,21 +134,12 @@ A more sensible use case might be, for example, to generate one random 1-4 digit (Note: I may develop a much more efficient way to "generate one example" in a later release of this gem.) -## Installation - -Add this line to your application's Gemfile: - -```ruby -gem 'regexp-examples' -``` - -And then execute: - - $ bundle - -Or install it yourself as: +## TODO - $ gem install regexp-examples +* Performance improvements: + * Use of lambdas/something (in [constants.rb](lib/regexp-examples/constants.rb)) to improve the library load time. + * (Maybe?) add a `max_examples` configuration option and use lazy evaluation, to ensure the method never "freezes" +* Write a blog post about how this amazing gem works! 
:) ## Contributing From a7fa8b99c6874d2d8fb187d0cc8e03fde8960b07 Mon Sep 17 00:00:00 2001 From: Tom Lord Date: Mon, 2 Mar 2015 11:22:10 +0000 Subject: [PATCH 7/8] Fixed typos --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 026c421..b51411b 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ Or install it yourself as: There are also some various (increasingly obscure) unsupported bits of syntax, which I cannot be bothered to write out fully here. Full documentation on all the intricate obscurities in the ruby (version 2.x) regexp parser can be found [here](https://raw.githubusercontent.com/k-takata/Onigmo/master/doc/RE). To name a couple: * Conditional capture groups, e.g. `/(group1)? (?(1)yes|no)/.examples` (which *should* return: `["group1 yes", " no"]`) -* Back reference by relalitve group number, e.g. `/(a)(b)(c)(d) \k<-2>` (which *should* return: `["abcd c"]`) +* Back reference by relative group number, e.g. `/(a)(b)(c)(d) \k<-2>/.examples` (which *should* return: `["abcd c"]`) ## Impossible features ("illegal syntax") From b4014148e74981c7adb47766c4f275b4f7461006 Mon Sep 17 00:00:00 2001 From: Tom Lord Date: Mon, 2 Mar 2015 11:22:29 +0000 Subject: [PATCH 8/8] Version bump (v1.0.0) :gem: VERSION 1 RELEASED!! :gem: --- lib/regexp-examples/version.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/regexp-examples/version.rb b/lib/regexp-examples/version.rb index aeff39e..2d329fb 100644 --- a/lib/regexp-examples/version.rb +++ b/lib/regexp-examples/version.rb @@ -1,3 +1,3 @@ module RegexpExamples - VERSION = '0.7.0' + VERSION = '1.0.0' end