From 793c5df7c3003780cc2ec4d57504279107166475 Mon Sep 17 00:00:00 2001 From: Tom Lord Date: Sun, 22 Feb 2015 23:10:54 +0000 Subject: [PATCH] Support for POSIX groups Also some general improvements to character group parsing, and laid the foundations for the big CharGroup refactor --- README.md | 4 +- lib/regexp-examples/chargroup_parser.rb | 70 +++++++++++++++++++++++++ lib/regexp-examples/constants.rb | 24 ++++----- lib/regexp-examples/groups.rb | 60 +-------------------- lib/regexp-examples/parser.rb | 10 ++-- spec/regexp-examples_spec.rb | 6 +-- 6 files changed, 95 insertions(+), 79 deletions(-) create mode 100644 lib/regexp-examples/chargroup_parser.rb diff --git a/README.md b/README.md index e2ca859..c27f83c 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,7 @@ For more detail on this, see [configuration options](#configuration-options). * Escape sequences, e.g. `/\x42/`, `/\x5word/`, `/#{"\x80".force_encoding("ASCII-8BIT")}/` * Unicode characters, e.g. `/\u0123/`, `/\uabcd/`, `/\u{789}/` * Octal characters, e.g. `/\10/`, `/\177/` +* POSIX bracket expressions (including negation), e.g. `/[[:alnum:]]/`, `/[[:^space:]]/` * **Arbitrarily complex combinations of all the above!** * Regexp options can also be used: @@ -54,14 +55,13 @@ For more detail on this, see [configuration options](#configuration-options). ## Bugs and Not-Yet-Supported syntax * Nested character classes, and the use of set intersection ([See here](http://www.ruby-doc.org/core-2.2.0/Regexp.html#class-Regexp-label-Character+Classes) for the official documentation on this.) 
For example: - * `/[[abc]]/.examples` (which _should_ return `["a", "b", "c"]`) + * `/[[abc]de]/.examples` (which _should_ return `["a", "b", "c", "d", "e"]`) * `/[[a-d]&&[c-f]]/.examples` (which _should_ return: `["c", "d"]`) * Conditional capture groups, such as `/(group1) (?(1)yes|no)` Using any of the following will raise a RegexpExamples::UnsupportedSyntax exception (until such time as they are implemented!): -* POSIX bracket expressions, e.g. `/[[:alnum:]]/`, `/[[:space:]]/` * Named properties, e.g. `/\p{L}/` ("Letter"), `/\p{Arabic}/` ("Arabic character"), `/\p{^Ll}/` ("Not a lowercase letter") * Subexpression calls, e.g. `/(?<name> ... \g<name>* )/` (Note: These could get _really_ ugly to implement, and may even be impossible, so I highly doubt it's worth the effort!) diff --git a/lib/regexp-examples/chargroup_parser.rb b/lib/regexp-examples/chargroup_parser.rb new file mode 100644 index 0000000..448226f --- /dev/null +++ b/lib/regexp-examples/chargroup_parser.rb @@ -0,0 +1,70 @@ +module RegexpExamples + # Given an array of chars from inside a character set, + # Interprets all backslashes, ranges and negations + # TODO: This needs a bit of a rewrite because: + # A) It's ugly + # B) It doesn't take into account nested character groups, or set intersection + # To achieve this, the algorithm needs to be recursive, like the main Parser. + class ChargroupParser + def initialize(chars) + @chars = chars + if @chars[0] == "^" + @negative = true + @chars = @chars[1..-1] + else + @negative = false + end + + init_backslash_chars + init_ranges + end + + def result + @negative ? 
(CharSets::Any - @chars) : @chars + end + + private + def init_backslash_chars + @chars.each_with_index do |char, i| + if char == "\\" + if BackslashCharMap.keys.include?(@chars[i+1]) + @chars[i..i+1] = move_backslash_to_front( BackslashCharMap[@chars[i+1]] ) + elsif @chars[i+1] == 'b' + @chars[i..i+1] = "\b" + elsif @chars[i+1] == "\\" + @chars.delete_at(i+1) + else + @chars.delete_at(i) + end + end + end + end + + def init_ranges + # remove hyphen ("-") from front/back, if present + hyphen = nil + hyphen = @chars.shift if @chars.first == "-" + hyphen ||= @chars.pop if @chars.last == "-" + # Replace all instances of e.g. ["a", "-", "z"] with ["a", "b", ..., "z"] + while i = @chars.index("-") + # Prevent infinite loops from expanding [",", "-", "."] to itself + # (Since ",".ord = 44, "-".ord = 45, ".".ord = 46) + if (@chars[i-1] == ',' && @chars[i+1] == '.') + hyphen = @chars.delete_at(i) + else + @chars[i-1..i+1] = (@chars[i-1]..@chars[i+1]).to_a + end + end + # restore hyphen, if stripped out earlier + @chars.unshift(hyphen) if hyphen + end + + def move_backslash_to_front(chars) + if index = chars.index { |char| char == '\\' } + chars.unshift chars.delete_at(index) + end + chars + end + end +end + diff --git a/lib/regexp-examples/constants.rb b/lib/regexp-examples/constants.rb index 279fd6c..cc0b448 100644 --- a/lib/regexp-examples/constants.rb +++ b/lib/regexp-examples/constants.rb @@ -32,17 +32,17 @@ def self.MaxGroupResults end module CharSets - Lower = Array('a'..'z') - Upper = Array('A'..'Z') - Digit = Array('0'..'9') - # Chars in ranges: [33..47, 58..64, 91..96, 123..126] - Punct = %w(] [ ! " # $ % & ' ( ) * + , . / : ; < = > ? @ \\ ^ _ ` { | } ~ -) - Hex = Array('a'..'f') | Array('A'..'F') | Digit - Word = Lower | Upper | Digit | ['_'] - Whitespace = [' ', "\t", "\n", "\r", "\v", "\f"] - Control = (0..31).map(&:chr) | ["\x7f"] - # Ensure that the "common" characters appear first in the array. Do not include "\n"! 
- Any = Lower | Upper | Digit | Punct | (0..255).map(&:chr) - ["\n"] + Lower = Array('a'..'z') + Upper = Array('A'..'Z') + Digit = Array('0'..'9') + Punct = %w(! " # % & ' ( ) * , - . / : ; ? @ [ \\ ] _ { }) + Hex = Array('a'..'f') | Array('A'..'F') | Digit + Word = Lower | Upper | Digit | ['_'] + Whitespace = [' ', "\t", "\n", "\r", "\v", "\f"] + Control = (0..31).map(&:chr) | ["\x7f"] + # Ensure that the "common" characters appear first in the array + Any = Lower | Upper | Digit | Punct | (0..127).map(&:chr) + AnyNoNewLine = Any - ["\n"] end.freeze # Map of special regex characters, to their associated character sets @@ -79,7 +79,7 @@ module CharSets 'upper' => CharSets::Upper, 'xdigit' => CharSets::Hex, 'word' => CharSets::Word, - 'ascii' => CharSets::Any | ["\n"], + 'ascii' => CharSets::Any }.freeze end diff --git a/lib/regexp-examples/groups.rb b/lib/regexp-examples/groups.rb index fe46af8..abb719c 100644 --- a/lib/regexp-examples/groups.rb +++ b/lib/regexp-examples/groups.rb @@ -63,69 +63,14 @@ class CharGroup def initialize(chars, ignorecase) @chars = chars @ignorecase = ignorecase - if chars[0] == "^" - @negative = true - @chars = @chars[1..-1] - else - @negative = false - end - - init_backslash_chars - init_ranges - end - - def init_ranges - # save first and last "-" if present - - first = nil - last = nil - first = @chars.shift if @chars.first == "-" - last = @chars.pop if @chars.last == "-" - # Replace all instances of e.g. 
["a", "-", "z"] with ["a", "b", ..., "z"] - while i = @chars.index("-") - # Prevent infinite loops from expanding [",", "-", "."] to itself - # (Since ",".ord = 44, "-".ord = 45, ".".ord = 46) - if (@chars[i-1] == ',' && @chars[i+1] == '.') - first = '-' - @chars.delete_at(i) - else - @chars[i-1..i+1] = (@chars[i-1]..@chars[i+1]).to_a - end - end - # restore them back - @chars.unshift(first) if first - @chars.push(last) if last - end - - def init_backslash_chars - @chars.each_with_index do |char, i| - if char == "\\" - if BackslashCharMap.keys.include?(@chars[i+1]) - @chars[i..i+1] = move_backslash_to_front( BackslashCharMap[@chars[i+1]] ) - elsif @chars[i+1] == 'b' - @chars[i..i+1] = "\b" - elsif @chars[i+1] == "\\" - @chars.delete_at(i+1) - else - @chars.delete_at(i) - end - end - end end def result - (@negative ? (CharSets::Any - @chars) : @chars).map do |result| + @chars.map do |result| GroupResult.new(result) end end - private - def move_backslash_to_front(chars) - if index = chars.index { |char| char == '\\' } - chars.unshift chars.delete_at(index) - end - chars - end end class DotGroup @@ -135,8 +80,7 @@ def initialize(multiline) end def result - chars = CharSets::Any - chars = (["\n"] | chars) if multiline + chars = multiline ? CharSets::Any : CharSets::AnyNoNewLine chars.map do |result| GroupResult.new(result) end diff --git a/lib/regexp-examples/parser.rb b/lib/regexp-examples/parser.rb index 6d70357..ea27c5f 100644 --- a/lib/regexp-examples/parser.rb +++ b/lib/regexp-examples/parser.rb @@ -218,8 +218,11 @@ def parse_multi_end_group end def parse_char_group - if rest_of_string =~ /\A\[\[:[^:]+:\]\]/ - raise UnsupportedSyntaxError, "POSIX bracket expressions are not yet implemented" + # TODO: Extract all this logic into ChargroupParser + if rest_of_string =~ /\A\[\[:(\^?)([^:]+):\]\]/ + @current_position += (6 + $1.length + $2.length) + chars = $1.empty? ? 
POSIXCharMap[$2] : CharSets::Any - POSIXCharMap[$2] + return CharGroup.new(chars, @ignorecase) end chars = [] @current_position += 1 @@ -238,7 +241,8 @@ def parse_char_group chars << next_char @current_position += 1 end - CharGroup.new(chars, @ignorecase) + parsed_chars = ChargroupParser.new(chars).result + CharGroup.new(parsed_chars, @ignorecase) end def parse_dot_group diff --git a/spec/regexp-examples_spec.rb b/spec/regexp-examples_spec.rb index 45e54cc..0ff7254 100644 --- a/spec/regexp-examples_spec.rb +++ b/spec/regexp-examples_spec.rb @@ -3,7 +3,7 @@ def self.examples_exist_and_match(*regexps) regexps.each do |regexp| it do begin - regexp_examples = regexp.examples + regexp_examples = regexp.examples(max_group_results: 999) rescue # TODO: Find a nicer way to display this? puts "Error generating examples for /#{regexp.source}/" @@ -187,8 +187,7 @@ def self.examples_are_empty(*regexps) /\p{L}/, /\p{Arabic}/, /\p{^Ll}/, - /(?<name> ... \g<name>*)/, - /[[:space:]]/ + /(?<name> ... \g<name>*)/ ) end @@ -244,7 +243,6 @@ def self.examples_are_empty(*regexps) end context "for POSIX groups" do - before { pending "TODO: POSIX Groups" } examples_exist_and_match( /[[:alnum:]]/, /[[:alpha:]]/,