Merge pull request #7 from tom-lord/OrGroup_random_example_probability_distribution

Or group random example probability distribution
tom-lord · Jul 17, 2015 · a0d3cf8 · a0d3cf8
2 parents 0ef73be + 28cbfcb
commit a0d3cf8
Show file tree

Hide file tree

Showing 6 changed files with 88 additions and 54 deletions.
diff --git a/.rubocop.yml b/.rubocop.yml
@@ -0,0 +1,2 @@
+Metrics/LineLength:
+  Max: 90
diff --git a/lib/regexp-examples/chargroup_parser.rb b/lib/regexp-examples/chargroup_parser.rb
@@ -25,29 +25,11 @@ def parse
       until next_char == ']'
         case next_char
         when '['
-          @current_position += 1
-          sub_group_parser = self.class.new(rest_of_string, is_sub_group: true)
-          @charset.concat sub_group_parser.result
-          @current_position += sub_group_parser.length
+          parse_sub_group_concat
         when '-'
-          if regexp_string[@current_position + 1] == ']' # e.g. /[abc-]/ -- not a range!
-            @charset << '-'
-            @current_position += 1
-          else
-            @current_position += 1
-            @charset.concat (@charset.last..parse_checking_backlash.first).to_a
-            @current_position += 1
-          end
+          parse_after_hyphen
         when '&'
-          if regexp_string[@current_position + 1] == '&'
-            @current_position += 2
-            sub_group_parser = self.class.new(rest_of_string, is_sub_group: true)
-            @charset &= sub_group_parser.result
-            @current_position += (sub_group_parser.length - 1)
-          else
-            @charset << '&'
-            @current_position += 1
-          end
+          parse_after_ampersand
         else
           @charset.concat parse_checking_backlash
           @current_position += 1
@@ -116,6 +98,40 @@ def parse_after_backslash
       end
     end
 
+    def parse_sub_group_concat
+      @current_position += 1
+      sub_group_parser = self.class.new(rest_of_string, is_sub_group: true)
+      @charset.concat sub_group_parser.result
+      @current_position += sub_group_parser.length
+    end
+
+    def parse_after_ampersand
+      if regexp_string[@current_position + 1] == '&'
+        parse_sub_group_intersect
+      else
+        @charset << '&'
+        @current_position += 1
+      end
+    end
+
+    def parse_sub_group_intersect
+      @current_position += 2
+      sub_group_parser = self.class.new(rest_of_string, is_sub_group: true)
+      @charset &= sub_group_parser.result
+      @current_position += (sub_group_parser.length - 1)
+    end
+
+    def parse_after_hyphen
+      if regexp_string[@current_position + 1] == ']' # e.g. /[abc-]/ -- not a range!
+        @charset << '-'
+        @current_position += 1
+      else
+        @current_position += 1
+        @charset.concat (@charset.last..parse_checking_backlash.first).to_a
+        @current_position += 1
+      end
+    end
+
     def rest_of_string
       regexp_string[@current_position..-1]
     end

diff --git a/lib/regexp-examples/groups.rb b/lib/regexp-examples/groups.rb
@@ -153,38 +153,45 @@ def result_by_method(method)
   end
 
   # A boolean "or" group.
-  # It really is boolean: The implementation is to pass in 2 set of
-  # (repeaters of) groups. The simplest example is: /a|b/
-  # If you have more than one boolean "or" operator, then this is
-  # constructed using multiple *boolean* OrGroups, e.g.
-  # /a|b|c|d/ is treated like /((a|b)|c)|d/
+  # The implementation is to pass in 2 sets of (repeaters of) groups.
+  # The simplest example is: /a|b/
+  # If you have more than one boolean "or" operator, then this is initially
+  # parsed as an OrGroup containing another OrGroup. However, in order to avoid
+  # probability distribution issues in Regexp#random_example, this then gets
+  # simplified down to one OrGroup containing 3+ repeaters.
   class OrGroup
+    attr_reader :repeaters_list
+
     def initialize(left_repeaters, right_repeaters)
-      @left_repeaters = left_repeaters
-      @right_repeaters = right_repeaters
+      @repeaters_list = [left_repeaters, *merge_if_orgroup(right_repeaters)]
     end
 
     def result
       result_by_method(:map_results)
     end
 
     def random_result
-      # TODO: This logic is flawed in terms of choosing a truly "random" example! E.g.
-      # /a|b|c|d/.random_example will choose a letter with the following probabilities:
-      # a = 50%, b = 25%, c = 12.5%, d = 12.5%
-      # In order to fix this, I must either apply some weighted selection logic,
-      # or change how the OrGroup examples are generated
-      # - i.e. make this class work with >2 repeaters
       result_by_method(:map_random_result).sample(1)
     end
 
     private
 
     def result_by_method(method)
-      left_result = RegexpExamples.public_send(method, @left_repeaters)
-      right_result = RegexpExamples.public_send(method, @right_repeaters)
-      left_result.concat(right_result).flatten.uniq.map do |result|
-        GroupResult.new(result)
+      repeaters_list.map do |repeaters|
+        RegexpExamples.public_send(method, repeaters)
+      end
+        .inject(:concat)
+        .map do |result|
+          GroupResult.new(result)
+        end
+        .uniq
+    end
+
+    def merge_if_orgroup(repeaters)
+      if repeaters.size == 1 && repeaters.first.is_a?(OrGroup)
+        repeaters.first.repeaters_list
+      else
+        [repeaters]
       end
     end
   end

diff --git a/lib/regexp-examples/parser.rb b/lib/regexp-examples/parser.rb
@@ -15,7 +15,7 @@ def parse
       repeaters = []
       until end_of_regexp
         group = parse_group(repeaters)
-        return [OneTimeRepeater.new(group)] if group.is_a? OrGroup
+        return [group] if group.is_a? OrGroup
         @current_position += 1
         repeaters << parse_repeater(group)
       end
@@ -148,7 +148,7 @@ def parse_after_backslash_group
         ) # Using "\r\n" as one character is little bit hacky...
       when next_char == 'g' # Subexpression call
         fail IllegalSyntaxError,
-          'Subexpression calls (\\g) cannot be supported, as they are not regular'
+             'Subexpression calls (\\g) cannot be supported, as they are not regular'
       when next_char =~ /[bB]/ # Anchors
         raise_anchors_exception!
       when next_char =~ /[AG]/ # Start of string
@@ -159,6 +159,7 @@ def parse_after_backslash_group
         end
       when next_char =~ /[zZ]/ # End of string
         if @current_position == (regexp_string.length - 1)
+          # TODO: /\Z/ should be treated as /\n?/
           group = PlaceHolderGroup.new
         else
           raise_anchors_exception!
@@ -212,10 +213,10 @@ def parse_multi_group
             end
           when %w(! =).include?(match[2]) # e.g. /(?=lookahead)/, /(?!neglookahead)/
             fail IllegalSyntaxError,
-              'Lookaheads are not regular; cannot generate examples'
+                 'Lookaheads are not regular; cannot generate examples'
           when %w(! =).include?(match[3]) # e.g. /(?<=lookbehind)/, /(?<!neglookbehind)/
             fail IllegalSyntaxError,
-              'Lookbehinds are not regular; cannot generate examples'
+                 'Lookbehinds are not regular; cannot generate examples'
           else # e.g. /(?<name>namedgroup)/
             @current_position += (match[3].length + 3)
             group_id = match[3]
@@ -237,12 +238,14 @@ def remember_old_regexp_options
     end
 
     def regexp_options_toggle(on, off)
-      @ignorecase = true if on.include? 'i'
-      @ignorecase = false if off.include? 'i'
-      @multiline = true if on.include? 'm'
-      @multiline = false if off.include? 'm'
-      @extended = true if on.include? 'x'
-      @extended = false if off.include? 'x'
+      regexp_option_toggle(on, off, '@ignorecase', 'i')
+      regexp_option_toggle(on, off, '@multiline', 'm')
+      regexp_option_toggle(on, off, '@extended', 'x')
+    end
+
+    def regexp_option_toggle(on, off, var, char)
+      instance_variable_set(var, true) if on.include? char
+      instance_variable_set(var, false) if off.include? char
     end
 
     def parse_char_group
@@ -327,7 +330,7 @@ def parse_reluctant_or_possessive_range_repeater(repeater, min, has_comma, max)
 
     def raise_anchors_exception!
       fail IllegalSyntaxError,
-        "Anchors ('#{next_char}') cannot be supported, as they are not regular"
+           "Anchors ('#{next_char}') cannot be supported, as they are not regular"
     end
 
     def parse_one_time_repeater(group)

diff --git a/lib/regexp-examples/version.rb b/lib/regexp-examples/version.rb
@@ -1,3 +1,3 @@
 module RegexpExamples
-  VERSION = '1.1.2'
+  VERSION = '1.1.3'
 end
diff --git a/spec/regexp-examples_spec.rb b/spec/regexp-examples_spec.rb
@@ -4,8 +4,9 @@ def self.examples_exist_and_match(*regexps)
       it "examples for /#{regexp.source}/" do
         regexp_examples = regexp.examples(max_group_results: 99_999)
 
-        expect(regexp_examples).not_to be_empty,
-          "No examples were generated for regexp: /#{regexp.source}/"
+        expect(regexp_examples)
+          .not_to be_empty,
+            "No examples were generated for regexp: /#{regexp.source}/"
         regexp_examples.each do |example|
           expect(example).to match(/\A(?:#{regexp.source})\z/)
         end
@@ -205,8 +206,9 @@ def self.examples_are_empty(*regexps)
       ).each do |property|
         it "examples for /\p{#{property}}/" do
           regexp_examples = /\p{#{property}}/.examples(max_group_results: 99_999)
-          expect(regexp_examples).not_to be_empty,
-            "No examples were generated for regexp: /\p{#{property}}/"
+          expect(regexp_examples)
+            .not_to be_empty,
+              "No examples were generated for regexp: /\p{#{property}}/"
           # Just do one big check, for test system performance (~30% faster)
           # (Otherwise, we're doing up to 128 checks on 123 properties!!!)
           expect(regexp_examples.join('')).to match(/\A\p{#{property}}+\z/)
@@ -301,6 +303,10 @@ def self.examples_are_empty(*regexps)
         it { expect(/(a|b){2}/.examples).to match_array %w(aa ab ba bb) }
         it { expect(/a+|b?/.examples).to match_array ['a', 'aa', 'aaa', '', 'b'] }
 
+        # Only display unique examples:
+        it { expect(/a|a|b|b/.examples).to match_array ['a', 'b'] }
+        it { expect(/[ccdd]/.examples).to match_array ['c', 'd'] }
+
         # a{1}? should be equivalent to (?:a{1})?, i.e. NOT a "non-greedy quantifier"
         it { expect(/a{1}?/.examples).to match_array ['', 'a'] }
       end