Merge pull request #6 from tom-lord/named-property-lambda

Named properties saved in PStore, for each ruby version
tom-lord · Jul 12, 2015 · 60e660f · 60e660f
2 parents 0c8f7af + c919600
commit 60e660f
Show file tree

Hide file tree

Showing 12 changed files with 156 additions and 336 deletions.
diff --git a/db/unicode_ranges_2.0.pstore b/db/unicode_ranges_2.0.pstore
diff --git a/db/unicode_ranges_2.1.pstore b/db/unicode_ranges_2.1.pstore
@@ -0,0 +1 @@
+unicode_ranges_2.0.pstore
diff --git a/db/unicode_ranges_2.2.pstore b/db/unicode_ranges_2.2.pstore
diff --git a/...amples/core_extensions/regexp/examples.rb → lib/core_extensions/regexp/examples.rb b/...amples/core_extensions/regexp/examples.rb → lib/core_extensions/regexp/examples.rb
@@ -17,13 +17,13 @@ def random_example(**config_options)
       end
 
       private
-        def examples_by_method(method)
+      def examples_by_method(method)
         full_examples = RegexpExamples.public_send(
           method,
           RegexpExamples::Parser.new(source, options).parse
         )
         RegexpExamples::BackReferenceReplacer.new.substitute_backreferences(full_examples)
-        end
+      end
     end
   end
 end

diff --git a/lib/regexp-examples.rb b/lib/regexp-examples.rb
@@ -1,2 +1,12 @@
-Dir[File.dirname(__FILE__) + '/regexp-examples/**/*.rb'].each {|file| require file }
+require_relative "regexp-examples/unicode_char_ranges"
+require_relative "regexp-examples/backreferences"
+require_relative "regexp-examples/chargroup_parser"
+require_relative "regexp-examples/constants"
+require_relative "regexp-examples/groups"
+require_relative "regexp-examples/helpers"
+require_relative "regexp-examples/parser"
+require_relative "regexp-examples/repeaters"
+require_relative "regexp-examples/unicode_char_ranges"
+require_relative "regexp-examples/version"
+require_relative "core_extensions/regexp/examples"
 
diff --git a/lib/regexp-examples/constants.rb b/lib/regexp-examples/constants.rb
diff --git a/lib/regexp-examples/groups.rb b/lib/regexp-examples/groups.rb
@@ -28,9 +28,12 @@ module GroupWithIgnoreCase
     def result
       group_result = super
       if ignorecase
-        group_result
-          .concat( group_result.map(&:swapcase) )
-          .uniq
+        Enumerator.new do |ignorecase_group_result|
+          group_result.each do |gr|
+            ignorecase_group_result << gr
+            ignorecase_group_result << gr.swapcase
+          end
+        end.lazy
       else
         group_result
       end
@@ -39,7 +42,7 @@ def result
 
   module RandomResultBySample
     def random_result
-      result.sample(1)
+      result.force.sample(1)
     end
   end
 
@@ -51,7 +54,7 @@ def initialize(char, ignorecase)
       @ignorecase = ignorecase
     end
     def result
-      [GroupResult.new(@char)]
+      [GroupResult.new(@char)].lazy
     end
   end
 
@@ -62,7 +65,7 @@ def result
   class PlaceHolderGroup
     include RandomResultBySample
     def result
-      [GroupResult.new('')]
+      [GroupResult.new('')].lazy
     end
   end
 
@@ -75,7 +78,7 @@ def initialize(chars, ignorecase)
     end
 
     def result
-      @chars.map do |result|
+      @chars.lazy.map do |result|
         GroupResult.new(result)
       end
     end
@@ -91,7 +94,7 @@ def initialize(multiline)
 
     def result
       chars = multiline ? CharSets::Any : CharSets::AnyNoNewLine
-      chars.map do |result|
+      chars.lazy.map do |result|
         GroupResult.new(result)
       end
     end
@@ -160,7 +163,7 @@ def initialize(id)
     end
 
     def result
-      [ GroupResult.new("__#{@id}__") ]
+      [ GroupResult.new("__#{@id}__") ].lazy
     end
   end
 

diff --git a/lib/regexp-examples/repeaters.rb b/lib/regexp-examples/repeaters.rb
@@ -6,7 +6,7 @@ def initialize(group)
     end
 
     def result
-      group_results = group.result[0 .. RegexpExamples.MaxGroupResults-1]
+      group_results = group.result.first(RegexpExamples.MaxGroupResults)
       results = []
       min_repeats.upto(max_repeats) do |repeats|
         if repeats.zero?

diff --git a/lib/regexp-examples/unicode_char_ranges.rb b/lib/regexp-examples/unicode_char_ranges.rb
@@ -0,0 +1,47 @@
+require 'pstore'
+
+module RegexpExamples
+  class UnicodeCharRanges
+
+    # These values were generated by: scripts/unicode_lister.rb
+    # Note: Only the first 128 results are listed, for performance.
+    # Also, some groups seem to have no matches (weird!)
+    # (Don't care about ruby micro version number)
+    STORE_FILENAME = "unicode_ranges_#{RUBY_VERSION[0..2]}.pstore"
+
+    attr_reader :range_store
+
+    def initialize(location="db/#{STORE_FILENAME}")
+      @range_store = PStore.new(location)
+    end
+
+    def get(key)
+      range_store.transaction(true) do
+        ranges_to_unicode(range_store[key])
+      end
+    end
+
+    alias_method :[], :get
+
+    private
+
+    # Expands the stored code points (Integers and/or Ranges) into an array of characters.
+    # Example input: [65..67, 97] -- Example output: ["A", "B", "C", "a"]
+    def ranges_to_unicode(ranges)
+      result = []
+      ranges.each do |range|
+        if range.is_a? Fixnum # Small hack to increase data compression
+          result << hex_to_unicode(range.to_s(16))
+        else
+          range.each { |num| result << hex_to_unicode(num.to_s(16)) }
+        end
+      end
+      result
+    end
+
+    def hex_to_unicode(hex)
+      eval("?\\u{#{hex}}")
+    end
+  end
+end
+
diff --git a/lib/regexp-examples/version.rb b/lib/regexp-examples/version.rb
@@ -1,3 +1,3 @@
 module RegexpExamples
-  VERSION = '1.1.0'
+  VERSION = '1.1.1'
 end
diff --git a/scripts/unicode_lister.rb b/scripts/unicode_lister.rb
@@ -1,159 +1,38 @@
+require 'pstore'
+require_relative '../lib/regexp-examples/unicode_char_ranges'
 # A script to generate lists of all unicode characters
 # that match all named group/character properties regexps.
 # For use in e.g. /\p{Arabic}/.examples
 
 # To (re-)generate this list, simply run this file!
 # > ruby scripts/unicode_lister.rb
-OutputFilename = 'unicode_result'
 
 # Taken from ruby documentation:
 # http://ruby-doc.org//core-2.2.0/Regexp.html#class-Regexp-label-Character+Properties
 NamedGroups = %w(
-  Alnum
-  Alpha
-  Blank
-  Cntrl
-  Digit
-  Graph
-  Lower
-  Print
-  Punct
-  Space
-  Upper
-  XDigit
-  Word
-  ASCII
-  Any
-  Assigned
+  Alnum Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word ASCII Any Assigned
 
-  L
-  Ll
-  Lm
-  Lo
-  Lt
-  Lu
-  M
-  Mn
-  Mc
-  Me
-  N
-  Nd
-  Nl
-  No
-  P
-  Pc
-  Pd
-  Ps
-  Pe
-  Pi
-  Pf
-  Po
-  S
-  Sm
-  Sc
-  Sk
-  So
-  Z
-  Zs
-  Zl
-  Zp
-  C
-  Cc
-  Cf
-  Cn
-  Co
-  Cs
+  L Ll Lm Lo Lt Lu M Mn Mc Me N Nd Nl No P Pc Pd Ps Pe Pi Pf Po S Sm Sc Sk So Z Zs Zl Zp C Cc Cf Cn Co Cs
 
-  Arabic
-  Armenian
-  Balinese
-  Bengali
-  Bopomofo
-  Braille
-  Buginese
-  Buhid
-  Canadian_Aboriginal
-  Carian
-  Cham
-  Cherokee
-  Common
-  Coptic
-  Cuneiform
-  Cypriot
-  Cyrillic
-  Deseret
-  Devanagari
-  Ethiopic
-  Georgian
-  Glagolitic
-  Gothic
-  Greek
-  Gujarati
-  Gurmukhi
-  Han
-  Hangul
-  Hanunoo
-  Hebrew
-  Hiragana
-  Inherited
-  Kannada
-  Katakana
-  Kayah_Li
-  Kharoshthi
-  Khmer
-  Lao
-  Latin
-  Lepcha
-  Limbu
-  Linear_B
-  Lycian
-  Lydian
-  Malayalam
-  Mongolian
-  Myanmar
-  New_Tai_Lue
-  Nko
-  Ogham
-  Ol_Chiki
-  Old_Italic
-  Old_Persian
-  Oriya
-  Osmanya
-  Phags_Pa
-  Phoenician
-  Rejang
-  Runic
-  Saurashtra
-  Shavian
-  Sinhala
-  Sundanese
-  Syloti_Nagri
-  Syriac
-  Tagalog
-  Tagbanwa
-  Tai_Le
-  Tamil
-  Telugu
-  Thaana
-  Thai
-  Tibetan
-  Tifinagh
-  Ugaritic
-  Vai
-  Yi
+  Arabic Armenian Balinese Bengali Bopomofo Braille Buginese Buhid Canadian_Aboriginal Carian Cham Cherokee
+  Common Coptic Cuneiform Cypriot Cyrillic Deseret Devanagari Ethiopic Georgian Glagolitic Gothic Greek
+  Gujarati Gurmukhi Han Hangul Hanunoo Hebrew Hiragana Inherited Kannada Katakana Kayah_Li Kharoshthi Khmer
+  Lao Latin Lepcha Limbu Linear_B Lycian Lydian Malayalam Mongolian Myanmar New_Tai_Lue Nko Ogham Ol_Chiki
+  Old_Italic Old_Persian Oriya Osmanya Phags_Pa Phoenician Rejang Runic Saurashtra Shavian Sinhala Sundanese
+  Syloti_Nagri Syriac Tagalog Tagbanwa Tai_Le Tamil Telugu Thaana Thai Tibetan Tifinagh Ugaritic Vai Yi
 )
 
-# Note: For some reason, a character encoding-related exception gets raised
-# when I do `/regex/ =~ eval("?\\u{#{x.to_s(16)}}")` in the range: 55296..57343
-# This means my calculation is MISSING results in the range: 55296..65535
-# However, for the sake of performance, I'm also being "lazy" and only calculating/saving
-# the first 128 matches anyway!
-# If anyone ever cares about this (I doubt it), I'll look into fixing/improving it.
+# Note: The range 55296..57343 (U+D800..U+DFFF) holds the UTF-16 surrogate code
+# points. These are reserved values that are not legal unicode characters.
+# I.e. a character encoding-related exception gets raised when you do:
+# `/regex/ =~ eval("?\\u{#{x.to_s(16)}}")`
+# TODO: Add a link to somewhere that explains this better.
 
-# Example input: [1, 2, 3, 4, 6, 7, 12, 14] (Array)
-# Example output: "1..4, 6..7, 12, 14" (String)
+# "Compresses" the values in an array by using ranges.
+# Example input: [1, 2, 3, 4, 6, 7, 12, 14]
+# Example output: [1..4, 6..7, 12, 14]
 def calculate_ranges(matching_codes)
-  return "" if matching_codes.empty?
+  return [] if matching_codes.empty?
   first = matching_codes.shift
   matching_codes.inject([first..first]) do |r,x|
     if r.last.last.succ != x
@@ -162,19 +41,25 @@ def calculate_ranges(matching_codes)
       r[0..-2] << (r.last.first..x) # Update last range
     end
   end
-    .map { |range| range.size == 1 ? range.first : range}
-    .join(", ")
+    .map { |range| range.size == 1 ? range.first : range} # Replace `int..int` with `int`
 end
 
 count = 0
-File.open(OutputFilename, 'w') do |f|
+filename = "db/#{RegexpExamples::UnicodeCharRanges::STORE_FILENAME}"
+store = PStore.new(filename)
+store.transaction do
   NamedGroups.each do |name|
-  count += 1
-    matching_codes = (0..55295).lazy.select { |x| /\p{#{name}}/ =~ eval("?\\u{#{x.to_s(16)}}") }.first(128)
-    f.puts "'#{name.downcase}' => ranges_to_unicode(#{calculate_ranges(matching_codes)}),"
+    count += 1
+    # Only generating first 128 matches, for performance...
+    # (I have tried this with generating ALL examples, and it makes the ruby gem
+    # painfully slow and bloated... Especially the test suite.)
+    matching_codes = [(0..55295), (57344..65535)].map(&:to_a).flatten.lazy
+      .select { |x| /\p{#{name}}/ =~ eval("?\\u{#{x.to_s(16)}}") }
+      .first(128)
+    store[name.downcase] = calculate_ranges(matching_codes)
     puts "(#{count}/#{NamedGroups.length}) Finished property: #{name}"
   end
   puts "*"*50
-  puts "Finished! Result stored in: #{OutputFilename}"
+  puts "Finished! Result stored in: #{filename}"
 end