Skip to content

Commit

Permalink
Merge pull request #6 from tom-lord/named-property-lambda
Browse files Browse the repository at this point in the history
Named properties saved in PStore, for each ruby version
  • Loading branch information
tom-lord committed Jul 12, 2015
2 parents 0c8f7af + c919600 commit 60e660f
Show file tree
Hide file tree
Showing 12 changed files with 156 additions and 336 deletions.
Binary file added db/unicode_ranges_2.0.pstore
Binary file not shown.
1 change: 1 addition & 0 deletions db/unicode_ranges_2.1.pstore
Binary file added db/unicode_ranges_2.2.pstore
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ def random_example(**config_options)
end

private
def examples_by_method(method)
def examples_by_method(method)
full_examples = RegexpExamples.public_send(
method,
RegexpExamples::Parser.new(source, options).parse
)
RegexpExamples::BackReferenceReplacer.new.substitute_backreferences(full_examples)
end
end
end
end
end
Expand Down
12 changes: 11 additions & 1 deletion lib/regexp-examples.rb
Original file line number Diff line number Diff line change
@@ -1,2 +1,12 @@
Dir[File.dirname(__FILE__) + '/regexp-examples/**/*.rb'].each {|file| require file }
require_relative "regexp-examples/unicode_char_ranges"
require_relative "regexp-examples/backreferences"
require_relative "regexp-examples/chargroup_parser"
require_relative "regexp-examples/constants"
require_relative "regexp-examples/groups"
require_relative "regexp-examples/helpers"
require_relative "regexp-examples/parser"
require_relative "regexp-examples/repeaters"
require_relative "regexp-examples/unicode_char_ranges"
require_relative "regexp-examples/version"
require_relative "core_extensions/regexp/examples"

156 changes: 3 additions & 153 deletions lib/regexp-examples/constants.rb

Large diffs are not rendered by default.

21 changes: 12 additions & 9 deletions lib/regexp-examples/groups.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,12 @@ module GroupWithIgnoreCase
def result
group_result = super
if ignorecase
group_result
.concat( group_result.map(&:swapcase) )
.uniq
Enumerator.new do |ignorecase_group_result|
group_result.each do |gr|
ignorecase_group_result << gr
ignorecase_group_result << gr.swapcase
end
end.lazy
else
group_result
end
Expand All @@ -39,7 +42,7 @@ def result

module RandomResultBySample
def random_result
result.sample(1)
result.force.sample(1)
end
end

Expand All @@ -51,7 +54,7 @@ def initialize(char, ignorecase)
@ignorecase = ignorecase
end
def result
[GroupResult.new(@char)]
[GroupResult.new(@char)].lazy
end
end

Expand All @@ -62,7 +65,7 @@ def result
class PlaceHolderGroup
include RandomResultBySample
def result
[GroupResult.new('')]
[GroupResult.new('')].lazy
end
end

Expand All @@ -75,7 +78,7 @@ def initialize(chars, ignorecase)
end

def result
@chars.map do |result|
@chars.lazy.map do |result|
GroupResult.new(result)
end
end
Expand All @@ -91,7 +94,7 @@ def initialize(multiline)

def result
chars = multiline ? CharSets::Any : CharSets::AnyNoNewLine
chars.map do |result|
chars.lazy.map do |result|
GroupResult.new(result)
end
end
Expand Down Expand Up @@ -160,7 +163,7 @@ def initialize(id)
end

def result
[ GroupResult.new("__#{@id}__") ]
[ GroupResult.new("__#{@id}__") ].lazy
end
end

Expand Down
2 changes: 1 addition & 1 deletion lib/regexp-examples/repeaters.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def initialize(group)
end

def result
group_results = group.result[0 .. RegexpExamples.MaxGroupResults-1]
group_results = group.result.first(RegexpExamples.MaxGroupResults)
results = []
min_repeats.upto(max_repeats) do |repeats|
if repeats.zero?
Expand Down
47 changes: 47 additions & 0 deletions lib/regexp-examples/unicode_char_ranges.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
require 'pstore'

module RegexpExamples
  # Looks up example characters for a named unicode property (e.g. "arabic")
  # from a PStore file.
  #
  # The stored values were generated by: scripts/unicode_lister.rb
  # Note: Only the first 128 results are listed, for performance.
  # Also, some groups seem to have no matches (weird!)
  class UnicodeCharRanges
    # One store file exists per ruby MAJOR.MINOR version (the micro version
    # number is deliberately ignored). Matching the "MAJOR.MINOR" prefix,
    # rather than slicing the first three characters, stays correct for
    # multi-digit minor versions such as "3.10.0".
    STORE_FILENAME = "unicode_ranges_#{RUBY_VERSION[/\A\d+\.\d+/]}.pstore".freeze

    attr_reader :range_store

    # @param location [String] path to the PStore file. The default is
    #   resolved relative to this source file (lib/regexp-examples/), not the
    #   current working directory, so the lookup works wherever the process
    #   was started from.
    def initialize(location = File.expand_path("../../db/#{STORE_FILENAME}", __dir__))
      @range_store = PStore.new(location)
    end

    # Fetches the characters stored under +key+, using a read-only transaction.
    #
    # @param key [String] property name, as written by the generator script
    # @return [Array<String>] one single-character UTF-8 string per code point
    # NOTE(review): a key that is absent from the store yields nil from
    #   PStore#[], which is then passed on to ranges_to_unicode unchecked.
    def get(key)
      range_store.transaction(true) do
        ranges_to_unicode(range_store[key])
      end
    end

    alias_method :[], :get

    private

    # Expands the "compressed" stored form into a flat list of characters.
    # Entries are either a Range of code points, or a bare Integer (a small
    # hack to increase data compression for ranges of size one).
    #
    # Example input:  [97..99, 101]
    # Example output: ["a", "b", "c", "e"]
    def ranges_to_unicode(ranges)
      ranges.flat_map do |entry|
        # Integer (not Fixnum): Fixnum was deprecated in ruby 2.4 and has
        # since been removed, while Integer matches on every ruby version.
        codepoints = entry.is_a?(Integer) ? [entry] : entry.to_a
        codepoints.map { |codepoint| codepoint.chr(Encoding::UTF_8) }
      end
    end

    # Converts a hexadecimal code point string (e.g. "61") into the
    # corresponding UTF-8 character, without resorting to eval.
    def hex_to_unicode(hex)
      hex.to_i(16).chr(Encoding::UTF_8)
    end
  end
end

2 changes: 1 addition & 1 deletion lib/regexp-examples/version.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module RegexpExamples
VERSION = '1.1.0'
VERSION = '1.1.1'
end
179 changes: 32 additions & 147 deletions scripts/unicode_lister.rb
Original file line number Diff line number Diff line change
@@ -1,159 +1,38 @@
require 'pstore'
require_relative '../lib/regexp-examples/unicode_char_ranges'
# A script to generate lists of all unicode characters
# that match all named group/character properties regexps.
# For use in e.g. /\p{Arabic}/.examples

# To (re-)generate this list, simply run this file!
# > ruby scripts/unicode_lister.rb
OutputFilename = 'unicode_result'

# Taken from ruby documentation:
# http://ruby-doc.org//core-2.2.0/Regexp.html#class-Regexp-label-Character+Properties
NamedGroups = %w(
Alnum
Alpha
Blank
Cntrl
Digit
Graph
Lower
Print
Punct
Space
Upper
XDigit
Word
ASCII
Any
Assigned
Alnum Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word ASCII Any Assigned

L
Ll
Lm
Lo
Lt
Lu
M
Mn
Mc
Me
N
Nd
Nl
No
P
Pc
Pd
Ps
Pe
Pi
Pf
Po
S
Sm
Sc
Sk
So
Z
Zs
Zl
Zp
C
Cc
Cf
Cn
Co
Cs
L Ll Lm Lo Lt Lu M Mn Mc Me N Nd Nl No P Pc Pd Ps Pe Pi Pf Po S Sm Sc Sk So Z Zs Zl Zp C Cc Cf Cn Co Cs

Arabic
Armenian
Balinese
Bengali
Bopomofo
Braille
Buginese
Buhid
Canadian_Aboriginal
Carian
Cham
Cherokee
Common
Coptic
Cuneiform
Cypriot
Cyrillic
Deseret
Devanagari
Ethiopic
Georgian
Glagolitic
Gothic
Greek
Gujarati
Gurmukhi
Han
Hangul
Hanunoo
Hebrew
Hiragana
Inherited
Kannada
Katakana
Kayah_Li
Kharoshthi
Khmer
Lao
Latin
Lepcha
Limbu
Linear_B
Lycian
Lydian
Malayalam
Mongolian
Myanmar
New_Tai_Lue
Nko
Ogham
Ol_Chiki
Old_Italic
Old_Persian
Oriya
Osmanya
Phags_Pa
Phoenician
Rejang
Runic
Saurashtra
Shavian
Sinhala
Sundanese
Syloti_Nagri
Syriac
Tagalog
Tagbanwa
Tai_Le
Tamil
Telugu
Thaana
Thai
Tibetan
Tifinagh
Ugaritic
Vai
Yi
Arabic Armenian Balinese Bengali Bopomofo Braille Buginese Buhid Canadian_Aboriginal Carian Cham Cherokee
Common Coptic Cuneiform Cypriot Cyrillic Deseret Devanagari Ethiopic Georgian Glagolitic Gothic Greek
Gujarati Gurmukhi Han Hangul Hanunoo Hebrew Hiragana Inherited Kannada Katakana Kayah_Li Kharoshthi Khmer
Lao Latin Lepcha Limbu Linear_B Lycian Lydian Malayalam Mongolian Myanmar New_Tai_Lue Nko Ogham Ol_Chiki
Old_Italic Old_Persian Oriya Osmanya Phags_Pa Phoenician Rejang Runic Saurashtra Shavian Sinhala Sundanese
Syloti_Nagri Syriac Tagalog Tagbanwa Tai_Le Tamil Telugu Thaana Thai Tibetan Tifinagh Ugaritic Vai Yi
)

# Note: For some reason, a character encoding-related exception gets raised
# when I do `/regex/ =~ eval("?\\u{#{x.to_s(16)}}")` in the range: 55296..57343
# This means my calculation is MISSING results in the range: 55296..65535
# However, for the sake of performance, I'm also being "lazy" and only calculating/saving
# the first 128 matches anyway!
# If anyone ever cares about this (I doubt it), I'll look into fixing/improving it.
# Note: For the range 55296..57343, these are reserved values that are not legal
# unicode characters.
# I.e. a character encoding-related exception gets raised when you do:
# `/regex/ =~ eval("?\\u{#{x.to_s(16)}}")`
# TODO: Add a link to somewhere that explains this better.

# Example input: [1, 2, 3, 4, 6, 7, 12, 14] (Array)
# Example output: "1..4, 6..7, 12, 14" (String)
# "Compresses" the values in an array by using ranges.
# Example input: [1, 2, 3, 4, 6, 7, 12, 14]
# Example output: [1..4, 6..7, 12, 14]
def calculate_ranges(matching_codes)
return "" if matching_codes.empty?
return [] if matching_codes.empty?
first = matching_codes.shift
matching_codes.inject([first..first]) do |r,x|
if r.last.last.succ != x
Expand All @@ -162,19 +41,25 @@ def calculate_ranges(matching_codes)
r[0..-2] << (r.last.first..x) # Update last range
end
end
.map { |range| range.size == 1 ? range.first : range}
.join(", ")
.map { |range| range.size == 1 ? range.first : range} # Replace `int..int` with `int`
end

count = 0
File.open(OutputFilename, 'w') do |f|
filename = "db/#{RegexpExamples::UnicodeCharRanges::STORE_FILENAME}"
store = PStore.new(filename)
store.transaction do
NamedGroups.each do |name|
count += 1
matching_codes = (0..55295).lazy.select { |x| /\p{#{name}}/ =~ eval("?\\u{#{x.to_s(16)}}") }.first(128)
f.puts "'#{name.downcase}' => ranges_to_unicode(#{calculate_ranges(matching_codes)}),"
count += 1
# Only generating first 128 matches, for performance...
# (I have tried this with generating ALL examples, and it makes the ruby gem
# painfully slow and bloated... Especially the test suite.)
matching_codes = [(0..55295), (57344..65535)].map(&:to_a).flatten.lazy
.select { |x| /\p{#{name}}/ =~ eval("?\\u{#{x.to_s(16)}}") }
.first(128)
store[name.downcase] = calculate_ranges(matching_codes)
puts "(#{count}/#{NamedGroups.length}) Finished property: #{name}"
end
puts "*"*50
puts "Finished! Result stored in: #{OutputFilename}"
puts "Finished! Result stored in: #{filename}"
end

Loading

0 comments on commit 60e660f

Please sign in to comment.