From c3f854430876ddd5d318315a0c5fb4c13a563622 Mon Sep 17 00:00:00 2001 From: tom-lord Date: Sat, 18 May 2024 12:17:52 +0100 Subject: [PATCH 1/6] Update gemspec --- .ruby-version | 2 +- Gemfile | 1 - regexp-examples.gemspec | 7 ++++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.ruby-version b/.ruby-version index 4a36342..ef538c2 100644 --- a/.ruby-version +++ b/.ruby-version @@ -1 +1 @@ -3.0.0 +3.1.2 diff --git a/Gemfile b/Gemfile index 29b0bc6..553544a 100644 --- a/Gemfile +++ b/Gemfile @@ -3,7 +3,6 @@ source 'https://rubygems.org' group :test do gem 'rspec' gem 'coveralls', require: false - gem 'pry' end # Specify your gem's dependencies in regexp-examples.gemspec diff --git a/regexp-examples.gemspec b/regexp-examples.gemspec index f433c9f..562197f 100644 --- a/regexp-examples.gemspec +++ b/regexp-examples.gemspec @@ -14,9 +14,10 @@ Gem::Specification.new do |s| s.test_files = s.files.grep(/^(test|spec|features)\//) s.require_paths = ['lib'] s.homepage = 'http://rubygems.org/gems/regexp-examples' - s.add_development_dependency 'bundler', '> 1.7' - s.add_development_dependency 'rake', '~> 12.0' - s.add_development_dependency 'pry', '~> 0.12.0' + s.add_dependency 'regexp_property_values', '~> 1.5' + s.add_development_dependency 'bundler', '~> 2.4' + s.add_development_dependency 'rake', '~> 13.0' + s.add_development_dependency 'pry' s.add_development_dependency 'warning', '~> 0.10.0' s.license = 'MIT' s.required_ruby_version = '>= 2.4.0' From aa435863343f05b37e2b274f78f74beff75aa860 Mon Sep 17 00:00:00 2001 From: tom-lord Date: Sat, 18 May 2024 13:43:53 +0100 Subject: [PATCH 2/6] Replace pstore implementation with gem --- .../parse_after_backslash_group_helper.rb | 4 +++- spec/regexp-examples_spec.rb | 24 ++----------------- 2 files changed, 5 insertions(+), 23 deletions(-) diff --git a/lib/regexp-examples/parser_helpers/parse_after_backslash_group_helper.rb b/lib/regexp-examples/parser_helpers/parse_after_backslash_group_helper.rb index 
b011c8c..886f377 100644 --- a/lib/regexp-examples/parser_helpers/parse_after_backslash_group_helper.rb +++ b/lib/regexp-examples/parser_helpers/parse_after_backslash_group_helper.rb @@ -1,3 +1,5 @@ +require 'regexp_property_values' + module RegexpExamples # A collection of related helper methods, utilised by the `Parser` class module ParseAfterBackslashGroupHelper @@ -97,7 +99,7 @@ def parse_backslash_named_property(p_negation, caret_negation, property_name) # Beware of double negatives! E.g. /\P{^Space}/ is_negative = (p_negation == 'P') ^ (caret_negation == '^') CharGroup.new( - negate_if(CharSets::NamedPropertyCharMap[property_name.downcase], is_negative), + negate_if(RegexpPropertyValues[property_name.downcase].matched_characters, is_negative), @ignorecase ) end diff --git a/spec/regexp-examples_spec.rb b/spec/regexp-examples_spec.rb index 3c49b5c..e419fce 100644 --- a/spec/regexp-examples_spec.rb +++ b/spec/regexp-examples_spec.rb @@ -190,20 +190,8 @@ def self.examples_are_empty(*regexps) /\P{Ll}/, # Negation syntax type 2 /\P{^Ll}/ # Double negation!! (Should cancel out) ) - # An exhaustive set of tests for all named properties!!! 
This is useful - # for verifying the PStore contains correct values for all ruby versions - %w[ - Alnum Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit - Word ASCII Any Assigned L Ll Lm Lo Lt Lu M Mn Mc Me N Nd Nl No P Pc Pd - Ps Pe Pi Pf Po S Sm Sc Sk So Z Zs Zl Zp C Cc Cf Cn Co Arabic Armenian - Balinese Bengali Bopomofo Braille Buginese Buhid Canadian_Aboriginal - Cham Cherokee Common Coptic Cyrillic Devanagari Ethiopic Georgian - Glagolitic Greek Gujarati Gurmukhi Han Hangul Hanunoo Hebrew Hiragana - Inherited Kannada Katakana Kayah_Li Khmer Lao Latin Lepcha Limbu Malayalam - Mongolian Myanmar New_Tai_Lue Nko Ogham Ol_Chiki Oriya Phags_Pa Rejang - Runic Saurashtra Sinhala Sundanese Syloti_Nagri Syriac Tagalog Tagbanwa - Tai_Le Tamil Telugu Thaana Thai Tibetan Tifinagh Vai Yi - ].each do |property| + + RegexpPropertyValues.all_for_current_ruby.map(&:identifier).each do |property| it "examples for /\p{#{property}}/" do regexp_examples = /\p{#{property}}/.examples(max_group_results: 99_999) expect(regexp_examples) @@ -214,14 +202,6 @@ def self.examples_are_empty(*regexps) expect(regexp_examples.join('')).to match(/\A\p{#{property}}+\z/) end end - - # The following seem to genuinely have no matching examples (!!??!!?!) 
- %w[ - Cs Carian Cuneiform Cypriot Deseret Gothic Kharoshthi Linear_B Lycian - Lydian Old_Italic Old_Persian Osmanya Phoenician Shavian Ugaritic - ].each do |property| - examples_are_empty(/\p{#{property}}/) - end end context 'for control characters' do From 3a13bc0eaa7eba5e589dff9317c6cd75146feac8 Mon Sep 17 00:00:00 2001 From: tom-lord Date: Mon, 20 May 2024 18:56:07 +0100 Subject: [PATCH 3/6] Passing specs --- .../parser_helpers/charset_negation_helper.rb | 2 +- .../parse_after_backslash_group_helper.rb | 10 ++++++-- spec/regexp-examples_spec.rb | 23 ++++++++++++------- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/lib/regexp-examples/parser_helpers/charset_negation_helper.rb b/lib/regexp-examples/parser_helpers/charset_negation_helper.rb index a243b93..e142f2f 100644 --- a/lib/regexp-examples/parser_helpers/charset_negation_helper.rb +++ b/lib/regexp-examples/parser_helpers/charset_negation_helper.rb @@ -2,7 +2,7 @@ module RegexpExamples module CharsetNegationHelper def negate_if(charset, is_negative) - is_negative ? (CharSets::Any.dup - charset) : charset + is_negative ? (CharSets::Any.dup - charset.to_a) : charset end end end diff --git a/lib/regexp-examples/parser_helpers/parse_after_backslash_group_helper.rb b/lib/regexp-examples/parser_helpers/parse_after_backslash_group_helper.rb index 886f377..a1cfb52 100644 --- a/lib/regexp-examples/parser_helpers/parse_after_backslash_group_helper.rb +++ b/lib/regexp-examples/parser_helpers/parse_after_backslash_group_helper.rb @@ -93,13 +93,19 @@ def parse_backslash_unicode_sequence(full_hex_sequence) end def parse_backslash_named_property(p_negation, caret_negation, property_name) - @current_position += (caret_negation.length + # 0 or 1, of '^' is present + @current_position += (caret_negation.length + # 0 or 1, if '^' is present property_name.length + 2) # Length of opening and closing brackets (always 2) # Beware of double negatives! E.g. 
/\P{^Space}/ is_negative = (p_negation == 'P') ^ (caret_negation == '^') CharGroup.new( - negate_if(RegexpPropertyValues[property_name.downcase].matched_characters, is_negative), + negate_if( + RegexpPropertyValues[property_name.downcase] + .matched_codepoints + .lazy + .filter_map { |cp| cp.chr('utf-8') unless cp.between?(0xD800, 0xDFFF) }, + is_negative + ), @ignorecase ) end diff --git a/spec/regexp-examples_spec.rb b/spec/regexp-examples_spec.rb index e419fce..a64c953 100644 --- a/spec/regexp-examples_spec.rb +++ b/spec/regexp-examples_spec.rb @@ -191,15 +191,22 @@ def self.examples_are_empty(*regexps) /\P{^Ll}/ # Double negation!! (Should cancel out) ) + expected_empty_properties = %w[surrogate inlowsurrogates inhighsurrogates inhighprivateusesurrogates] + RegexpPropertyValues.all_for_current_ruby.map(&:identifier).each do |property| - it "examples for /\p{#{property}}/" do - regexp_examples = /\p{#{property}}/.examples(max_group_results: 99_999) - expect(regexp_examples) - .not_to be_empty, - "No examples were generated for regexp: /\p{#{property}}/" - # Just do one big check, for test system performance (~30% faster) - # (Otherwise, we're doing up to 128 checks on 123 properties!!!) - expect(regexp_examples.join('')).to match(/\A\p{#{property}}+\z/) + if(expected_empty_properties).include?(property) + examples_are_empty(/\p{#{property}}/) + else + it "examples for /\p{#{property}}/" do + regexp_examples = /\p{#{property}}/.examples(max_group_results: 99_999) + + expect(regexp_examples) + .not_to be_empty, + "No examples were generated for regexp: /\p{#{property}}/" + # Just do one big check, for test system performance (~30% faster) + # (Otherwise, we're doing up to 128 checks on 123 properties!!!) 
+ expect(regexp_examples.join('')).to match(/\A\p{#{property}}+\z/) + end end end end From 62d93c69c96d29f4766234707106deebad17ed0a Mon Sep 17 00:00:00 2001 From: tom-lord Date: Mon, 20 May 2024 19:22:47 +0100 Subject: [PATCH 4/6] Delete code --- lib/regexp-examples.rb | 2 - lib/regexp-examples/char_sets.rb | 2 - lib/regexp-examples/unicode_char_ranges.rb | 59 ------------------- scripts/unicode_lister.rb | 68 ---------------------- 4 files changed, 131 deletions(-) delete mode 100644 lib/regexp-examples/unicode_char_ranges.rb delete mode 100644 scripts/unicode_lister.rb diff --git a/lib/regexp-examples.rb b/lib/regexp-examples.rb index 0e91d60..26700ec 100644 --- a/lib/regexp-examples.rb +++ b/lib/regexp-examples.rb @@ -1,4 +1,3 @@ -require_relative 'regexp-examples/unicode_char_ranges' require_relative 'regexp-examples/chargroup_parser' require_relative 'regexp-examples/config' require_relative 'regexp-examples/char_sets' @@ -8,6 +7,5 @@ require_relative 'regexp-examples/helpers' require_relative 'regexp-examples/parser' require_relative 'regexp-examples/repeaters' -require_relative 'regexp-examples/unicode_char_ranges' require_relative 'regexp-examples/version' require_relative 'core_extensions/regexp/examples' diff --git a/lib/regexp-examples/char_sets.rb b/lib/regexp-examples/char_sets.rb index 825693e..cc5402f 100644 --- a/lib/regexp-examples/char_sets.rb +++ b/lib/regexp-examples/char_sets.rb @@ -53,7 +53,5 @@ module CharSets 'word' => Word, 'ascii' => Any }.freeze - - NamedPropertyCharMap = UnicodeCharRanges.instance end.freeze end diff --git a/lib/regexp-examples/unicode_char_ranges.rb b/lib/regexp-examples/unicode_char_ranges.rb deleted file mode 100644 index ad18ce7..0000000 --- a/lib/regexp-examples/unicode_char_ranges.rb +++ /dev/null @@ -1,59 +0,0 @@ -require 'pstore' -require 'singleton' - -module RegexpExamples - # Interface to the retrieve the character sets that match a regex named property. - # E.g. 
`/\p{Alpha}/` - # These matching values are stored, compressed, in a PStore. They are specific to - # the ruby minor version. - class UnicodeCharRanges - include Singleton - # These values were generated by: scripts/unicode_lister.rb - # Note: Only the first 128 results are listed, for performance. - # Also, some groups seem to have no matches (weird!) - STORE_FILENAME = "unicode_ranges_#{RbConfig::CONFIG['UNICODE_VERSION']}.pstore".freeze - - attr_reader :range_store - - def initialize - @range_store = PStore.new(unicode_ranges_file) - end - - def get(key) - range_store.transaction(true) do - ranges_to_unicode(range_store[key]) - end - end - - alias [] get - - private - - # The method is written like this to future-proof it a little, - # i.e. the gem won't completely break for a new ruby version release - def unicode_ranges_file - db_path = File.join(__dir__, '../../db') - Dir["#{db_path}/*.pstore"].sort.select do |file| - file <= "#{db_path}/#{STORE_FILENAME}" - end.last - end - - # TODO: Document example input/output of this method - # It's pretty simple, but this code is a little confusing!! - def ranges_to_unicode(ranges) - result = [] - ranges.each do |range| - if range.is_a? Integer # Small hack to increase data compression - result << hex_to_unicode(range.to_s(16)) - else - range.each { |num| result << hex_to_unicode(num.to_s(16)) } - end - end - result - end - - def hex_to_unicode(hex) - [hex.to_i(16)].pack('U') - end - end -end diff --git a/scripts/unicode_lister.rb b/scripts/unicode_lister.rb deleted file mode 100644 index 253ace9..0000000 --- a/scripts/unicode_lister.rb +++ /dev/null @@ -1,68 +0,0 @@ -require 'pstore' -require_relative '../lib/regexp-examples/unicode_char_ranges' -# A script to generate lists of all unicode characters -# that match all named group/character properties regexps. -# For use in e.g. /\p{Arabic}/.examples - -# To (re-)generate this list, simply run this file! 
-# > ruby scripts/unicode_lister.rb - -# Taken from ruby documentation: -# http://ruby-doc.org//core-2.2.0/Regexp.html#class-Regexp-label-Character+Properties -NAMED_GROUPS = %w( - Alnum Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word ASCII - Any Assigned - - L Ll Lm Lo Lt Lu M Mn Mc Me N Nd Nl No P Pc Pd Ps Pe Pi Pf Po S Sm Sc Sk So Z Zs Zl - Zp C Cc Cf Cn Co Cs - - Arabic Armenian Balinese Bengali Bopomofo Braille Buginese Buhid Canadian_Aboriginal - Carian Cham Cherokee Common Coptic Cuneiform Cypriot Cyrillic Deseret Devanagari - Ethiopic Georgian Glagolitic Gothic Greek Gujarati Gurmukhi Han Hangul Hanunoo Hebrew - Hiragana Inherited Kannada Katakana Kayah_Li Kharoshthi Khmer Lao Latin Lepcha Limbu - Linear_B Lycian Lydian Malayalam Mongolian Myanmar New_Tai_Lue Nko Ogham Ol_Chiki - Old_Italic Old_Persian Oriya Osmanya Phags_Pa Phoenician Rejang Runic Saurashtra - Shavian Sinhala Sundanese Syloti_Nagri Syriac Tagalog Tagbanwa Tai_Le Tamil Telugu - Thaana Thai Tibetan Tifinagh Ugaritic Vai Yi -) - -# Note: For the range 55296..57343, these are reserved values that are not legal -# unicode characters. -# I.e. a character encoding-related exception gets raised when you do: -# `/regex/ =~ eval("?\\u{#{x.to_s(16)}}")` -# TODO: Add a link to somewhere that explains this better. - -# "Compresses" the values in an array by using ranges. -# Example input: [1, 2, 3, 4, 6, 7, 12, 14] -# Example output: [1..4, 6..7, 12, 14] -def calculate_ranges(matching_codes) - return [] if matching_codes.empty? - first = matching_codes.shift - matching_codes.inject([first..first]) do |r, x| - if r.last.last.succ != x - r << (x..x) # Start new range - else - r[0..-2] << (r.last.first..x) # Update last range - end - end - .map { |range| range.size == 1 ? 
range.first : range } # Replace `int..int` with `int` -end - -count = 0 -filename = "./db/#{RegexpExamples::UnicodeCharRanges::STORE_FILENAME}" -store = PStore.new(filename) -store.transaction do - NAMED_GROUPS.each do |name| - count += 1 - # Only generating first 128 matches, for performance... - # (I have tried this with generating ALL examples, and it makes the ruby gem - # painfully slow and bloated... Especially the test suite.) - matching_codes = [(0..55_295), (57_344..65_535)].map(&:to_a).flatten.lazy - .select { |x| /\p{#{name}}/ =~ eval("?\\u{#{x.to_s(16)}}") } - .first(128) - store[name.downcase] = calculate_ranges(matching_codes) - puts "(#{count}/#{NAMED_GROUPS.length}) Finished property: #{name}" - end - puts '*' * 50 - puts "Finished! Result stored in: #{filename}" -end From 47c8c96562ef107088bf3f2fcc7df2c4c6c39270 Mon Sep 17 00:00:00 2001 From: tom-lord Date: Mon, 20 May 2024 19:28:25 +0100 Subject: [PATCH 5/6] Remove now-redundant downcase --- .../parser_helpers/parse_after_backslash_group_helper.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/regexp-examples/parser_helpers/parse_after_backslash_group_helper.rb b/lib/regexp-examples/parser_helpers/parse_after_backslash_group_helper.rb index a1cfb52..e5e947e 100644 --- a/lib/regexp-examples/parser_helpers/parse_after_backslash_group_helper.rb +++ b/lib/regexp-examples/parser_helpers/parse_after_backslash_group_helper.rb @@ -100,7 +100,7 @@ def parse_backslash_named_property(p_negation, caret_negation, property_name) is_negative = (p_negation == 'P') ^ (caret_negation == '^') CharGroup.new( negate_if( - RegexpPropertyValues[property_name.downcase] + RegexpPropertyValues[property_name] .matched_codepoints .lazy .filter_map { |cp| cp.chr('utf-8') unless cp.between?(0xD800, 0xDFFF) }, From 3d89ff247ac5a60f19b2b25f1eab8da54a22e2d6 Mon Sep 17 00:00:00 2001 From: tom-lord Date: Mon, 20 May 2024 19:28:31 +0100 Subject: [PATCH 6/6] Update comment --- spec/regexp-examples_spec.rb | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/regexp-examples_spec.rb b/spec/regexp-examples_spec.rb index a64c953..f64560c 100644 --- a/spec/regexp-examples_spec.rb +++ b/spec/regexp-examples_spec.rb @@ -204,7 +204,7 @@ def self.examples_are_empty(*regexps) .not_to be_empty, "No examples were generated for regexp: /\p{#{property}}/" # Just do one big check, for test system performance (~30% faster) - # (Otherwise, we're doing up to 128 checks on 123 properties!!!) + # (Otherwise, we're potentially doing 99999 checks on 123 properties!!!) expect(regexp_examples.join('')).to match(/\A\p{#{property}}+\z/) end end