Skip to content

Commit

Permalink
Merge pull request #98 from artichoke/unicode-14.0
Browse files Browse the repository at this point in the history
Update case folding rules to Unicode 14.0.0
  • Loading branch information
lopopolo authored Nov 15, 2021
2 parents 7d834df + 77f373e commit 75a4530
Show file tree
Hide file tree
Showing 8 changed files with 172 additions and 23 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "focaccia"
version = "1.0.2" # remember to set `html_root_url` in `src/lib.rs`.
version = "1.1.0" # remember to set `html_root_url` in `src/lib.rs`.
authors = ["Ryan Lopopolo <rjl@hyperbo.la>"]
license = "MIT"
edition = "2018"
Expand Down
46 changes: 43 additions & 3 deletions CaseFolding.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# CaseFolding-13.0.0.txt
# Date: 2019-09-08, 23:30:59 GMT
# © 2019 Unicode®, Inc.
# CaseFolding-14.0.0.txt
# Date: 2021-03-08, 19:35:41 GMT
# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
Expand Down Expand Up @@ -1050,6 +1050,7 @@
2C2C; C; 2C5C; # GLAGOLITIC CAPITAL LETTER SHTAPIC
2C2D; C; 2C5D; # GLAGOLITIC CAPITAL LETTER TROKUTASTI A
2C2E; C; 2C5E; # GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
2C2F; C; 2C5F; # GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI
2C60; C; 2C61; # LATIN CAPITAL LETTER L WITH DOUBLE BAR
2C62; C; 026B; # LATIN CAPITAL LETTER L WITH MIDDLE TILDE
2C63; C; 1D7D; # LATIN CAPITAL LETTER P WITH STROKE
Expand Down Expand Up @@ -1230,12 +1231,16 @@ A7B8; C; A7B9; # LATIN CAPITAL LETTER U WITH STROKE
A7BA; C; A7BB; # LATIN CAPITAL LETTER GLOTTAL A
A7BC; C; A7BD; # LATIN CAPITAL LETTER GLOTTAL I
A7BE; C; A7BF; # LATIN CAPITAL LETTER GLOTTAL U
A7C0; C; A7C1; # LATIN CAPITAL LETTER OLD POLISH O
A7C2; C; A7C3; # LATIN CAPITAL LETTER ANGLICANA W
A7C4; C; A794; # LATIN CAPITAL LETTER C WITH PALATAL HOOK
A7C5; C; 0282; # LATIN CAPITAL LETTER S WITH HOOK
A7C6; C; 1D8E; # LATIN CAPITAL LETTER Z WITH PALATAL HOOK
A7C7; C; A7C8; # LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY
A7C9; C; A7CA; # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY
A7D0; C; A7D1; # LATIN CAPITAL LETTER CLOSED INSULAR G
A7D6; C; A7D7; # LATIN CAPITAL LETTER MIDDLE SCOTS S
A7D8; C; A7D9; # LATIN CAPITAL LETTER SIGMOID S
A7F5; C; A7F6; # LATIN CAPITAL LETTER REVERSED HALF H
AB70; C; 13A0; # CHEROKEE SMALL LETTER A
AB71; C; 13A1; # CHEROKEE SMALL LETTER E
Expand Down Expand Up @@ -1431,6 +1436,41 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z
104D1; C; 104F9; # OSAGE CAPITAL LETTER GHA
104D2; C; 104FA; # OSAGE CAPITAL LETTER ZA
104D3; C; 104FB; # OSAGE CAPITAL LETTER ZHA
10570; C; 10597; # VITHKUQI CAPITAL LETTER A
10571; C; 10598; # VITHKUQI CAPITAL LETTER BBE
10572; C; 10599; # VITHKUQI CAPITAL LETTER BE
10573; C; 1059A; # VITHKUQI CAPITAL LETTER CE
10574; C; 1059B; # VITHKUQI CAPITAL LETTER CHE
10575; C; 1059C; # VITHKUQI CAPITAL LETTER DE
10576; C; 1059D; # VITHKUQI CAPITAL LETTER DHE
10577; C; 1059E; # VITHKUQI CAPITAL LETTER EI
10578; C; 1059F; # VITHKUQI CAPITAL LETTER E
10579; C; 105A0; # VITHKUQI CAPITAL LETTER FE
1057A; C; 105A1; # VITHKUQI CAPITAL LETTER GA
1057C; C; 105A3; # VITHKUQI CAPITAL LETTER HA
1057D; C; 105A4; # VITHKUQI CAPITAL LETTER HHA
1057E; C; 105A5; # VITHKUQI CAPITAL LETTER I
1057F; C; 105A6; # VITHKUQI CAPITAL LETTER IJE
10580; C; 105A7; # VITHKUQI CAPITAL LETTER JE
10581; C; 105A8; # VITHKUQI CAPITAL LETTER KA
10582; C; 105A9; # VITHKUQI CAPITAL LETTER LA
10583; C; 105AA; # VITHKUQI CAPITAL LETTER LLA
10584; C; 105AB; # VITHKUQI CAPITAL LETTER ME
10585; C; 105AC; # VITHKUQI CAPITAL LETTER NE
10586; C; 105AD; # VITHKUQI CAPITAL LETTER NJE
10587; C; 105AE; # VITHKUQI CAPITAL LETTER O
10588; C; 105AF; # VITHKUQI CAPITAL LETTER PE
10589; C; 105B0; # VITHKUQI CAPITAL LETTER QA
1058A; C; 105B1; # VITHKUQI CAPITAL LETTER RE
1058C; C; 105B3; # VITHKUQI CAPITAL LETTER SE
1058D; C; 105B4; # VITHKUQI CAPITAL LETTER SHE
1058E; C; 105B5; # VITHKUQI CAPITAL LETTER TE
1058F; C; 105B6; # VITHKUQI CAPITAL LETTER THE
10590; C; 105B7; # VITHKUQI CAPITAL LETTER U
10591; C; 105B8; # VITHKUQI CAPITAL LETTER VE
10592; C; 105B9; # VITHKUQI CAPITAL LETTER XE
10594; C; 105BB; # VITHKUQI CAPITAL LETTER Y
10595; C; 105BC; # VITHKUQI CAPITAL LETTER ZE
10C80; C; 10CC0; # OLD HUNGARIAN CAPITAL LETTER A
10C81; C; 10CC1; # OLD HUNGARIAN CAPITAL LETTER AA
10C82; C; 10CC2; # OLD HUNGARIAN CAPITAL LETTER EB
Expand Down
17 changes: 13 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
[![API](https://docs.rs/focaccia/badge.svg)](https://docs.rs/focaccia)
[![API trunk](https://img.shields.io/badge/docs-trunk-blue.svg)](https://artichoke.github.io/focaccia/focaccia/)

Unicode case folding methods for case-insensitive string comparisons. Used to
implement case folding operations on the [`Symbol`] and [`String`] classes in
the Ruby Core implementation in [Artichoke Ruby][artichoke].
Unicode 14.0.0 case folding methods for case-insensitive string comparisons.
Used to implement case folding operations on the [`Symbol`] and [`String`]
classes in the Ruby Core implementation in [Artichoke Ruby][artichoke].

Focaccia supports full, ASCII, and Turkic [Unicode case folding] equality
comparisons. ASCII folding supports determining case-insensitive ordering.
Expand All @@ -32,7 +32,7 @@ Add this to your `Cargo.toml`:

```toml
[dependencies]
focaccia = "1.0"
focaccia = "1.1"
```

Then make case-insensitive string comparisons like:
Expand Down Expand Up @@ -113,6 +113,15 @@ All features are enabled by default.
This crate requires at least Rust 1.52.0. This version can be bumped in minor
releases.

## Unicode Version

Focaccia implements Unicode case folding with the Unicode 14.0.0 case folding
ruleset.

Each new release of Unicode may bring updates to `CaseFolding.txt`, which is
the source for the folding mappings in this crate. Updates to the case folding
rules will be accompanied by a minor version bump.

## License

`focaccia` is licensed under the [MIT License](LICENSE) (c) Ryan Lopopolo.
Expand Down
2 changes: 1 addition & 1 deletion Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ namespace :unicode do

desc 'Update Unicode data'
task :update do
open('https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt') do |data|
URI.parse('https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt').open do |data|
File.open('CaseFolding.txt', 'w') do |file|
data.each_line do |line|
file.write(line)
Expand Down
21 changes: 16 additions & 5 deletions src/folding/mapping/lookup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
// This source is autogenerated. Do not modify it directly.
// To make modifications to this code, see `scripts/gen_case_lookups.rb`.
//
// Last generated on 2021-05-17 14:24:38 UTC.
// Last generated on 2021-11-15 18:32:20 UTC.
//
// Unicode version:
// CaseFolding-13.0.0.txt
// Date: 2019-09-08, 23:30:59 GMT
// © 2019 Unicode®, Inc.
// CaseFolding-14.0.0.txt
// Date: 2021-03-08, 19:35:41 GMT
// © 2021 Unicode®, Inc.

use super::{Mapping, Mode};

Expand Down Expand Up @@ -593,7 +593,7 @@ pub const fn lookup(c: char, mode: Mode) -> Mapping {
_ => Mapping::Single(codepoint),
},
(0x0000, 0x2C) => match c {
'\u{2C00}'..='\u{2C2E}' => Mapping::Single(codepoint.wrapping_add(0x0030)),
'\u{2C00}'..='\u{2C2F}' => Mapping::Single(codepoint.wrapping_add(0x0030)),
'\u{2C60}' => Mapping::Single(0x2C61),
'\u{2C62}' => Mapping::Single(0x026B),
'\u{2C63}' => Mapping::Single(0x1D7D),
Expand Down Expand Up @@ -779,12 +779,16 @@ pub const fn lookup(c: char, mode: Mode) -> Mapping {
'\u{A7BA}' => Mapping::Single(0xA7BB),
'\u{A7BC}' => Mapping::Single(0xA7BD),
'\u{A7BE}' => Mapping::Single(0xA7BF),
'\u{A7C0}' => Mapping::Single(0xA7C1),
'\u{A7C2}' => Mapping::Single(0xA7C3),
'\u{A7C4}' => Mapping::Single(0xA794),
'\u{A7C5}' => Mapping::Single(0x0282),
'\u{A7C6}' => Mapping::Single(0x1D8E),
'\u{A7C7}' => Mapping::Single(0xA7C8),
'\u{A7C9}' => Mapping::Single(0xA7CA),
'\u{A7D0}' => Mapping::Single(0xA7D1),
'\u{A7D6}' => Mapping::Single(0xA7D7),
'\u{A7D8}' => Mapping::Single(0xA7D9),
'\u{A7F5}' => Mapping::Single(0xA7F6),
_ => Mapping::Single(codepoint),
},
Expand Down Expand Up @@ -816,6 +820,13 @@ pub const fn lookup(c: char, mode: Mode) -> Mapping {
'\u{104B0}'..='\u{104D3}' => Mapping::Single(codepoint.wrapping_add(0x0028)),
_ => Mapping::Single(codepoint),
},
(0x0001, 0x05) => match c {
'\u{10570}'..='\u{1057A}' => Mapping::Single(codepoint.wrapping_add(0x0027)),
'\u{1057C}'..='\u{1058A}' => Mapping::Single(codepoint.wrapping_add(0x0027)),
'\u{1058C}'..='\u{10592}' => Mapping::Single(codepoint.wrapping_add(0x0027)),
'\u{10594}'..='\u{10595}' => Mapping::Single(codepoint.wrapping_add(0x0027)),
_ => Mapping::Single(codepoint),
},
(0x0001, 0x0C) => match c {
'\u{10C80}'..='\u{10CB2}' => Mapping::Single(codepoint.wrapping_add(0x0040)),
_ => Mapping::Single(codepoint),
Expand Down
11 changes: 10 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,23 @@
//! - **std** - Enable linking to the [Rust Standard Library]. Enabling this
//! feature adds [`Error`] implementations to error types in this crate.
//!
//! # Unicode Version
//!
//! Focaccia implements Unicode case folding with the Unicode 14.0.0 case folding
//! ruleset.
//!
//! Each new release of Unicode may bring updates to `CaseFolding.txt`, which is
//! the source for the folding mappings in this crate. Updates to the case folding
//! rules will be accompanied by a minor version bump.
//!
//! [Unicode case folding]: https://www.w3.org/International/wiki/Case_folding
//! [`Ordering`]: core::cmp::Ordering
//! [dotted and dotless I]: https://en.wikipedia.org/wiki/Dotted_and_dotless_I
//! [Rust Standard Library]: https://doc.rust-lang.org/stable/std/index.html
//! [`Error`]: https://doc.rust-lang.org/stable/std/error/trait.Error.html
#![no_std]
#![doc(html_root_url = "https://docs.rs/focaccia/1.0.2")]
#![doc(html_root_url = "https://docs.rs/focaccia/1.1.0")]

// Ensure code blocks in README.md compile
#[cfg(doctest)]
Expand Down
48 changes: 44 additions & 4 deletions tests/integration/full_fold_exhaustive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
// This source is autogenerated. Do not modify it directly.
// To make modifications to this code, see `scripts/gen_case_lookups.rb`.
//
// Last generated on 2021-05-17 14:24:38 UTC.
// Last generated on 2021-11-15 18:32:20 UTC.
//
// Unicode version:
// CaseFolding-13.0.0.txt
// Date: 2019-09-08, 23:30:59 GMT
// © 2019 Unicode®, Inc.
// CaseFolding-14.0.0.txt
// Date: 2021-03-08, 19:35:41 GMT
// © 2021 Unicode®, Inc.

use core::cmp::Ordering;
use focaccia::{unicode_full_case_eq, unicode_full_casecmp};
Expand Down Expand Up @@ -977,6 +977,7 @@ fn lookup_naive(c: char, buf: &mut [u8; 4]) -> &str {
'\u{2C2C}' => "\u{2C5C}",
'\u{2C2D}' => "\u{2C5D}",
'\u{2C2E}' => "\u{2C5E}",
'\u{2C2F}' => "\u{2C5F}",
'\u{2C60}' => "\u{2C61}",
'\u{2C62}' => "\u{026B}",
'\u{2C63}' => "\u{1D7D}",
Expand Down Expand Up @@ -1157,12 +1158,16 @@ fn lookup_naive(c: char, buf: &mut [u8; 4]) -> &str {
'\u{A7BA}' => "\u{A7BB}",
'\u{A7BC}' => "\u{A7BD}",
'\u{A7BE}' => "\u{A7BF}",
'\u{A7C0}' => "\u{A7C1}",
'\u{A7C2}' => "\u{A7C3}",
'\u{A7C4}' => "\u{A794}",
'\u{A7C5}' => "\u{0282}",
'\u{A7C6}' => "\u{1D8E}",
'\u{A7C7}' => "\u{A7C8}",
'\u{A7C9}' => "\u{A7CA}",
'\u{A7D0}' => "\u{A7D1}",
'\u{A7D6}' => "\u{A7D7}",
'\u{A7D8}' => "\u{A7D9}",
'\u{A7F5}' => "\u{A7F6}",
'\u{AB70}' => "\u{13A0}",
'\u{AB71}' => "\u{13A1}",
Expand Down Expand Up @@ -1358,6 +1363,41 @@ fn lookup_naive(c: char, buf: &mut [u8; 4]) -> &str {
'\u{104D1}' => "\u{104F9}",
'\u{104D2}' => "\u{104FA}",
'\u{104D3}' => "\u{104FB}",
'\u{10570}' => "\u{10597}",
'\u{10571}' => "\u{10598}",
'\u{10572}' => "\u{10599}",
'\u{10573}' => "\u{1059A}",
'\u{10574}' => "\u{1059B}",
'\u{10575}' => "\u{1059C}",
'\u{10576}' => "\u{1059D}",
'\u{10577}' => "\u{1059E}",
'\u{10578}' => "\u{1059F}",
'\u{10579}' => "\u{105A0}",
'\u{1057A}' => "\u{105A1}",
'\u{1057C}' => "\u{105A3}",
'\u{1057D}' => "\u{105A4}",
'\u{1057E}' => "\u{105A5}",
'\u{1057F}' => "\u{105A6}",
'\u{10580}' => "\u{105A7}",
'\u{10581}' => "\u{105A8}",
'\u{10582}' => "\u{105A9}",
'\u{10583}' => "\u{105AA}",
'\u{10584}' => "\u{105AB}",
'\u{10585}' => "\u{105AC}",
'\u{10586}' => "\u{105AD}",
'\u{10587}' => "\u{105AE}",
'\u{10588}' => "\u{105AF}",
'\u{10589}' => "\u{105B0}",
'\u{1058A}' => "\u{105B1}",
'\u{1058C}' => "\u{105B3}",
'\u{1058D}' => "\u{105B4}",
'\u{1058E}' => "\u{105B5}",
'\u{1058F}' => "\u{105B6}",
'\u{10590}' => "\u{105B7}",
'\u{10591}' => "\u{105B8}",
'\u{10592}' => "\u{105B9}",
'\u{10594}' => "\u{105BB}",
'\u{10595}' => "\u{105BC}",
'\u{10C80}' => "\u{10CC0}",
'\u{10C81}' => "\u{10CC1}",
'\u{10C82}' => "\u{10CC2}",
Expand Down
48 changes: 44 additions & 4 deletions tests/integration/full_turkic_fold_exhaustive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
// This source is autogenerated. Do not modify it directly.
// To make modifications to this code, see `scripts/gen_case_lookups.rb`.
//
// Last generated on 2021-05-17 14:24:38 UTC.
// Last generated on 2021-11-15 18:32:20 UTC.
//
// Unicode version:
// CaseFolding-13.0.0.txt
// Date: 2019-09-08, 23:30:59 GMT
// © 2019 Unicode®, Inc.
// CaseFolding-14.0.0.txt
// Date: 2021-03-08, 19:35:41 GMT
// © 2021 Unicode®, Inc.

use core::cmp::Ordering;
use focaccia::{unicode_full_turkic_case_eq, unicode_full_turkic_casecmp};
Expand Down Expand Up @@ -977,6 +977,7 @@ fn lookup_naive(c: char, buf: &mut [u8; 4]) -> &str {
'\u{2C2C}' => "\u{2C5C}",
'\u{2C2D}' => "\u{2C5D}",
'\u{2C2E}' => "\u{2C5E}",
'\u{2C2F}' => "\u{2C5F}",
'\u{2C60}' => "\u{2C61}",
'\u{2C62}' => "\u{026B}",
'\u{2C63}' => "\u{1D7D}",
Expand Down Expand Up @@ -1157,12 +1158,16 @@ fn lookup_naive(c: char, buf: &mut [u8; 4]) -> &str {
'\u{A7BA}' => "\u{A7BB}",
'\u{A7BC}' => "\u{A7BD}",
'\u{A7BE}' => "\u{A7BF}",
'\u{A7C0}' => "\u{A7C1}",
'\u{A7C2}' => "\u{A7C3}",
'\u{A7C4}' => "\u{A794}",
'\u{A7C5}' => "\u{0282}",
'\u{A7C6}' => "\u{1D8E}",
'\u{A7C7}' => "\u{A7C8}",
'\u{A7C9}' => "\u{A7CA}",
'\u{A7D0}' => "\u{A7D1}",
'\u{A7D6}' => "\u{A7D7}",
'\u{A7D8}' => "\u{A7D9}",
'\u{A7F5}' => "\u{A7F6}",
'\u{AB70}' => "\u{13A0}",
'\u{AB71}' => "\u{13A1}",
Expand Down Expand Up @@ -1358,6 +1363,41 @@ fn lookup_naive(c: char, buf: &mut [u8; 4]) -> &str {
'\u{104D1}' => "\u{104F9}",
'\u{104D2}' => "\u{104FA}",
'\u{104D3}' => "\u{104FB}",
'\u{10570}' => "\u{10597}",
'\u{10571}' => "\u{10598}",
'\u{10572}' => "\u{10599}",
'\u{10573}' => "\u{1059A}",
'\u{10574}' => "\u{1059B}",
'\u{10575}' => "\u{1059C}",
'\u{10576}' => "\u{1059D}",
'\u{10577}' => "\u{1059E}",
'\u{10578}' => "\u{1059F}",
'\u{10579}' => "\u{105A0}",
'\u{1057A}' => "\u{105A1}",
'\u{1057C}' => "\u{105A3}",
'\u{1057D}' => "\u{105A4}",
'\u{1057E}' => "\u{105A5}",
'\u{1057F}' => "\u{105A6}",
'\u{10580}' => "\u{105A7}",
'\u{10581}' => "\u{105A8}",
'\u{10582}' => "\u{105A9}",
'\u{10583}' => "\u{105AA}",
'\u{10584}' => "\u{105AB}",
'\u{10585}' => "\u{105AC}",
'\u{10586}' => "\u{105AD}",
'\u{10587}' => "\u{105AE}",
'\u{10588}' => "\u{105AF}",
'\u{10589}' => "\u{105B0}",
'\u{1058A}' => "\u{105B1}",
'\u{1058C}' => "\u{105B3}",
'\u{1058D}' => "\u{105B4}",
'\u{1058E}' => "\u{105B5}",
'\u{1058F}' => "\u{105B6}",
'\u{10590}' => "\u{105B7}",
'\u{10591}' => "\u{105B8}",
'\u{10592}' => "\u{105B9}",
'\u{10594}' => "\u{105BB}",
'\u{10595}' => "\u{105BC}",
'\u{10C80}' => "\u{10CC0}",
'\u{10C81}' => "\u{10CC1}",
'\u{10C82}' => "\u{10CC2}",
Expand Down

0 comments on commit 75a4530

Please sign in to comment.