diff --git a/lib/tokenizers/native.ex b/lib/tokenizers/native.ex index 319e106..a20b33f 100644 --- a/lib/tokenizers/native.ex +++ b/lib/tokenizers/native.ex @@ -100,6 +100,8 @@ defmodule Tokenizers.Native do def normalizers_replace(_pattern, _content), do: err() def normalizers_nmt(), do: err() def normalizers_precompiled(_data), do: err() + def normalizers_byte_level(), do: err() + def normalizers_byte_level_alphabet(), do: err() # PreTokenizers def pre_tokenizers_pre_tokenize(_pre_tokenizer, _input), do: err() diff --git a/lib/tokenizers/normalizer.ex b/lib/tokenizers/normalizer.ex index 49bddcc..4d988c3 100644 --- a/lib/tokenizers/normalizer.ex +++ b/lib/tokenizers/normalizer.ex @@ -22,6 +22,8 @@ defmodule Tokenizers.Normalizer do @spec normalize(t(), String.t()) :: {:ok, String.t()} defdelegate normalize(normalizer, input), to: Tokenizers.Native, as: :normalizers_normalize + # Normalizer entities. Following the order in https://docs.rs/tokenizers/0.20.0/src/tokenizers/normalizers/mod.rs.html#24 + @doc """ Takes care of normalizing raw text before giving it to a BERT model. @@ -49,30 +51,6 @@ defmodule Tokenizers.Normalizer do to: Tokenizers.Native, as: :normalizers_bert_normalizer - @doc """ - Creates a NFD Unicode normalizer. - """ - @spec nfd :: t() - defdelegate nfd(), to: Tokenizers.Native, as: :normalizers_nfd - - @doc """ - Creates a NFKD Unicode normalizer. - """ - @spec nfkd :: t() - defdelegate nfkd(), to: Tokenizers.Native, as: :normalizers_nfkd - - @doc """ - Creates a NFC Unicode normalizer. - """ - @spec nfc :: t() - defdelegate nfc(), to: Tokenizers.Native, as: :normalizers_nfc - - @doc """ - Creates a NFKC Unicode normalizer. - """ - @spec nfkc :: t() - defdelegate nfkc(), to: Tokenizers.Native, as: :normalizers_nfkc - @doc """ Creates a Strip normalizer. @@ -89,12 +67,6 @@ defmodule Tokenizers.Normalizer do @spec strip(keyword()) :: t() defdelegate strip(opts \\ []), to: Tokenizers.Native, as: :normalizers_strip - @doc """ - Creates a Prepend normalizer. - """ - @spec prepend(prepend :: String.t()) :: t() - defdelegate prepend(prepend), to: Tokenizers.Native, as: :normalizers_prepend - @doc """ Creates a Strip Accent normalizer. @@ -104,6 +76,30 @@ defmodule Tokenizers.Normalizer do @spec strip_accents :: t() defdelegate strip_accents(), to: Tokenizers.Native, as: :normalizers_strip_accents + @doc """ + Creates a NFC Unicode normalizer. + """ + @spec nfc :: t() + defdelegate nfc(), to: Tokenizers.Native, as: :normalizers_nfc + + @doc """ + Creates a NFD Unicode normalizer. + """ + @spec nfd :: t() + defdelegate nfd(), to: Tokenizers.Native, as: :normalizers_nfd + + @doc """ + Creates a NFKC Unicode normalizer. + """ + @spec nfkc :: t() + defdelegate nfkc(), to: Tokenizers.Native, as: :normalizers_nfkc + + @doc """ + Creates a NFKD Unicode normalizer. + """ + @spec nfkd :: t() + defdelegate nfkd(), to: Tokenizers.Native, as: :normalizers_nfkd + @doc """ Composes multiple normalizers that will run in the provided order. """ @@ -116,6 +112,20 @@ defmodule Tokenizers.Normalizer do @spec lowercase :: t() defdelegate lowercase(), to: Tokenizers.Native, as: :normalizers_lowercase + @doc """ + Creates a Nmt normalizer. + """ + @spec nmt :: t() + defdelegate nmt(), to: Tokenizers.Native, as: :normalizers_nmt + + @doc """ + Precompiled normalizer. + + Don’t use manually it is used for compatibility with SentencePiece. + """ + @spec precompiled(binary()) :: {:ok, t()} | {:error, any()} + defdelegate precompiled(data), to: Tokenizers.Native, as: :normalizers_precompiled + @doc """ Replaces a custom `search` string with the given `content`. """ @@ -136,18 +146,21 @@ defmodule Tokenizers.Normalizer do end @doc """ - Creates a Nmt normalizer. + Creates a Prepend normalizer. """ - @spec nmt :: t() - defdelegate nmt(), to: Tokenizers.Native, as: :normalizers_nmt + @spec prepend(prepend :: String.t()) :: t() + defdelegate prepend(prepend), to: Tokenizers.Native, as: :normalizers_prepend @doc """ - Precompiled normalizer. + Created ByteLevel normalizer. + """ + @spec byte_level :: t() + defdelegate byte_level(), to: Tokenizers.Native, as: :normalizers_byte_level - Don’t use manually it is used for compatibility with SentencePiece. + @doc """ + Gets ByteLevel normalizer's alphabet. """ - @spec precompiled(binary()) :: {:ok, t()} | {:error, any()} - defdelegate precompiled(data), to: Tokenizers.Native, as: :normalizers_precompiled + defdelegate byte_level_alphabet(), to: Tokenizers.Native, as: :normalizers_byte_level_alphabet end defimpl Inspect, for: Tokenizers.Normalizer do diff --git a/mix.exs b/mix.exs index b02b6db..dd0b67c 100644 --- a/mix.exs +++ b/mix.exs @@ -2,7 +2,7 @@ defmodule Tokenizers.MixProject do use Mix.Project @source_url "https://github.com/elixir-nx/tokenizers" - @version "0.5.0-dev" + @version "0.6.0-dev" def project do [ diff --git a/mix.lock b/mix.lock index ff40907..e700ffd 100644 --- a/mix.lock +++ b/mix.lock @@ -1,8 +1,8 @@ %{ - "castore": {:hex, :castore, "1.0.8", "dedcf20ea746694647f883590b82d9e96014057aff1d44d03ec90f36a5c0dc6e", [:mix], [], "hexpm", "0b2b66d2ee742cb1d9cb8c8be3b43c3a70ee8651f37b75a8b982e036752983f1"}, + "castore": {:hex, :castore, "1.0.9", "5cc77474afadf02c7c017823f460a17daa7908e991b0cc917febc90e466a375c", [:mix], [], "hexpm", "5ea956504f1ba6f2b4eb707061d8e17870de2bee95fb59d512872c2ef06925e7"}, "earmark_parser": {:hex, :earmark_parser, "1.4.41", "ab34711c9dc6212dda44fcd20ecb87ac3f3fce6f0ca2f28d4a00e4154f8cd599", [:mix], [], "hexpm", "a81a04c7e34b6617c2792e291b5a2e57ab316365c2644ddc553bb9ed863ebefa"}, "ex_doc": {:hex, :ex_doc, "0.34.2", "13eedf3844ccdce25cfd837b99bea9ad92c4e511233199440488d217c92571e8", [:mix], [{:earmark_parser, "~> 1.4.39", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "5ce5f16b41208a50106afed3de6a2ed34f4acfd65715b82a0b84b49d995f95c1"}, - "finch": {:hex, :finch, "0.18.0", "944ac7d34d0bd2ac8998f79f7a811b21d87d911e77a786bc5810adb75632ada4", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.3", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 0.2.6 or ~> 1.0", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "69f5045b042e531e53edc2574f15e25e735b522c37e2ddb766e15b979e03aa65"}, + "finch": {:hex, :finch, "0.19.0", "c644641491ea854fc5c1bbaef36bfc764e3f08e7185e1f084e35e0672241b76d", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "fc5324ce209125d1e2fa0fcd2634601c52a787aff1cd33ee833664a5af4ea2b6"}, "hpax": {:hex, :hpax, "1.0.0", "28dcf54509fe2152a3d040e4e3df5b265dcb6cb532029ecbacf4ce52caea3fd2", [:mix], [], "hexpm", "7f1314731d711e2ca5fdc7fd361296593fc2542570b3105595bb0bc6d0fad601"}, "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"}, "makeup": {:hex, :makeup, "1.1.2", "9ba8837913bdf757787e71c1581c21f9d2455f4dd04cfca785c70bbfff1a76a3", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cce1566b81fbcbd21eca8ffe808f33b221f9eee2cbc7a1706fc3da9ff18e6cac"}, @@ -15,7 +15,7 @@ "nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"}, "req": {:hex, :req, "0.5.6", "8fe1eead4a085510fe3d51ad854ca8f20a622aae46e97b302f499dfb84f726ac", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "cfaa8e720945d46654853de39d368f40362c2641c4b2153c886418914b372185"}, "rustler": {:hex, :rustler, "0.34.0", "e9a73ee419fc296a10e49b415a2eb87a88c9217aa0275ec9f383d37eed290c1c", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:req, "~> 0.5", [hex: :req, repo: "hexpm", optional: false]}, {:toml, "~> 0.6", [hex: :toml, repo: "hexpm", optional: false]}], "hexpm", "1d0c7449482b459513003230c0e2422b0252245776fe6fd6e41cb2b11bd8e628"}, - "rustler_precompiled": {:hex, :rustler_precompiled, "0.6.2", "d2218ba08a43fa331957f30481d00b666664d7e3861431b02bd3f4f30eec8e5b", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "b9048eaed8d7d14a53f758c91865cc616608a438d2595f621f6a4b32a5511709"}, - "telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"}, + "rustler_precompiled": {:hex, :rustler_precompiled, "0.8.2", "5f25cbe220a8fac3e7ad62e6f950fcdca5a5a5f8501835d2823e8c74bf4268d5", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "63d1bd5f8e23096d1ff851839923162096364bac8656a4a3c00d1fff8e83ee0a"}, + "telemetry": {:hex, :telemetry, "1.3.0", "fedebbae410d715cf8e7062c96a1ef32ec22e764197f70cda73d82778d61e7a2", [:rebar3], [], "hexpm", "7015fc8919dbe63764f4b4b87a95b7c0996bd539e0d499be6ec9d7f3875b79e6"}, "toml": {:hex, :toml, "0.7.0", "fbcd773caa937d0c7a02c301a1feea25612720ac3fa1ccb8bfd9d30d822911de", [:mix], [], "hexpm", "0690246a2478c1defd100b0c9b89b4ea280a22be9a7b313a8a058a2408a2fa70"}, } diff --git a/native/ex_tokenizers/.cargo/config b/native/ex_tokenizers/.cargo/config.toml similarity index 100% rename from native/ex_tokenizers/.cargo/config rename to native/ex_tokenizers/.cargo/config.toml diff --git a/native/ex_tokenizers/Cargo.lock b/native/ex_tokenizers/Cargo.lock index f672a85..23bc089 100644 --- a/native/ex_tokenizers/Cargo.lock +++ b/native/ex_tokenizers/Cargo.lock @@ -4,24 +4,18 @@ version = 3 [[package]] name = "aho-corasick" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] [[package]] name = "anyhow" -version = "1.0.72" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b13c32d80ecc7ab747b80c3784bce54ee8a7a0cc4fbda9bf4cda2cf6fe90854" - -[[package]] -name = "autocfg" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" [[package]] name = "base64" @@ -35,13 +29,19 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cc" -version = "1.0.82" +version = "1.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "305fe645edc1442a0fa8b6726ba61d422798d37a52e12eaecf4b022ebbb88f01" +checksum = "812acba72f0a070b003d3697490d2b55b837230ae7c6c6497f05cc2ddbb8d938" dependencies = [ - "libc", + "shlex", ] [[package]] @@ -52,42 +52,34 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.15" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset", - "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.16" +version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" -dependencies = [ - "cfg-if", -] +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" [[package]] name = "darling" -version = "0.20.8" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" dependencies = [ "darling_core", "darling_macro", @@ -95,9 +87,9 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.20.8" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" dependencies = [ "fnv", "ident_case", @@ -109,9 +101,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.20.8" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", @@ -120,18 +112,18 @@ dependencies = [ [[package]] name = "derive_builder" -version = "0.20.0" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7" +checksum = "cd33f37ee6a119146a1781d3356a7c26028f83d779b2e04ecd45fdc75c76877b" dependencies = [ "derive_builder_macro", ] [[package]] name = "derive_builder_core" -version = "0.20.0" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d" +checksum = "7431fa049613920234f22c47fdc33e6cf3ee83067091ea4277a3f8c4587aae38" dependencies = [ "darling", "proc-macro2", @@ -141,9 +133,9 @@ dependencies = [ [[package]] name = "derive_builder_macro" -version = "0.20.0" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" +checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc" dependencies = [ "derive_builder_core", "syn", @@ -151,9 +143,9 @@ dependencies = [ [[package]] name = "either" -version = "1.9.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "esaxx-rs" @@ -166,7 +158,7 @@ dependencies = [ [[package]] name = "ex_tokenizers" -version = "0.13.0" +version = "0.1.0" dependencies = [ "anyhow", "rustler", @@ -183,9 +175,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", @@ -230,27 +222,27 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.9" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "lazy_static" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.147" +version = "0.2.159" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" [[package]] name = "log" -version = "0.4.19" +version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "macro_rules_attribute" @@ -270,18 +262,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568" [[package]] name = "memchr" -version = "2.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" - -[[package]] -name = "memoffset" -version = "0.9.0" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" -dependencies = [ - "autocfg", -] +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "minimal-lexical" @@ -291,9 +274,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "monostate" -version = "0.1.12" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a20fffcd8ca4c69d31e036a71abc400147b41f90895df4edcb36497a1f8af8bf" +checksum = "0d208407d7552cd041d8cdb69a1bc3303e029c598738177a3d87082004dc0e1e" dependencies = [ "monostate-impl", "serde", @@ -301,9 +284,9 @@ dependencies = [ [[package]] name = "monostate-impl" -version = "0.1.12" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf307cbbbd777a9c10cec88ddafee572b3484caad5cce0c9236523c3803105a6" +checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0" dependencies = [ "proc-macro2", "quote", @@ -322,9 +305,12 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.18.0" +version = "1.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "82881c4be219ab5faaf2ad5e5e5ecdff8c66bd7402ca3160975c93b24961afd1" +dependencies = [ + "portable-atomic", +] [[package]] name = "onig" @@ -350,36 +336,45 @@ dependencies = [ [[package]] name = "paste" -version = "1.0.14" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" + +[[package]] +name = "portable-atomic" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" [[package]] name = "ppv-lite86" -version = "0.2.17" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] [[package]] name = "proc-macro2" -version = "1.0.81" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.36" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -447,9 +442,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.4" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" +checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" dependencies = [ "aho-corasick", "memchr", @@ -459,9 +454,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.6" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" dependencies = [ "aho-corasick", "memchr", @@ -470,9 +465,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "rustler" @@ -510,30 +505,24 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.15" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" - -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "serde" -version = "1.0.183" +version = "1.0.210" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c" +checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.183" +version = "1.0.210" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" +checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", @@ -542,20 +531,27 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.104" +version = "1.0.128" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "076066c5f1078eac5b722a31827a8832fe108bed65dfa75e233c89f8206e976c" +checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "spm_precompiled" @@ -571,15 +567,15 @@ dependencies = [ [[package]] name = "strsim" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.60" +version = "2.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "909518bc7b1c9b779f1bbf07f2929d35af9f0f37e47c6e9ef7f9dddc1e1821f3" +checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" dependencies = [ "proc-macro2", "quote", @@ -588,18 +584,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.50" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9a7210f5c9a7156bb50aa36aed4c95afb51df0df00713949448cf9e97d382d2" +checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.50" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "266b2e40bc00e5a6c09c3584011e08b06f123c00362c92b975ba9843aaaa14b8" +checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" dependencies = [ "proc-macro2", "quote", @@ -608,9 +604,9 @@ dependencies = [ [[package]] name = "tokenizers" -version = "0.19.1" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd" +checksum = "c8a24d7f7d6be5b9d1377418b893ab1808af0074f5d1bb2c64784452ddd2aa70" dependencies = [ "aho-corasick", "derive_builder", @@ -639,9 +635,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.11" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" [[package]] name = "unicode-normalization-alignments" @@ -654,9 +650,9 @@ dependencies = [ [[package]] name = "unicode-segmentation" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode_categories" @@ -684,3 +680,24 @@ name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/native/ex_tokenizers/Cargo.toml b/native/ex_tokenizers/Cargo.toml index e22ed27..61322ac 100644 --- a/native/ex_tokenizers/Cargo.toml +++ b/native/ex_tokenizers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ex_tokenizers" -version = "0.13.0" +version = "0.1.0" authors = [] edition = "2021" @@ -13,5 +13,5 @@ crate-type = ["cdylib"] anyhow = "1" rustler = "0.34.0" thiserror = "1" -tokenizers = { version = "0.19.1", default-features = false, features = ["onig", "esaxx_fast"]} +tokenizers = { version = "0.20.0", default-features = false, features = ["onig", "esaxx_fast"]} serde = { version = "1.0", features = [ "rc", "derive" ] } diff --git a/native/ex_tokenizers/src/added_token.rs b/native/ex_tokenizers/src/added_token.rs index 7613f54..cf007b9 100644 --- a/native/ex_tokenizers/src/added_token.rs +++ b/native/ex_tokenizers/src/added_token.rs @@ -1,10 +1,13 @@ use crate::{new_info, util::Info}; -use rustler::{NifTaggedEnum, NifUntaggedEnum}; +use rustler::{NifTaggedEnum, NifUntaggedEnum, Resource}; use serde::{Deserialize, Serialize}; use tokenizers::AddedToken; pub struct ExTokenizersAddedTokenRef(pub AddedToken); +#[rustler::resource_impl] +impl Resource for ExTokenizersAddedTokenRef {} + #[derive(rustler::NifStruct)] #[module = "Tokenizers.AddedToken"] pub struct ExTokenizersAddedToken { diff --git a/native/ex_tokenizers/src/decoders.rs b/native/ex_tokenizers/src/decoders.rs index ce8f663..9527b00 100644 --- a/native/ex_tokenizers/src/decoders.rs +++ b/native/ex_tokenizers/src/decoders.rs @@ -6,6 +6,9 @@ use crate::{new_info, util::Info, ExTokenizersError}; pub struct ExTokenizersDecoderRef(pub DecoderWrapper); +#[rustler::resource_impl] +impl rustler::Resource for ExTokenizersDecoderRef {} + #[derive(rustler::NifStruct)] #[module = "Tokenizers.Decoder"] pub struct ExTokenizersDecoder { diff --git a/native/ex_tokenizers/src/encoding.rs b/native/ex_tokenizers/src/encoding.rs index 36500d4..6592418 100644 --- a/native/ex_tokenizers/src/encoding.rs +++ b/native/ex_tokenizers/src/encoding.rs @@ -5,6 +5,9 @@ use crate::util::Direction; pub struct ExTokenizersEncodingRef(pub Encoding); +#[rustler::resource_impl] +impl rustler::Resource for ExTokenizersEncodingRef {} + #[derive(rustler::NifStruct)] #[module = "Tokenizers.Encoding"] pub struct ExTokenizersEncoding { diff --git a/native/ex_tokenizers/src/lib.rs b/native/ex_tokenizers/src/lib.rs index d0dc003..bf4a2a3 100644 --- a/native/ex_tokenizers/src/lib.rs +++ b/native/ex_tokenizers/src/lib.rs @@ -10,29 +10,11 @@ mod tokenizer; mod trainers; mod util; -use added_token::*; -use decoders::*; -use encoding::*; -use models::*; -use normalizers::*; -use post_processors::*; -use pre_tokenizers::*; use rustler::{Env, Term}; -use tokenizer::*; -use trainers::*; pub use error::ExTokenizersError; -fn on_load(env: Env, _info: Term) -> bool { - rustler::resource!(ExTokenizersAddedTokenRef, env); - rustler::resource!(ExTokenizersDecoderRef, env); - rustler::resource!(ExTokenizersTokenizerRef, env); - rustler::resource!(ExTokenizersEncodingRef, env); - rustler::resource!(ExTokenizersTrainerRef, env); - rustler::resource!(ExTokenizersModelRef, env); - rustler::resource!(ExTokenizersNormalizerRef, env); - rustler::resource!(ExTokenizersPostProcessorRef, env); - rustler::resource!(ExTokenizersPreTokenizerRef, env); +fn on_load(_env: Env, _info: Term) -> bool { true } diff --git a/native/ex_tokenizers/src/models.rs b/native/ex_tokenizers/src/models.rs index 6a6bc66..29b261a 100644 --- a/native/ex_tokenizers/src/models.rs +++ b/native/ex_tokenizers/src/models.rs @@ -16,6 +16,9 @@ use crate::{new_info, util::Info}; pub struct ExTokenizersModelRef(pub RwLock); +#[rustler::resource_impl] +impl rustler::Resource for ExTokenizersModelRef {} + #[derive(rustler::NifStruct)] #[module = "Tokenizers.Model"] pub struct ExTokenizersModel { diff --git a/native/ex_tokenizers/src/normalizers.rs b/native/ex_tokenizers/src/normalizers.rs index c17dade..aff8487 100644 --- a/native/ex_tokenizers/src/normalizers.rs +++ b/native/ex_tokenizers/src/normalizers.rs @@ -2,11 +2,15 @@ use crate::{new_info, util::Info, ExTokenizersError}; use rustler::NifTaggedEnum; use serde::{Deserialize, Serialize}; use tokenizers::{ - normalizers::replace::ReplacePattern, NormalizedString, Normalizer, NormalizerWrapper, + normalizers::{replace::ReplacePattern, ByteLevel}, + NormalizedString, Normalizer, NormalizerWrapper, }; pub struct ExTokenizersNormalizerRef(pub NormalizerWrapper); +#[rustler::resource_impl] +impl rustler::Resource for ExTokenizersNormalizerRef {} + #[derive(rustler::NifStruct)] #[module = "Tokenizers.Normalizer"] pub struct ExTokenizersNormalizer { @@ -123,6 +127,9 @@ fn normalizers_info(normalizer: ExTokenizersNormalizer) -> Info { NormalizerWrapper::Prepend(_) => new_info!( normalizer_type: "Prepend" ), + NormalizerWrapper::ByteLevel(_) => new_info!( + normalizer_type: "ByteLevel" + ), } } @@ -277,3 +284,18 @@ pub fn normalizers_precompiled(data: Vec) -> Result ExTokenizersNormalizer { + ExTokenizersNormalizer::new(tokenizers::normalizers::byte_level::ByteLevel) +} + +#[rustler::nif] +pub fn normalizers_byte_level_alphabet() -> Vec { + ByteLevel::alphabet() + .iter() + .map(|c| String::from(*c)) + .collect() +} diff --git a/native/ex_tokenizers/src/post_processors.rs b/native/ex_tokenizers/src/post_processors.rs index 8a8c3e1..84d06c9 100644 --- a/native/ex_tokenizers/src/post_processors.rs +++ b/native/ex_tokenizers/src/post_processors.rs @@ -6,6 +6,9 @@ use crate::{new_info, util::Info}; pub struct ExTokenizersPostProcessorRef(pub PostProcessorWrapper); +#[rustler::resource_impl] +impl rustler::Resource for ExTokenizersPostProcessorRef {} + #[derive(rustler::NifStruct)] #[module = "Tokenizers.PostProcessor"] pub struct ExTokenizersPostProcessor { diff --git a/native/ex_tokenizers/src/pre_tokenizers.rs b/native/ex_tokenizers/src/pre_tokenizers.rs index aa34bd8..09958b4 100644 --- a/native/ex_tokenizers/src/pre_tokenizers.rs +++ b/native/ex_tokenizers/src/pre_tokenizers.rs @@ -8,6 +8,9 @@ use tokenizers::{processors::byte_level::ByteLevel, PreTokenizedString, PreToken pub struct ExTokenizersPreTokenizerRef(pub PreTokenizerWrapper); +#[rustler::resource_impl] +impl rustler::Resource for ExTokenizersPreTokenizerRef {} + #[derive(rustler::NifStruct)] #[module = "Tokenizers.PreTokenizer"] pub struct ExTokenizersPreTokenizer { diff --git a/native/ex_tokenizers/src/tokenizer.rs b/native/ex_tokenizers/src/tokenizer.rs index 807dd05..d554578 100644 --- a/native/ex_tokenizers/src/tokenizer.rs +++ b/native/ex_tokenizers/src/tokenizer.rs @@ -31,6 +31,9 @@ type ExTokenizerImpl = TokenizerImpl< pub struct ExTokenizersTokenizerRef(ExTokenizerImpl); +#[rustler::resource_impl] +impl rustler::Resource for ExTokenizersTokenizerRef {} + #[derive(rustler::NifStruct)] #[module = "Tokenizers.Tokenizer"] pub struct ExTokenizersTokenizer { @@ -194,7 +197,7 @@ pub fn tokenizer_set_normalizer( normalizer: ExTokenizersNormalizer, ) -> ExTokenizersTokenizer { let mut new_tokenizer = tokenizer.resource.0.clone(); - new_tokenizer.with_normalizer(normalizer); + new_tokenizer.with_normalizer(Some(normalizer)); new_tokenizer.into() } @@ -213,7 +216,7 @@ pub fn tokenizer_set_pre_tokenizer( pre_tokenizer: ExTokenizersPreTokenizer, ) -> ExTokenizersTokenizer { let mut new_tokenizer = tokenizer.resource.0.clone(); - new_tokenizer.with_pre_tokenizer(pre_tokenizer); + new_tokenizer.with_pre_tokenizer(Some(pre_tokenizer)); new_tokenizer.into() } @@ -232,7 +235,7 @@ pub fn tokenizer_set_post_processor( post_processor: ExTokenizersPostProcessor, ) -> ExTokenizersTokenizer { let mut new_tokenizer = tokenizer.resource.0.clone(); - new_tokenizer.with_post_processor(post_processor); + new_tokenizer.with_post_processor(Some(post_processor)); new_tokenizer.into() } @@ -248,7 +251,7 @@ pub fn tokenizer_set_decoder( decoder: ExTokenizersDecoder, ) -> ExTokenizersTokenizer { let mut new_tokenizer = tokenizer.resource.0.clone(); - new_tokenizer.with_decoder(decoder); + new_tokenizer.with_decoder(Some(decoder)); new_tokenizer.into() } diff --git a/native/ex_tokenizers/src/trainers.rs b/native/ex_tokenizers/src/trainers.rs index ac8c03d..9e0e87f 100644 --- a/native/ex_tokenizers/src/trainers.rs +++ b/native/ex_tokenizers/src/trainers.rs @@ -19,6 +19,9 @@ use crate::util::Info; pub struct ExTokenizersTrainerRef(pub RwLock); +#[rustler::resource_impl] +impl rustler::Resource for ExTokenizersTrainerRef {} + #[derive(rustler::NifStruct)] #[module = "Tokenizers.Trainer"] pub struct ExTokenizersTrainer { diff --git a/test/tokenizers/normalizer_test.exs b/test/tokenizers/normalizer_test.exs index c1e82d4..395e3b6 100644 --- a/test/tokenizers/normalizer_test.exs +++ b/test/tokenizers/normalizer_test.exs @@ -114,4 +114,21 @@ defmodule Tokenizers.NormalizerTest do {:ok, "Hello"} end end + + describe "ByteLevel" do + test "can be initialized" do + assert %Tokenizers.Normalizer{} = Tokenizers.Normalizer.byte_level() + end + + test "can normalize strings" do + # Test is taken directly from original Rust implementation + assert Tokenizers.Normalizer.byte_level() + |> Tokenizers.Normalizer.normalize("Hello 我今天能为你做什么") == + {:ok, "HelloĠæĪijä»Ĭ天èĥ½ä¸ºä½łåģļä»Ģä¹Ī"} + end + + test "returns alphabet" do + assert length(Tokenizers.Normalizer.byte_level_alphabet()) != 0 + end + end end