Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update tokenizer dependency to new minor and chain changes #62

Merged
merged 3 commits into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lib/tokenizers/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ defmodule Tokenizers.Native do
def normalizers_replace(_pattern, _content), do: err()
def normalizers_nmt(), do: err()
def normalizers_precompiled(_data), do: err()
def normalizers_byte_level(), do: err()
def normalizers_byte_level_alphabet(), do: err()

# PreTokenizers
def pre_tokenizers_pre_tokenize(_pre_tokenizer, _input), do: err()
Expand Down
87 changes: 50 additions & 37 deletions lib/tokenizers/normalizer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ defmodule Tokenizers.Normalizer do
@spec normalize(t(), String.t()) :: {:ok, String.t()}
defdelegate normalize(normalizer, input), to: Tokenizers.Native, as: :normalizers_normalize

# Normalizer entities. Following the order in https://docs.rs/tokenizers/0.20.0/src/tokenizers/normalizers/mod.rs.html#24

@doc """
Takes care of normalizing raw text before giving it to a BERT model.

Expand Down Expand Up @@ -49,30 +51,6 @@ defmodule Tokenizers.Normalizer do
to: Tokenizers.Native,
as: :normalizers_bert_normalizer

@doc """
Creates a NFD Unicode normalizer.
"""
@spec nfd :: t()
defdelegate nfd(), to: Tokenizers.Native, as: :normalizers_nfd

@doc """
Creates a NFKD Unicode normalizer.
"""
@spec nfkd :: t()
defdelegate nfkd(), to: Tokenizers.Native, as: :normalizers_nfkd

@doc """
Creates a NFC Unicode normalizer.
"""
@spec nfc :: t()
defdelegate nfc(), to: Tokenizers.Native, as: :normalizers_nfc

@doc """
Creates a NFKC Unicode normalizer.
"""
@spec nfkc :: t()
defdelegate nfkc(), to: Tokenizers.Native, as: :normalizers_nfkc

@doc """
Creates a Strip normalizer.

Expand All @@ -89,12 +67,6 @@ defmodule Tokenizers.Normalizer do
@spec strip(keyword()) :: t()
defdelegate strip(opts \\ []), to: Tokenizers.Native, as: :normalizers_strip

@doc """
Creates a Prepend normalizer.
"""
@spec prepend(prepend :: String.t()) :: t()
defdelegate prepend(prepend), to: Tokenizers.Native, as: :normalizers_prepend

@doc """
Creates a Strip Accent normalizer.

Expand All @@ -104,6 +76,30 @@ defmodule Tokenizers.Normalizer do
@spec strip_accents :: t()
defdelegate strip_accents(), to: Tokenizers.Native, as: :normalizers_strip_accents

@doc """
Creates a NFC Unicode normalizer.
"""
@spec nfc :: t()
defdelegate nfc(), to: Tokenizers.Native, as: :normalizers_nfc

@doc """
Creates a NFD Unicode normalizer.
"""
@spec nfd :: t()
defdelegate nfd(), to: Tokenizers.Native, as: :normalizers_nfd

@doc """
Creates a NFKC Unicode normalizer.
"""
@spec nfkc :: t()
defdelegate nfkc(), to: Tokenizers.Native, as: :normalizers_nfkc

@doc """
Creates a NFKD Unicode normalizer.
"""
@spec nfkd :: t()
defdelegate nfkd(), to: Tokenizers.Native, as: :normalizers_nfkd

@doc """
Composes multiple normalizers that will run in the provided order.
"""
Expand All @@ -116,6 +112,20 @@ defmodule Tokenizers.Normalizer do
@spec lowercase :: t()
defdelegate lowercase(), to: Tokenizers.Native, as: :normalizers_lowercase

@doc """
Creates a Nmt normalizer.
"""
@spec nmt :: t()
defdelegate nmt(), to: Tokenizers.Native, as: :normalizers_nmt

@doc """
Precompiled normalizer.

Don’t use it manually; it exists for compatibility with SentencePiece.
"""
@spec precompiled(binary()) :: {:ok, t()} | {:error, any()}
defdelegate precompiled(data), to: Tokenizers.Native, as: :normalizers_precompiled

@doc """
Replaces a custom `search` string with the given `content`.
"""
Expand All @@ -136,18 +146,21 @@ defmodule Tokenizers.Normalizer do
end

@doc """
Creates a Nmt normalizer.
Creates a Prepend normalizer.
"""
@spec nmt :: t()
defdelegate nmt(), to: Tokenizers.Native, as: :normalizers_nmt
@spec prepend(prepend :: String.t()) :: t()
defdelegate prepend(prepend), to: Tokenizers.Native, as: :normalizers_prepend

@doc """
Precompiled normalizer.
Creates a ByteLevel normalizer.
"""
@spec byte_level :: t()
defdelegate byte_level(), to: Tokenizers.Native, as: :normalizers_byte_level

Don’t use it manually; it exists for compatibility with SentencePiece.
@doc """
Gets ByteLevel normalizer's alphabet.
"""
@spec precompiled(binary()) :: {:ok, t()} | {:error, any()}
defdelegate precompiled(data), to: Tokenizers.Native, as: :normalizers_precompiled
defdelegate byte_level_alphabet(), to: Tokenizers.Native, as: :normalizers_byte_level_alphabet
end

defimpl Inspect, for: Tokenizers.Normalizer do
Expand Down
2 changes: 1 addition & 1 deletion mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ defmodule Tokenizers.MixProject do
use Mix.Project

@source_url "https://github.com/elixir-nx/tokenizers"
@version "0.5.0-dev"
@version "0.6.0-dev"

def project do
[
Expand Down
8 changes: 4 additions & 4 deletions mix.lock
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
%{
"castore": {:hex, :castore, "1.0.8", "dedcf20ea746694647f883590b82d9e96014057aff1d44d03ec90f36a5c0dc6e", [:mix], [], "hexpm", "0b2b66d2ee742cb1d9cb8c8be3b43c3a70ee8651f37b75a8b982e036752983f1"},
"castore": {:hex, :castore, "1.0.9", "5cc77474afadf02c7c017823f460a17daa7908e991b0cc917febc90e466a375c", [:mix], [], "hexpm", "5ea956504f1ba6f2b4eb707061d8e17870de2bee95fb59d512872c2ef06925e7"},
"earmark_parser": {:hex, :earmark_parser, "1.4.41", "ab34711c9dc6212dda44fcd20ecb87ac3f3fce6f0ca2f28d4a00e4154f8cd599", [:mix], [], "hexpm", "a81a04c7e34b6617c2792e291b5a2e57ab316365c2644ddc553bb9ed863ebefa"},
"ex_doc": {:hex, :ex_doc, "0.34.2", "13eedf3844ccdce25cfd837b99bea9ad92c4e511233199440488d217c92571e8", [:mix], [{:earmark_parser, "~> 1.4.39", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "5ce5f16b41208a50106afed3de6a2ed34f4acfd65715b82a0b84b49d995f95c1"},
"finch": {:hex, :finch, "0.18.0", "944ac7d34d0bd2ac8998f79f7a811b21d87d911e77a786bc5810adb75632ada4", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.3", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 0.2.6 or ~> 1.0", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "69f5045b042e531e53edc2574f15e25e735b522c37e2ddb766e15b979e03aa65"},
"finch": {:hex, :finch, "0.19.0", "c644641491ea854fc5c1bbaef36bfc764e3f08e7185e1f084e35e0672241b76d", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "fc5324ce209125d1e2fa0fcd2634601c52a787aff1cd33ee833664a5af4ea2b6"},
"hpax": {:hex, :hpax, "1.0.0", "28dcf54509fe2152a3d040e4e3df5b265dcb6cb532029ecbacf4ce52caea3fd2", [:mix], [], "hexpm", "7f1314731d711e2ca5fdc7fd361296593fc2542570b3105595bb0bc6d0fad601"},
"jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"},
"makeup": {:hex, :makeup, "1.1.2", "9ba8837913bdf757787e71c1581c21f9d2455f4dd04cfca785c70bbfff1a76a3", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cce1566b81fbcbd21eca8ffe808f33b221f9eee2cbc7a1706fc3da9ff18e6cac"},
Expand All @@ -15,7 +15,7 @@
"nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"},
"req": {:hex, :req, "0.5.6", "8fe1eead4a085510fe3d51ad854ca8f20a622aae46e97b302f499dfb84f726ac", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "cfaa8e720945d46654853de39d368f40362c2641c4b2153c886418914b372185"},
"rustler": {:hex, :rustler, "0.34.0", "e9a73ee419fc296a10e49b415a2eb87a88c9217aa0275ec9f383d37eed290c1c", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:req, "~> 0.5", [hex: :req, repo: "hexpm", optional: false]}, {:toml, "~> 0.6", [hex: :toml, repo: "hexpm", optional: false]}], "hexpm", "1d0c7449482b459513003230c0e2422b0252245776fe6fd6e41cb2b11bd8e628"},
"rustler_precompiled": {:hex, :rustler_precompiled, "0.6.2", "d2218ba08a43fa331957f30481d00b666664d7e3861431b02bd3f4f30eec8e5b", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "b9048eaed8d7d14a53f758c91865cc616608a438d2595f621f6a4b32a5511709"},
"telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"},
"rustler_precompiled": {:hex, :rustler_precompiled, "0.8.2", "5f25cbe220a8fac3e7ad62e6f950fcdca5a5a5f8501835d2823e8c74bf4268d5", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "63d1bd5f8e23096d1ff851839923162096364bac8656a4a3c00d1fff8e83ee0a"},
"telemetry": {:hex, :telemetry, "1.3.0", "fedebbae410d715cf8e7062c96a1ef32ec22e764197f70cda73d82778d61e7a2", [:rebar3], [], "hexpm", "7015fc8919dbe63764f4b4b87a95b7c0996bd539e0d499be6ec9d7f3875b79e6"},
"toml": {:hex, :toml, "0.7.0", "fbcd773caa937d0c7a02c301a1feea25612720ac3fa1ccb8bfd9d30d822911de", [:mix], [], "hexpm", "0690246a2478c1defd100b0c9b89b4ea280a22be9a7b313a8a058a2408a2fa70"},
}
Loading
Loading