From 7eda3d344c8c46f0ab87635fa97eea30a228fb4b Mon Sep 17 00:00:00 2001 From: Michael Ruoss Date: Sat, 30 Sep 2023 21:17:36 +0200 Subject: [PATCH 1/5] Add support for regular expressions in Tokenizers.PreTokenizer.split/3 --- lib/tokenizers/pre_tokenizer.ex | 18 +++++++++++++----- native/ex_tokenizers/src/pre_tokenizers.rs | 7 ++++++- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/lib/tokenizers/pre_tokenizer.ex b/lib/tokenizers/pre_tokenizer.ex index b9f4540..0351284 100644 --- a/lib/tokenizers/pre_tokenizer.ex +++ b/lib/tokenizers/pre_tokenizer.ex @@ -137,17 +137,25 @@ defmodule Tokenizers.PreTokenizer do Creates a Split pre-tokenizer. Versatile pre-tokenizer that splits on provided pattern and according - to provided behavior. The pattern can be inverted if necessary. + to provided behavior. The pattern can be a string or a regular expression. + The pattern can be inverted if necessary. ## Options * `:invert` - whether to invert the split or not. Defaults to `false` """ - @spec split(String.t(), split_delimiter_behaviour(), keyword()) :: t() - defdelegate split(pattern, behavior, opts \\ []), - to: Tokenizers.Native, - as: :pre_tokenizers_split + @spec split(String.t() | Regex.t(), split_delimiter_behaviour(), keyword()) :: t() + def split(pattern, behavior, opts \\ []) + + def split(pattern, behavior, opts) when is_binary(pattern) do + Tokenizers.Native.pre_tokenizers_split(pattern, behavior, opts) + end + + def split(%Regex{} = pattern, behavior, opts) do + split(Regex.source(pattern), behavior, Keyword.put(opts, :use_regex, true)) + end + @doc """ Creates a Punctuation pre-tokenizer. diff --git a/native/ex_tokenizers/src/pre_tokenizers.rs b/native/ex_tokenizers/src/pre_tokenizers.rs index 0e4f875..fce8b24 100644 --- a/native/ex_tokenizers/src/pre_tokenizers.rs +++ b/native/ex_tokenizers/src/pre_tokenizers.rs @@ -3,6 +3,7 @@ use crate::{new_info, ExTokenizersError}; use rustler::NifTaggedEnum; use serde::{Deserialize, Serialize}; use tokenizers::PreTokenizer; +use tokenizers::pre_tokenizers::split::SplitPattern; use tokenizers::{processors::byte_level::ByteLevel, PreTokenizedString, PreTokenizerWrapper}; pub struct ExTokenizersPreTokenizerRef(pub PreTokenizerWrapper); @@ -239,6 +240,7 @@ impl From for tokenizers::SplitDelimiterBehavior { #[derive(NifTaggedEnum)] pub enum SplitOption { Invert(bool), + UseRegex(bool) } #[rustler::nif] @@ -251,14 +253,17 @@ pub fn pre_tokenizers_split( invert: bool, } let mut opts = Opts { invert: false }; + let mut final_pattern = SplitPattern::String(String::from("")); for option in options { match option { SplitOption::Invert(invert) => opts.invert = invert, + SplitOption::UseRegex(true) => final_pattern = SplitPattern::Regex(pattern.to_owned()), + SplitOption::UseRegex(false) => final_pattern = SplitPattern::String(pattern.to_owned()), } } Ok(ExTokenizersPreTokenizer::new( - tokenizers::pre_tokenizers::split::Split::new(pattern, behavior.into(), opts.invert) + tokenizers::pre_tokenizers::split::Split::new(final_pattern, behavior.into(), opts.invert) .map_err(|_| rustler::Error::BadArg)?, )) } From b3bfd26b0ce3f44888a6f00e40e7dee236caa54c Mon Sep 17 00:00:00 2001 From: Michael Ruoss Date: Sat, 30 Sep 2023 21:57:23 +0200 Subject: [PATCH 2/5] add a test to verify split accepts regular expressions --- test/tokenizers/pre_tokenizer_test.exs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/tokenizers/pre_tokenizer_test.exs b/test/tokenizers/pre_tokenizer_test.exs index d50b51e..3ed2e66 100644 --- a/test/tokenizers/pre_tokenizer_test.exs +++ b/test/tokenizers/pre_tokenizer_test.exs @@ -22,6 +22,11 @@ defmodule Tokenizers.PreTokenizerTest do assert %Tokenizers.PreTokenizer{} = Tokenizers.PreTokenizer.split(" ", :removed, invert: true) end + + test "accepts regular expressions" do + assert %Tokenizers.PreTokenizer{} = + Tokenizers.PreTokenizer.split(~r/.*/, :removed) + end end describe "WhitespaceSplit pretokenizer" do From 171ebc16c5400a614c0148934a88927902d45671 Mon Sep 17 00:00:00 2001 From: Michael Ruoss Date: Sun, 8 Oct 2023 13:33:13 +0200 Subject: [PATCH 3/5] use tuples to pass the pattern as enum --- lib/tokenizers/pre_tokenizer.ex | 15 +++++++++------ native/ex_tokenizers/src/pre_tokenizers.rs | 17 ++++++++++++----- test/tokenizers/pre_tokenizer_test.exs | 14 +++++++++----- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/lib/tokenizers/pre_tokenizer.ex b/lib/tokenizers/pre_tokenizer.ex index 0351284..5e0cbf8 100644 --- a/lib/tokenizers/pre_tokenizer.ex +++ b/lib/tokenizers/pre_tokenizer.ex @@ -137,23 +137,26 @@ defmodule Tokenizers.PreTokenizer do Creates a Split pre-tokenizer. Versatile pre-tokenizer that splits on provided pattern and according - to provided behavior. The pattern can be a string or a regular expression. - The pattern can be inverted if necessary. + to provided behavior. The pattern should be in the form of a tuple + `{:string, pattern}` or `{:regex, pattern}` depending on whether the tuple is + a regular expression or not. For convenience, a simple binary is accepted + as well in which case the pattern is converted to the tuple + `{:string, pattern}`. ## Options * `:invert` - whether to invert the split or not. Defaults to `false` """ - @spec split(String.t() | Regex.t(), split_delimiter_behaviour(), keyword()) :: t() + @spec split(String.t() | {:string, String.t()}| {:regex, String.t()} , split_delimiter_behaviour(), keyword()) :: t() def split(pattern, behavior, opts \\ []) def split(pattern, behavior, opts) when is_binary(pattern) do - Tokenizers.Native.pre_tokenizers_split(pattern, behavior, opts) + split({:string, pattern}, behavior, opts) end - def split(%Regex{} = pattern, behavior, opts) do - split(Regex.source(pattern), behavior, Keyword.put(opts, :use_regex, true)) + def split(pattern, behavior, opts) do + Tokenizers.Native.pre_tokenizers_split(pattern, behavior, opts) end diff --git a/native/ex_tokenizers/src/pre_tokenizers.rs b/native/ex_tokenizers/src/pre_tokenizers.rs index fce8b24..0874a3d 100644 --- a/native/ex_tokenizers/src/pre_tokenizers.rs +++ b/native/ex_tokenizers/src/pre_tokenizers.rs @@ -240,12 +240,17 @@ impl From for tokenizers::SplitDelimiterBehavior { #[derive(NifTaggedEnum)] pub enum SplitOption { Invert(bool), - UseRegex(bool) +} + +#[derive(NifTaggedEnum)] +pub enum LocalSplitPattern { + String(String), + Regex(String) } #[rustler::nif] pub fn pre_tokenizers_split( - pattern: String, + pattern: LocalSplitPattern, behavior: SplitDelimiterBehavior, options: Vec, ) -> Result { @@ -253,12 +258,14 @@ pub fn pre_tokenizers_split( invert: bool, } let mut opts = Opts { invert: false }; - let mut final_pattern = SplitPattern::String(String::from("")); + let final_pattern = match pattern { + LocalSplitPattern::String(pattern) => SplitPattern::String(pattern), + LocalSplitPattern::Regex(pattern) => SplitPattern::Regex(pattern), + }; + for option in options { match option { SplitOption::Invert(invert) => opts.invert = invert, - SplitOption::UseRegex(true) => final_pattern = SplitPattern::Regex(pattern.to_owned()), - SplitOption::UseRegex(false) => final_pattern = SplitPattern::String(pattern.to_owned()), } } diff --git a/test/tokenizers/pre_tokenizer_test.exs b/test/tokenizers/pre_tokenizer_test.exs index 3ed2e66..48fed44 100644 --- a/test/tokenizers/pre_tokenizer_test.exs +++ b/test/tokenizers/pre_tokenizer_test.exs @@ -15,17 +15,21 @@ defmodule Tokenizers.PreTokenizerTest do describe "Split pretokenizer" do test "accepts no parameters" do - assert %Tokenizers.PreTokenizer{} = Tokenizers.PreTokenizer.split(" ", :removed) + assert %Tokenizers.PreTokenizer{} = Tokenizers.PreTokenizer.split({:string, " "}, :removed) end - test "accepts options" do + test "accepts regular expressions" do assert %Tokenizers.PreTokenizer{} = - Tokenizers.PreTokenizer.split(" ", :removed, invert: true) + Tokenizers.PreTokenizer.split({:regex, ~S/.*/}, :removed) end - test "accepts regular expressions" do + test "accepts binaries" do + assert %Tokenizers.PreTokenizer{} = Tokenizers.PreTokenizer.split(" ", :removed) + end + + test "accepts options" do assert %Tokenizers.PreTokenizer{} = - Tokenizers.PreTokenizer.split(~r/.*/, :removed) + Tokenizers.PreTokenizer.split(" ", :removed, invert: true) end end From 11dc09daa6812ba1753f71c46c2a7e11157c53a4 Mon Sep 17 00:00:00 2001 From: Michael Ruoss Date: Sun, 8 Oct 2023 22:10:17 +0200 Subject: [PATCH 4/5] implement regex split pre-tokenizer as separate function --- lib/tokenizers/pre_tokenizer.ex | 41 +++++++++++++++++--------- test/tokenizers/pre_tokenizer_test.exs | 15 ++++++---- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/lib/tokenizers/pre_tokenizer.ex b/lib/tokenizers/pre_tokenizer.ex index 5e0cbf8..091f39f 100644 --- a/lib/tokenizers/pre_tokenizer.ex +++ b/lib/tokenizers/pre_tokenizer.ex @@ -134,31 +134,44 @@ defmodule Tokenizers.PreTokenizer do | :contiguous @doc """ - Creates a Split pre-tokenizer. + Creates a Split pre-tokenizer using a string as split pattern. Versatile pre-tokenizer that splits on provided pattern and according - to provided behavior. The pattern should be in the form of a tuple - `{:string, pattern}` or `{:regex, pattern}` depending on whether the tuple is - a regular expression or not. For convenience, a simple binary is accepted - as well in which case the pattern is converted to the tuple - `{:string, pattern}`. + to provided behavior. ## Options * `:invert` - whether to invert the split or not. Defaults to `false` """ - @spec split(String.t() | {:string, String.t()}| {:regex, String.t()} , split_delimiter_behaviour(), keyword()) :: t() - def split(pattern, behavior, opts \\ []) - - def split(pattern, behavior, opts) when is_binary(pattern) do - split({:string, pattern}, behavior, opts) + @spec split(String.t(), split_delimiter_behaviour(), keyword()) :: t() + def split(pattern, behavior, opts \\ []) when is_binary(pattern) do + Tokenizers.Native.pre_tokenizers_split({:string, pattern}, behavior, opts) end - def split(pattern, behavior, opts) do - Tokenizers.Native.pre_tokenizers_split(pattern, behavior, opts) - end + @doc ~S""" + Creates a Split pre-tokenizer using a regular expression as split pattern. + + Versatile pre-tokenizer that splits on provided regex pattern and according + to provided behavior. + + The `pattern` should be a string representing a regular expression + according to the [Oniguruma Regex Engine](https://github.com/kkos/oniguruma). + + ## Options + * `:invert` - whether to invert the split or not. Defaults to `false` + + ## Example + + iex> Tokenizers.PreTokenizer.split_regex(~S(\?\d{2}\?), :removed) + #Tokenizers.PreTokenizer<[pre_tokenizer_type: "Split"]> + + """ + @spec split_regex(String.t(), split_delimiter_behaviour(), keyword()) :: t() + def split_regex(pattern, behavior, opts \\ []) when is_binary(pattern) do + Tokenizers.Native.pre_tokenizers_split({:regex, pattern}, behavior, opts) + end @doc """ Creates a Punctuation pre-tokenizer. diff --git a/test/tokenizers/pre_tokenizer_test.exs b/test/tokenizers/pre_tokenizer_test.exs index 48fed44..990537a 100644 --- a/test/tokenizers/pre_tokenizer_test.exs +++ b/test/tokenizers/pre_tokenizer_test.exs @@ -15,21 +15,24 @@ defmodule Tokenizers.PreTokenizerTest do describe "Split pretokenizer" do test "accepts no parameters" do - assert %Tokenizers.PreTokenizer{} = Tokenizers.PreTokenizer.split({:string, " "}, :removed) + assert %Tokenizers.PreTokenizer{} = Tokenizers.PreTokenizer.split(" ", :removed) end - test "accepts regular expressions" do + test "accepts options" do assert %Tokenizers.PreTokenizer{} = - Tokenizers.PreTokenizer.split({:regex, ~S/.*/}, :removed) + Tokenizers.PreTokenizer.split(" ", :removed, invert: true) end + end - test "accepts binaries" do - assert %Tokenizers.PreTokenizer{} = Tokenizers.PreTokenizer.split(" ", :removed) + describe "Regex split pretokenizer" do + test "accepts regular expressions" do + assert %Tokenizers.PreTokenizer{} = + Tokenizers.PreTokenizer.split_regex(".*", :removed) end test "accepts options" do assert %Tokenizers.PreTokenizer{} = - Tokenizers.PreTokenizer.split(" ", :removed, invert: true) + Tokenizers.PreTokenizer.split_regex(".*", :removed, invert: true) end end From e6287bfdc9b73c919e3bf23190837afe4b2ff393 Mon Sep 17 00:00:00 2001 From: Michael Ruoss Date: Tue, 10 Oct 2023 17:31:25 +0200 Subject: [PATCH 5/5] fix the formatting in pre_tokenizers.rs --- native/ex_tokenizers/src/pre_tokenizers.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/native/ex_tokenizers/src/pre_tokenizers.rs b/native/ex_tokenizers/src/pre_tokenizers.rs index 0874a3d..062adf8 100644 --- a/native/ex_tokenizers/src/pre_tokenizers.rs +++ b/native/ex_tokenizers/src/pre_tokenizers.rs @@ -2,8 +2,8 @@ use crate::util::Info; use crate::{new_info, ExTokenizersError}; use rustler::NifTaggedEnum; use serde::{Deserialize, Serialize}; -use tokenizers::PreTokenizer; use tokenizers::pre_tokenizers::split::SplitPattern; +use tokenizers::PreTokenizer; use tokenizers::{processors::byte_level::ByteLevel, PreTokenizedString, PreTokenizerWrapper}; pub struct ExTokenizersPreTokenizerRef(pub PreTokenizerWrapper); @@ -245,7 +245,7 @@ pub enum SplitOption { #[derive(NifTaggedEnum)] pub enum LocalSplitPattern { String(String), - Regex(String) + Regex(String), } #[rustler::nif]