Add support for regular expressions in Tokenizers.PreTokenizer.split/3 #54

Merged (5 commits) on Oct 10, 2023
34 changes: 29 additions & 5 deletions lib/tokenizers/pre_tokenizer.ex
@@ -134,20 +134,44 @@ defmodule Tokenizers.PreTokenizer do
           | :contiguous
 
   @doc """
-  Creates a Split pre-tokenizer.
+  Creates a Split pre-tokenizer using a string as split pattern.
 
   Versatile pre-tokenizer that splits on provided pattern and according
-  to provided behavior. The pattern can be inverted if necessary.
+  to provided behavior.
 
   ## Options
 
     * `:invert` - whether to invert the split or not. Defaults to `false`
 
   """
   @spec split(String.t(), split_delimiter_behaviour(), keyword()) :: t()
-  defdelegate split(pattern, behavior, opts \\ []),
-    to: Tokenizers.Native,
-    as: :pre_tokenizers_split
+  def split(pattern, behavior, opts \\ []) when is_binary(pattern) do
+    Tokenizers.Native.pre_tokenizers_split({:string, pattern}, behavior, opts)
+  end
+
+  @doc ~S"""
+  Creates a Split pre-tokenizer using a regular expression as split pattern.
+
+  Versatile pre-tokenizer that splits on provided regex pattern and according
+  to provided behavior.
+
+  The `pattern` should be a string representing a regular expression
+  according to the [Oniguruma Regex Engine](https://github.com/kkos/oniguruma).
+
+  ## Options
+
+    * `:invert` - whether to invert the split or not. Defaults to `false`
+
+  ## Example
+
+      iex> Tokenizers.PreTokenizer.split_regex(~S(\?\d{2}\?), :removed)
+      #Tokenizers.PreTokenizer<[pre_tokenizer_type: "Split"]>
+
+  """
+  @spec split_regex(String.t(), split_delimiter_behaviour(), keyword()) :: t()
+  def split_regex(pattern, behavior, opts \\ []) when is_binary(pattern) do
+    Tokenizers.Native.pre_tokenizers_split({:regex, pattern}, behavior, opts)
+  end
 
   @doc """
   Creates a Punctuation pre-tokenizer.
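The net effect on the Elixir API: `split/3` now wraps its pattern as `{:string, pattern}` and treats it literally, while the new `split_regex/3` wraps it as `{:regex, pattern}` and hands it to the Oniguruma engine. A rough sketch of the difference, assuming `Tokenizers.PreTokenizer.pre_tokenize/2` from the same module (offsets shown are illustrative, not verified against this PR):

    # Literal pattern: only exact occurrences of "-" act as delimiters.
    iex> splitter = Tokenizers.PreTokenizer.split("-", :removed)
    iex> Tokenizers.PreTokenizer.pre_tokenize(splitter, "pre-tokenize")
    {:ok, [{"pre", {0, 3}}, {"tokenize", {4, 12}}]}

    # Regex pattern: a run of digits of any length acts as a delimiter.
    iex> splitter = Tokenizers.PreTokenizer.split_regex(~S(\d+), :removed)
    iex> Tokenizers.PreTokenizer.pre_tokenize(splitter, "ab12cd345ef")
    {:ok, [{"ab", {0, 2}}, {"cd", {4, 6}}, {"ef", {9, 11}}]}
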
16 changes: 14 additions & 2 deletions native/ex_tokenizers/src/pre_tokenizers.rs
@@ -2,6 +2,7 @@ use crate::util::Info
 use crate::{new_info, ExTokenizersError};
 use rustler::NifTaggedEnum;
 use serde::{Deserialize, Serialize};
+use tokenizers::pre_tokenizers::split::SplitPattern;
 use tokenizers::PreTokenizer;
 use tokenizers::{processors::byte_level::ByteLevel, PreTokenizedString, PreTokenizerWrapper};
 
@@ -241,24 +242,35 @@ pub enum SplitOption {
     Invert(bool),
 }
 
+#[derive(NifTaggedEnum)]
+pub enum LocalSplitPattern {
+    String(String),
+    Regex(String),
+}
+
 #[rustler::nif]
 pub fn pre_tokenizers_split(
-    pattern: String,
+    pattern: LocalSplitPattern,
     behavior: SplitDelimiterBehavior,
     options: Vec<SplitOption>,
 ) -> Result<ExTokenizersPreTokenizer, rustler::Error> {
     struct Opts {
         invert: bool,
     }
     let mut opts = Opts { invert: false };
+    let final_pattern = match pattern {
+        LocalSplitPattern::String(pattern) => SplitPattern::String(pattern),
+        LocalSplitPattern::Regex(pattern) => SplitPattern::Regex(pattern),
+    };
 
     for option in options {
         match option {
             SplitOption::Invert(invert) => opts.invert = invert,
         }
     }
 
     Ok(ExTokenizersPreTokenizer::new(
-        tokenizers::pre_tokenizers::split::Split::new(pattern, behavior.into(), opts.invert)
+        tokenizers::pre_tokenizers::split::Split::new(final_pattern, behavior.into(), opts.invert)
             .map_err(|_| rustler::Error::BadArg)?,
     ))
 }
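On the Rust side, the `NifTaggedEnum` derive is what lets the Elixir wrappers pass plain tagged tuples across the NIF boundary: `{:string, pattern}` decodes to `LocalSplitPattern::String` and `{:regex, pattern}` to `LocalSplitPattern::Regex`, which are then translated into the `SplitPattern` type that the `tokenizers` crate expects. Written out as the raw native calls the two public functions above make (shown for illustration only; in practice you would call `split/3` and `split_regex/3`):

    # Decodes to LocalSplitPattern::String("-") -> SplitPattern::String
    Tokenizers.Native.pre_tokenizers_split({:string, "-"}, :removed, [])

    # Decodes to LocalSplitPattern::Regex("\d+") -> SplitPattern::Regex,
    # with the keyword list decoding to SplitOption::Invert(true)
    Tokenizers.Native.pre_tokenizers_split({:regex, ~S(\d+)}, :removed, invert: true)
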
12 changes: 12 additions & 0 deletions test/tokenizers/pre_tokenizer_test.exs
@@ -24,6 +24,18 @@ defmodule Tokenizers.PreTokenizerTest do
     end
   end
 
+  describe "Regex split pretokenizer" do
+    test "accepts regular expressions" do
+      assert %Tokenizers.PreTokenizer{} =
+               Tokenizers.PreTokenizer.split_regex(".*", :removed)
+    end
+
+    test "accepts options" do
+      assert %Tokenizers.PreTokenizer{} =
+               Tokenizers.PreTokenizer.split_regex(".*", :removed, invert: true)
+    end
+  end
+
   describe "WhitespaceSplit pretokenizer" do
     test "accepts no parameters" do
       assert %Tokenizers.PreTokenizer{} = Tokenizers.PreTokenizer.whitespace_split()
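The new tests only assert that construction returns a `%Tokenizers.PreTokenizer{}` struct. A behavior-level test could additionally pin down the splitting itself; the sketch below assumes `Tokenizers.PreTokenizer.pre_tokenize/2` and its `{:ok, [{piece, {start, stop}}]}` return shape, so the expected value is illustrative rather than taken from this PR:

    test "splits on runs of whitespace" do
      pre = Tokenizers.PreTokenizer.split_regex(~S(\s+), :removed)

      # Hypothetical expectation: whitespace runs are dropped and each piece
      # keeps its byte offsets into the original input.
      assert {:ok, [{"Hello", {0, 5}}, {"world", {8, 13}}]} =
               Tokenizers.PreTokenizer.pre_tokenize(pre, "Hello   world")
    end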