Skip to content

Commit

Permalink
Update tokenizers to 0.15.0
Browse files (browse the repository at this point in the history)
  • Loading branch information
cigrainger committed Dec 13, 2023
1 parent a8a7464 commit a56bd16
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 90 deletions.
2 changes: 1 addition & 1 deletion mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ defmodule Tokenizers.MixProject do
use Mix.Project

@source_url "https://github.com/elixir-nx/tokenizers"
@version "0.4.0"
@version "0.5.0-dev"

def project do
[
Expand Down
112 changes: 30 additions & 82 deletions native/ex_tokenizers/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion native/ex_tokenizers/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ crate-type = ["cdylib"]
anyhow = "1"
rustler = "0.29.1"
thiserror = "1"
tokenizers = { version = "0.13.3", default-features = false, features = ["onig", "esaxx_fast"]}
tokenizers = { version = "0.15.0", default-features = false, features = ["onig", "esaxx_fast"]}
serde = { version = "1.0", features = [ "rc", "derive" ] }
2 changes: 1 addition & 1 deletion native/ex_tokenizers/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ pub fn models_unigram_init(
};

Ok(ExTokenizersModel::new(
tokenizers::models::unigram::Unigram::from(vocab, unk_id)?,
tokenizers::models::unigram::Unigram::from(vocab, unk_id, false)?,
))
}

Expand Down
15 changes: 10 additions & 5 deletions native/ex_tokenizers/src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ fn apply_load_options(mut tokenizer: ExTokenizerImpl, options: Vec<LoadOption>)
}

if opts.disable_truncation {
tokenizer.with_truncation(None);
tokenizer.with_truncation(None).unwrap();
}

tokenizer
Expand Down Expand Up @@ -335,14 +335,14 @@ pub fn tokenizer_set_truncation(
TruncationOption::Direction(direction) => truncation.direction = direction.into(),
});
let mut new_tokenizer = tokenizer.resource.0.clone();
new_tokenizer.with_truncation(Some(truncation));
new_tokenizer.with_truncation(Some(truncation)).unwrap();
new_tokenizer.into()
}

#[rustler::nif]
pub fn tokenizer_disable_truncation(tokenizer: ExTokenizersTokenizer) -> ExTokenizersTokenizer {
let mut new_tokenizer = tokenizer.resource.0.clone();
new_tokenizer.with_truncation(None);
new_tokenizer.with_truncation(None).unwrap();
new_tokenizer.into()
}

Expand Down Expand Up @@ -530,7 +530,10 @@ pub fn tokenizer_decode(
}
});

Ok(tokenizer.resource.0.decode(ids, opts.skip_special_tokens)?)
Ok(tokenizer
.resource
.0
.decode(&ids, opts.skip_special_tokens)?)
}

#[rustler::nif(schedule = "DirtyCpu")]
Expand All @@ -551,10 +554,12 @@ pub fn tokenizer_decode_batch(
}
});

let sentence_slices: Vec<&[u32]> = sentences.iter().map(Vec::as_slice).collect();

Ok(tokenizer
.resource
.0
.decode_batch(sentences, opts.skip_special_tokens)?)
.decode_batch(sentence_slices.as_slice(), opts.skip_special_tokens)?)
}

#[rustler::nif]
Expand Down

0 comments on commit a56bd16

Please sign in to comment.