Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update tokenizers to 0.15.0 #55

Merged
merged 4 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 63 additions & 6 deletions flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

87 changes: 49 additions & 38 deletions flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -2,52 +2,63 @@
description = "Tokenizers";

inputs = {
fenix = {
url = "github:nix-community/fenix";
inputs.nixpkgs.follows = "nixpkgs";
};
nixpkgs.url = "nixpkgs/nixos-unstable";
flake-utils.url = "github:numtide/flake-utils";
};

outputs = { self, nixpkgs, flake-utils }:
outputs = {
self,
nixpkgs,
flake-utils,
fenix,
}:
flake-utils.lib.eachSystem [
flake-utils.lib.system.x86_64-linux
flake-utils.lib.system.x86_64-darwin
flake-utils.lib.system.aarch64-darwin
flake-utils.lib.system.aarch64-linux
]
(system:
let pkgs = import nixpkgs { inherit system; };
in
{
devShell = pkgs.mkShell {
buildInputs = with pkgs; [
act
binutils
cargo
cc
clang
clippy
elixir_1_14
erlang
gdb
gcc
libiconv
openssl
pkg-config
rustc
] ++ lib.optionals stdenv.isDarwin [
darwin.apple_sdk.frameworks.Foundation
darwin.apple_sdk.frameworks.Carbon
darwin.apple_sdk.frameworks.AppKit
];
shellHook = ''
mkdir -p .nix-mix
mkdir -p .nix-hex
export MIX_HOME=$PWD/.nix-mix
export HEX_HOME=$PWD/.nix-hex
export PATH=$MIX_HOME/bin:$PATH
export PATH=$HEX_HOME/bin:$PATH
export PATH=$MIX_HOME/escripts:$PATH
export ERL_AFLAGS="-kernel shell_history enabled"
'';
};
});
(system: let
pkgs = import nixpkgs {inherit system;};
in {
devShell = pkgs.mkShell {
buildInputs = with pkgs;
[
act
binutils
clang
elixir_1_15
(fenix.packages."${system}".complete.withComponents [
"cargo"
"clippy"
"rust-src"
"rustc"
"rustfmt"
])
gcc
libiconv
openssl
pkg-config
]
++ lib.optionals stdenv.isDarwin [
darwin.apple_sdk.frameworks.Foundation
darwin.apple_sdk.frameworks.Carbon
darwin.apple_sdk.frameworks.AppKit
];
shellHook = ''
mkdir -p .nix-mix
mkdir -p .nix-hex
export MIX_HOME=$PWD/.nix-mix
export HEX_HOME=$PWD/.nix-hex
export PATH=$MIX_HOME/bin:$PATH
export PATH=$HEX_HOME/bin:$PATH
export PATH=$MIX_HOME/escripts:$PATH
export ERL_AFLAGS="-kernel shell_history enabled"
'';
};
});
}
2 changes: 2 additions & 0 deletions lib/tokenizers/model/bpe.ex
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ defmodule Tokenizers.Model.BPE do
@typedoc """
Options for model initialisation.

* `:byte_fallback` - whether to use the byte fallback trick

* `:cache_capacity` - the number of words that the BPE cache can
contain. The cache speeds up the process by keeping
the result of the merge operations for a number of words.
Expand Down
2 changes: 2 additions & 0 deletions lib/tokenizers/model/unigram.ex
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ defmodule Tokenizers.Model.Unigram do
@typedoc """
Options for model initialisation.

* `:byte_fallback` - whether to use the byte fallback trick
* `:unk_id` - the unknown token id to be used by the model

"""
@type options() :: [
byte_fallback: boolean(),
unk_id: float()
]

Expand Down
2 changes: 1 addition & 1 deletion mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ defmodule Tokenizers.MixProject do
use Mix.Project

@source_url "https://github.com/elixir-nx/tokenizers"
@version "0.4.0"
@version "0.5.0-dev"

def project do
[
Expand Down
Loading