From febea1301025856fd7f79bc19a11e94c4496ac32 Mon Sep 17 00:00:00 2001
From: egcash <85733533+egcash@users.noreply.github.com>
Date: Thu, 15 Aug 2024 10:44:55 +0530
Subject: [PATCH 1/3] =?UTF-8?q?Added=20Tokenizer=20for=20Claude=20Models?=
 =?UTF-8?q?=20=E2=9C=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tokencost/costs.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/tokencost/costs.py b/tokencost/costs.py
index feb931a..bfe0208 100644
--- a/tokencost/costs.py
+++ b/tokencost/costs.py
@@ -1,8 +1,10 @@
+
 """
 Costs dictionary and utility tool for counting tokens
 """
 import tiktoken
+import anthropic
 from typing import Union, List, Dict
 from .constants import TOKEN_COSTS
 from decimal import Decimal
@@ -39,6 +41,18 @@ def count_message_tokens(messages: List[Dict[str, str]], model: str) -> int:
     """
     model = model.lower()
     model = strip_ft_model_name(model)
+
+    if "claude-" in model:
+        """
+        Note that this is only accurate for older models, e.g. `claude-2.1`.
+        For newer models this can only be used as a _very_ rough estimate,
+        instead you should rely on the `usage` property in the response for exact counts.
+        """
+        prompt = "".join(message["content"] for message in messages)
+        client = anthropic.Client()
+        token_count = client.count_tokens(prompt)
+        return token_count
+
     try:
         encoding = tiktoken.encoding_for_model(model)
     except KeyError:
@@ -104,6 +118,16 @@ def count_string_tokens(prompt: str, model: str) -> int:
         int: The number of tokens in the text string.
     """
     model = model.lower()
+    if "claude-" in model:
+        """
+        Note that this is only accurate for older models, e.g. `claude-2.1`.
+        For newer models this can only be used as a _very_ rough estimate,
+        instead you should rely on the `usage` property in the response for exact counts.
+        """
+        client = anthropic.Client()
+        token_count = client.count_tokens(prompt)
+        return token_count
+
     try:
         encoding = tiktoken.encoding_for_model(model)
     except KeyError:

From d6ea769351bafa15265c94ef919f4d8158c373a5 Mon Sep 17 00:00:00 2001
From: egcash <85733533+egcash@users.noreply.github.com>
Date: Thu, 15 Aug 2024 10:47:54 +0530
Subject: [PATCH 2/3] New Dependency: Anthropic

---
 pyproject.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4085f4b..0c3405b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,8 @@ classifiers = [
 ]
 dependencies = [
     "tiktoken>=0.7.0",
-    "aiohttp>=3.9.3"
+    "aiohttp>=3.9.3",
+    "anthropic>=0.34.0"
 ]
 
 [project.optional-dependencies]

From e91c68a71b7edfa29204568ab2e0a30318a97edb Mon Sep 17 00:00:00 2001
From: egcash <85733533+egcash@users.noreply.github.com>
Date: Thu, 15 Aug 2024 11:08:38 +0530
Subject: [PATCH 3/3] Added Claude token Counter

---
 tokencost/costs.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tokencost/costs.py b/tokencost/costs.py
index bfe0208..ce29d0d 100644
--- a/tokencost/costs.py
+++ b/tokencost/costs.py
@@ -49,9 +49,7 @@ def count_message_tokens(messages: List[Dict[str, str]], model: str) -> int:
         instead you should rely on the `usage` property in the response for exact counts.
         """
         prompt = "".join(message["content"] for message in messages)
-        client = anthropic.Client()
-        token_count = client.count_tokens(prompt)
-        return token_count
+        return count_string_tokens(prompt,model)
 
     try:
         encoding = tiktoken.encoding_for_model(model)