Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecate gpt-4-vision-preview model #630

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,4 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
# TTS_PRICES=0.015,0.030
# BOT_LANGUAGE=en
# ENABLE_VISION_FOLLOW_UP_QUESTIONS="true"
# VISION_MODEL="gpt-4-vision-preview"
# VISION_MODEL="gpt-4o"
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di
| `SHOW_USAGE` | Whether to show OpenAI token usage information after each response | `false` |
| `STREAM` | Whether to stream responses. **Note**: incompatible, if enabled, with `N_CHOICES` higher than 1 | `true` |
| `MAX_TOKENS` | Upper bound on how many tokens the ChatGPT API will return | `1200` for GPT-3, `2400` for GPT-4 |
| `VISION_MAX_TOKENS` | Upper bound on how many tokens vision models will return | `300` for gpt-4-vision-preview |
| `VISION_MODEL` | The Vision to Speech model to use. Allowed values: `gpt-4-vision-preview` | `gpt-4-vision-preview` |
| `VISION_MAX_TOKENS` | Upper bound on how many tokens vision models will return | `300` for gpt-4o |
| `VISION_MODEL` | The vision model to use for image interpretation. Allowed values: `gpt-4o` | `gpt-4o` |
| `ENABLE_VISION_FOLLOW_UP_QUESTIONS` | If true, once you send an image to the bot, it uses the configured VISION_MODEL until the conversation ends. Otherwise, it uses the OPENAI_MODEL to follow the conversation. Allowed values: `true` or `false` | `true` |
| `MAX_HISTORY_SIZE` | Max number of messages to keep in memory, after which the conversation will be summarised to avoid excessive token usage | `15` |
| `MAX_CONVERSATION_AGE_MINUTES` | Maximum number of minutes a conversation should live since the last message, after which the conversation will be reset | `180` |
Expand Down
2 changes: 1 addition & 1 deletion bot/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def main():
'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
'show_plugins_used': os.environ.get('SHOW_PLUGINS_USED', 'false').lower() == 'true',
'whisper_prompt': os.environ.get('WHISPER_PROMPT', ''),
'vision_model': os.environ.get('VISION_MODEL', 'gpt-4-vision-preview'),
'vision_model': os.environ.get('VISION_MODEL', 'gpt-4o'),
'enable_vision_follow_up_questions': os.environ.get('ENABLE_VISION_FOLLOW_UP_QUESTIONS', 'true').lower() == 'true',
'vision_prompt': os.environ.get('VISION_PROMPT', 'What is in this image'),
'vision_detail': os.environ.get('VISION_DETAIL', 'auto'),
Expand Down
58 changes: 31 additions & 27 deletions bot/openai_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@
GPT_3_16K_MODELS = ("gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-0125")
GPT_4_MODELS = ("gpt-4", "gpt-4-0314", "gpt-4-0613", "gpt-4-turbo-preview")
GPT_4_32K_MODELS = ("gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-0613")
GPT_4_VISION_MODELS = ("gpt-4-vision-preview",)
GPT_4_128K_MODELS = ("gpt-4-1106-preview","gpt-4-0125-preview","gpt-4-turbo-preview", "gpt-4-turbo", "gpt-4-turbo-2024-04-09")
GPT_4_VISION_MODELS = ("gpt-4o",)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gpt-4o-mini is also recommended here

GPT_4_128K_MODELS = (
"gpt-4-1106-preview", "gpt-4-0125-preview", "gpt-4-turbo-preview", "gpt-4-turbo", "gpt-4-turbo-2024-04-09")
GPT_4O_MODELS = ("gpt-4o",)
GPT_ALL_MODELS = GPT_3_MODELS + GPT_3_16K_MODELS + GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS + GPT_4_128K_MODELS + GPT_4O_MODELS

Expand Down Expand Up @@ -64,13 +65,13 @@ def are_functions_available(model: str) -> bool:
if model in ("gpt-3.5-turbo-0301", "gpt-4-0314", "gpt-4-32k-0314"):
return False
# Stable models will be updated to support functions on June 27, 2023
if model in ("gpt-3.5-turbo", "gpt-3.5-turbo-1106", "gpt-4", "gpt-4-32k","gpt-4-1106-preview","gpt-4-0125-preview","gpt-4-turbo-preview"):
if model in (
"gpt-3.5-turbo", "gpt-3.5-turbo-1106", "gpt-4", "gpt-4-32k", "gpt-4-1106-preview", "gpt-4-0125-preview",
"gpt-4-turbo-preview", "gpt-4o"):
return datetime.date.today() > datetime.date(2023, 6, 27)
# Models gpt-3.5-turbo-0613 and gpt-3.5-turbo-16k-0613 will be deprecated on June 13, 2024
if model in ("gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613"):
return datetime.date.today() < datetime.date(2024, 6, 13)
if model == 'gpt-4-vision-preview':
return False
return True


Expand Down Expand Up @@ -249,7 +250,8 @@ async def __common_get_chat_response(self, chat_id: int, query: str, stream=Fals
self.conversations[chat_id] = self.conversations[chat_id][-self.config['max_history_size']:]

common_args = {
'model': self.config['model'] if not self.conversations_vision[chat_id] else self.config['vision_model'],
'model': self.config['model'] if not self.conversations_vision[chat_id] else self.config[
'vision_model'],
'messages': self.conversations[chat_id],
'temperature': self.config['temperature'],
'n': self.config['n_choices'],
Expand Down Expand Up @@ -385,7 +387,8 @@ async def transcribe(self, filename):
try:
with open(filename, "rb") as audio:
prompt_text = self.config['whisper_prompt']
result = await self.client.audio.transcriptions.create(model="whisper-1", file=audio, prompt=prompt_text)
result = await self.client.audio.transcriptions.create(model="whisper-1", file=audio,
prompt=prompt_text)
return result.text
except Exception as e:
logging.exception(e)
Expand Down Expand Up @@ -429,7 +432,7 @@ async def __common_get_chat_response_vision(self, chat_id: int, content: list, s
if exceeded_max_tokens or exceeded_max_history_size:
logging.info(f'Chat history for chat ID {chat_id} is too long. Summarising...')
try:

last = self.conversations[chat_id][-1]
summary = await self.__summarise(self.conversations[chat_id][:-1])
logging.debug(f'Summary: {summary}')
Expand All @@ -440,28 +443,27 @@ async def __common_get_chat_response_vision(self, chat_id: int, content: list, s
logging.warning(f'Error while summarising chat history: {str(e)}. Popping elements instead...')
self.conversations[chat_id] = self.conversations[chat_id][-self.config['max_history_size']:]

message = {'role':'user', 'content':content}
message = {'role': 'user', 'content': content}

common_args = {
'model': self.config['vision_model'],
'messages': self.conversations[chat_id][:-1] + [message],
'temperature': self.config['temperature'],
'n': 1, # several choices is not implemented yet
'n': 1, # several choices is not implemented yet
'max_tokens': self.config['vision_max_tokens'],
'presence_penalty': self.config['presence_penalty'],
'frequency_penalty': self.config['frequency_penalty'],
'stream': stream
}


# vision model does not yet support functions

# if self.config['enable_functions']:
# functions = self.plugin_manager.get_functions_specs()
# if len(functions) > 0:
# common_args['functions'] = self.plugin_manager.get_functions_specs()
# common_args['function_call'] = 'auto'

return await self.client.chat.completions.create(**common_args)

except openai.RateLimitError as e:
Expand All @@ -473,23 +475,21 @@ async def __common_get_chat_response_vision(self, chat_id: int, content: list, s
except Exception as e:
raise Exception(f"⚠️ _{localized_text('error', bot_language)}._ ⚠️\n{str(e)}") from e


async def interpret_image(self, chat_id, fileobj, prompt=None):
"""
Interprets a given PNG image file using the Vision model.
"""
image = encode_image(fileobj)
prompt = self.config['vision_prompt'] if prompt is None else prompt

content = [{'type':'text', 'text':prompt}, {'type':'image_url', \
'image_url': {'url':image, 'detail':self.config['vision_detail'] } }]
content = [{'type': 'text', 'text': prompt}, {'type': 'image_url', \
'image_url': {'url': image,
'detail': self.config['vision_detail']}}]

response = await self.__common_get_chat_response_vision(chat_id, content)



# functions are not available for this model

# if self.config['enable_functions']:
# response, plugins_used = await self.__handle_function_call(chat_id, response)
# if is_direct_result(response):
Expand Down Expand Up @@ -532,13 +532,12 @@ async def interpret_image_stream(self, chat_id, fileobj, prompt=None):
image = encode_image(fileobj)
prompt = self.config['vision_prompt'] if prompt is None else prompt

content = [{'type':'text', 'text':prompt}, {'type':'image_url', \
'image_url': {'url':image, 'detail':self.config['vision_detail'] } }]
content = [{'type': 'text', 'text': prompt}, {'type': 'image_url', \
'image_url': {'url': image,
'detail': self.config['vision_detail']}}]

response = await self.__common_get_chat_response_vision(chat_id, content, stream=True)



# if self.config['enable_functions']:
# response, plugins_used = await self.__handle_function_call(chat_id, response, stream=True)
# if is_direct_result(response):
Expand All @@ -557,8 +556,8 @@ async def interpret_image_stream(self, chat_id, fileobj, prompt=None):
self.__add_to_history(chat_id, role="assistant", content=answer)
tokens_used = str(self.__count_tokens(self.conversations[chat_id]))

#show_plugins_used = len(plugins_used) > 0 and self.config['show_plugins_used']
#plugin_names = tuple(self.plugin_manager.get_plugin_source_name(plugin) for plugin in plugins_used)
# show_plugins_used = len(plugins_used) > 0 and self.config['show_plugins_used']
# plugin_names = tuple(self.plugin_manager.get_plugin_source_name(plugin) for plugin in plugins_used)
if self.config['show_usage']:
answer += f"\n\n---\n💰 {tokens_used} {localized_text('stats_tokens', self.config['bot_language'])}"
# if show_plugins_used:
Expand Down Expand Up @@ -651,7 +650,12 @@ def __count_tokens(self, messages) -> int:
"""
model = self.config['model']
try:
encoding = tiktoken.encoding_for_model(model)
# TODO this is a temporary workaround until tiktoken is updated
# https://github.com/n3d1117/chatgpt-telegram-bot/issues/577
if model in GPT_4O_MODELS:
encoding = tiktoken.get_encoding("p50k_base")
else:
encoding = tiktoken.encoding_for_model(model)
except KeyError:
encoding = tiktoken.get_encoding("gpt-3.5-turbo")

Expand Down Expand Up @@ -697,15 +701,15 @@ def __count_tokens_vision(self, image_bytes: bytes) -> int:
model = self.config['vision_model']
if model not in GPT_4_VISION_MODELS:
raise NotImplementedError(f"""count_tokens_vision() is not implemented for model {model}.""")

w, h = image.size
if w > h: w, h = h, w
# this computation follows https://platform.openai.com/docs/guides/vision and https://openai.com/pricing#gpt-4-turbo
base_tokens = 85
detail = self.config['vision_detail']
if detail == 'low':
return base_tokens
elif detail == 'high' or detail == 'auto': # assuming worst cost for auto
elif detail == 'high' or detail == 'auto': # assuming worst cost for auto
f = max(w / 768, h / 2048)
if f > 1:
w, h = int(w / f), int(h / f)
Expand Down
52 changes: 41 additions & 11 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,44 @@
python-dotenv~=1.0.0
pydub~=0.25.1
tiktoken==0.7.0
annotated-types==0.7.0
anyio==4.4.0
async-timeout==4.0.3
backports.tarfile==1.2.0
Brotli==1.1.0
certifi==2024.7.4
charset-normalizer==3.3.2
click==8.1.7
distro==1.9.0
duckduckgo_search==5.3.1b1
exceptiongroup==1.2.2
gTTS==2.5.3
h11==0.14.0
h2==4.1.0
hpack==4.0.0
httpcore==1.0.5
httpx==0.27.0
hyperframe==6.0.1
idna==3.8
jaraco.context==6.0.1
more-itertools==10.4.0
openai==1.29.0
pillow==10.3.0
pydantic==2.8.2
pydantic_core==2.20.1
pydub==0.25.1
python-dotenv==1.0.1
python-telegram-bot==21.1.1
requests~=2.31.0
pytube==15.0.0
redis==5.0.8
regex==2024.7.24
requests==2.31.0
six==1.16.0
sniffio==1.3.1
socksio==1.0.0
spotipy==2.23.0
tenacity==8.3.0
wolframalpha~=5.0.0
duckduckgo_search==5.3.1b1
spotipy~=2.23.0
pytube~=15.0.0
gtts~=2.5.1
whois~=0.9.27
Pillow~=10.3.0
tiktoken==0.7.0
tqdm==4.66.5
typing_extensions==4.12.2
urllib3==2.2.2
whois==0.9.27
wolframalpha==5.0.1
xmltodict==0.13.0