Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecate gpt-4-vision-preview model #630

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,4 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
# TTS_PRICES=0.015,0.030
# BOT_LANGUAGE=en
# ENABLE_VISION_FOLLOW_UP_QUESTIONS="true"
# VISION_MODEL="gpt-4-vision-preview"
# VISION_MODEL="gpt-4o"
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ Check out the [Budget Manual](https://github.com/n3d1117/chatgpt-telegram-bot/di
| `SHOW_USAGE` | Whether to show OpenAI token usage information after each response | `false` |
| `STREAM` | Whether to stream responses. **Note**: incompatible, if enabled, with `N_CHOICES` higher than 1 | `true` |
| `MAX_TOKENS` | Upper bound on how many tokens the ChatGPT API will return | `1200` for GPT-3, `2400` for GPT-4 |
| `VISION_MAX_TOKENS` | Upper bound on how many tokens vision models will return | `300` for gpt-4-vision-preview |
| `VISION_MODEL` | The Vision to Speech model to use. Allowed values: `gpt-4-vision-preview` | `gpt-4-vision-preview` |
| `VISION_MAX_TOKENS` | Upper bound on how many tokens vision models will return | `300` for gpt-4o |
| `VISION_MODEL` | The vision model to use for image interpretation. Allowed values: `gpt-4o` | `gpt-4o` |
| `ENABLE_VISION_FOLLOW_UP_QUESTIONS` | If true, once you send an image to the bot, it uses the configured VISION_MODEL until the conversation ends. Otherwise, it uses the OPENAI_MODEL to follow the conversation. Allowed values: `true` or `false` | `true` |
| `MAX_HISTORY_SIZE` | Max number of messages to keep in memory, after which the conversation will be summarised to avoid excessive token usage | `15` |
| `MAX_CONVERSATION_AGE_MINUTES` | Maximum number of minutes a conversation should live since the last message, after which the conversation will be reset | `180` |
Expand Down
2 changes: 1 addition & 1 deletion bot/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def main():
'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
'show_plugins_used': os.environ.get('SHOW_PLUGINS_USED', 'false').lower() == 'true',
'whisper_prompt': os.environ.get('WHISPER_PROMPT', ''),
'vision_model': os.environ.get('VISION_MODEL', 'gpt-4-vision-preview'),
'vision_model': os.environ.get('VISION_MODEL', 'gpt-4o'),
'enable_vision_follow_up_questions': os.environ.get('ENABLE_VISION_FOLLOW_UP_QUESTIONS', 'true').lower() == 'true',
'vision_prompt': os.environ.get('VISION_PROMPT', 'What is in this image'),
'vision_detail': os.environ.get('VISION_DETAIL', 'auto'),
Expand Down
58 changes: 31 additions & 27 deletions bot/openai_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@
GPT_3_16K_MODELS = ("gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-0125")
GPT_4_MODELS = ("gpt-4", "gpt-4-0314", "gpt-4-0613", "gpt-4-turbo-preview")
GPT_4_32K_MODELS = ("gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-0613")
GPT_4_VISION_MODELS = ("gpt-4-vision-preview",)
GPT_4_128K_MODELS = ("gpt-4-1106-preview","gpt-4-0125-preview","gpt-4-turbo-preview", "gpt-4-turbo", "gpt-4-turbo-2024-04-09")
GPT_4_VISION_MODELS = ("gpt-4o",)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gpt-4o-mini is also recommended here

GPT_4_128K_MODELS = (
"gpt-4-1106-preview", "gpt-4-0125-preview", "gpt-4-turbo-preview", "gpt-4-turbo", "gpt-4-turbo-2024-04-09")
GPT_4O_MODELS = ("gpt-4o",)
GPT_ALL_MODELS = GPT_3_MODELS + GPT_3_16K_MODELS + GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS + GPT_4_128K_MODELS + GPT_4O_MODELS

Expand Down Expand Up @@ -64,13 +65,13 @@ def are_functions_available(model: str) -> bool:
if model in ("gpt-3.5-turbo-0301", "gpt-4-0314", "gpt-4-32k-0314"):
return False
# Stable models will be updated to support functions on June 27, 2023
if model in ("gpt-3.5-turbo", "gpt-3.5-turbo-1106", "gpt-4", "gpt-4-32k","gpt-4-1106-preview","gpt-4-0125-preview","gpt-4-turbo-preview"):
if model in (
"gpt-3.5-turbo", "gpt-3.5-turbo-1106", "gpt-4", "gpt-4-32k", "gpt-4-1106-preview", "gpt-4-0125-preview",
"gpt-4-turbo-preview", "gpt-4o"):
return datetime.date.today() > datetime.date(2023, 6, 27)
# Models gpt-3.5-turbo-0613 and gpt-3.5-turbo-16k-0613 will be deprecated on June 13, 2024
if model in ("gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613"):
return datetime.date.today() < datetime.date(2024, 6, 13)
if model == 'gpt-4-vision-preview':
return False
return True


Expand Down Expand Up @@ -249,7 +250,8 @@ async def __common_get_chat_response(self, chat_id: int, query: str, stream=Fals
self.conversations[chat_id] = self.conversations[chat_id][-self.config['max_history_size']:]

common_args = {
'model': self.config['model'] if not self.conversations_vision[chat_id] else self.config['vision_model'],
'model': self.config['model'] if not self.conversations_vision[chat_id] else self.config[
'vision_model'],
'messages': self.conversations[chat_id],
'temperature': self.config['temperature'],
'n': self.config['n_choices'],
Expand Down Expand Up @@ -385,7 +387,8 @@ async def transcribe(self, filename):
try:
with open(filename, "rb") as audio:
prompt_text = self.config['whisper_prompt']
result = await self.client.audio.transcriptions.create(model="whisper-1", file=audio, prompt=prompt_text)
result = await self.client.audio.transcriptions.create(model="whisper-1", file=audio,
prompt=prompt_text)
return result.text
except Exception as e:
logging.exception(e)
Expand Down Expand Up @@ -429,7 +432,7 @@ async def __common_get_chat_response_vision(self, chat_id: int, content: list, s
if exceeded_max_tokens or exceeded_max_history_size:
logging.info(f'Chat history for chat ID {chat_id} is too long. Summarising...')
try:

last = self.conversations[chat_id][-1]
summary = await self.__summarise(self.conversations[chat_id][:-1])
logging.debug(f'Summary: {summary}')
Expand All @@ -440,28 +443,27 @@ async def __common_get_chat_response_vision(self, chat_id: int, content: list, s
logging.warning(f'Error while summarising chat history: {str(e)}. Popping elements instead...')
self.conversations[chat_id] = self.conversations[chat_id][-self.config['max_history_size']:]

message = {'role':'user', 'content':content}
message = {'role': 'user', 'content': content}

common_args = {
'model': self.config['vision_model'],
'messages': self.conversations[chat_id][:-1] + [message],
'temperature': self.config['temperature'],
'n': 1, # several choices is not implemented yet
'n': 1, # several choices is not implemented yet
'max_tokens': self.config['vision_max_tokens'],
'presence_penalty': self.config['presence_penalty'],
'frequency_penalty': self.config['frequency_penalty'],
'stream': stream
}


# vision model does not yet support functions

# if self.config['enable_functions']:
# functions = self.plugin_manager.get_functions_specs()
# if len(functions) > 0:
# common_args['functions'] = self.plugin_manager.get_functions_specs()
# common_args['function_call'] = 'auto'

return await self.client.chat.completions.create(**common_args)

except openai.RateLimitError as e:
Expand All @@ -473,23 +475,21 @@ async def __common_get_chat_response_vision(self, chat_id: int, content: list, s
except Exception as e:
raise Exception(f"⚠️ _{localized_text('error', bot_language)}._ ⚠️\n{str(e)}") from e


async def interpret_image(self, chat_id, fileobj, prompt=None):
"""
Interprets a given PNG image file using the Vision model.
"""
image = encode_image(fileobj)
prompt = self.config['vision_prompt'] if prompt is None else prompt

content = [{'type':'text', 'text':prompt}, {'type':'image_url', \
'image_url': {'url':image, 'detail':self.config['vision_detail'] } }]
content = [{'type': 'text', 'text': prompt}, {'type': 'image_url', \
'image_url': {'url': image,
'detail': self.config['vision_detail']}}]

response = await self.__common_get_chat_response_vision(chat_id, content)



# functions are not available for this model

# if self.config['enable_functions']:
# response, plugins_used = await self.__handle_function_call(chat_id, response)
# if is_direct_result(response):
Expand Down Expand Up @@ -532,13 +532,12 @@ async def interpret_image_stream(self, chat_id, fileobj, prompt=None):
image = encode_image(fileobj)
prompt = self.config['vision_prompt'] if prompt is None else prompt

content = [{'type':'text', 'text':prompt}, {'type':'image_url', \
'image_url': {'url':image, 'detail':self.config['vision_detail'] } }]
content = [{'type': 'text', 'text': prompt}, {'type': 'image_url', \
'image_url': {'url': image,
'detail': self.config['vision_detail']}}]

response = await self.__common_get_chat_response_vision(chat_id, content, stream=True)



# if self.config['enable_functions']:
# response, plugins_used = await self.__handle_function_call(chat_id, response, stream=True)
# if is_direct_result(response):
Expand All @@ -557,8 +556,8 @@ async def interpret_image_stream(self, chat_id, fileobj, prompt=None):
self.__add_to_history(chat_id, role="assistant", content=answer)
tokens_used = str(self.__count_tokens(self.conversations[chat_id]))

#show_plugins_used = len(plugins_used) > 0 and self.config['show_plugins_used']
#plugin_names = tuple(self.plugin_manager.get_plugin_source_name(plugin) for plugin in plugins_used)
# show_plugins_used = len(plugins_used) > 0 and self.config['show_plugins_used']
# plugin_names = tuple(self.plugin_manager.get_plugin_source_name(plugin) for plugin in plugins_used)
if self.config['show_usage']:
answer += f"\n\n---\n💰 {tokens_used} {localized_text('stats_tokens', self.config['bot_language'])}"
# if show_plugins_used:
Expand Down Expand Up @@ -651,7 +650,12 @@ def __count_tokens(self, messages) -> int:
"""
model = self.config['model']
try:
encoding = tiktoken.encoding_for_model(model)
# TODO this is a temporary workaround until tiktoken is updated
# https://github.com/n3d1117/chatgpt-telegram-bot/issues/577
if model in GPT_4O_MODELS:
encoding = tiktoken.get_encoding("p50k_base")
else:
encoding = tiktoken.encoding_for_model(model)
except KeyError:
encoding = tiktoken.get_encoding("gpt-3.5-turbo")

Expand Down Expand Up @@ -697,15 +701,15 @@ def __count_tokens_vision(self, image_bytes: bytes) -> int:
model = self.config['vision_model']
if model not in GPT_4_VISION_MODELS:
raise NotImplementedError(f"""count_tokens_vision() is not implemented for model {model}.""")

w, h = image.size
if w > h: w, h = h, w
# this computation follows https://platform.openai.com/docs/guides/vision and https://openai.com/pricing#gpt-4-turbo
base_tokens = 85
detail = self.config['vision_detail']
if detail == 'low':
return base_tokens
elif detail == 'high' or detail == 'auto': # assuming worst cost for auto
elif detail == 'high' or detail == 'auto': # assuming worst cost for auto
f = max(w / 768, h / 2048)
if f > 1:
w, h = int(w / f), int(h / f)
Expand Down
52 changes: 41 additions & 11 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,44 @@
python-dotenv~=1.0.0
pydub~=0.25.1
tiktoken==0.7.0
annotated-types==0.7.0
anyio==4.4.0
async-timeout==4.0.3
backports.tarfile==1.2.0
Brotli==1.1.0
certifi==2024.7.4
charset-normalizer==3.3.2
click==8.1.7
distro==1.9.0
duckduckgo_search==5.3.1b1
exceptiongroup==1.2.2
gTTS==2.5.3
h11==0.14.0
h2==4.1.0
hpack==4.0.0
httpcore==1.0.5
httpx==0.27.0
hyperframe==6.0.1
idna==3.8
jaraco.context==6.0.1
more-itertools==10.4.0
openai==1.29.0
pillow==10.3.0
pydantic==2.8.2
pydantic_core==2.20.1
pydub==0.25.1
python-dotenv==1.0.1
python-telegram-bot==21.1.1
requests~=2.31.0
pytube==15.0.0
redis==5.0.8
regex==2024.7.24
requests==2.31.0
six==1.16.0
sniffio==1.3.1
socksio==1.0.0
spotipy==2.23.0
tenacity==8.3.0
wolframalpha~=5.0.0
duckduckgo_search==5.3.1b1
spotipy~=2.23.0
pytube~=15.0.0
gtts~=2.5.1
whois~=0.9.27
Pillow~=10.3.0
tiktoken==0.7.0
tqdm==4.66.5
typing_extensions==4.12.2
urllib3==2.2.2
whois==0.9.27
wolframalpha==5.0.1
xmltodict==0.13.0