Merge branch 'feature/transcribe-audio-messages'
n3d1117 committed Mar 4, 2023
2 parents 71209d6 + dd12bdd commit c164f0a
Showing 6 changed files with 95 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
/__pycache__
/.idea
.env
.DS_Store
1 change: 1 addition & 0 deletions Pipfile
@@ -7,6 +7,7 @@ name = "pypi"
requests = "*"
python-telegram-bot = "==20.1"
openai = "==0.27.0"
pydub = "==0.25.1"
python-dotenv = "*"

[dev-packages]
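
pydub is added here to convert Telegram voice notes (OGG/Opus) into MP3 before they are handed to Whisper; as the README change below notes, this may require ffmpeg to be installed. A minimal sketch of that conversion step, assuming ffmpeg is on the PATH and using an illustrative `voice.ogg` filename:

```python
# Minimal sketch of the OGG -> MP3 conversion pydub is used for.
# Assumes ffmpeg is installed and a local voice.ogg exists (illustrative names).
from pydub import AudioSegment

ogg_audio = AudioSegment.from_ogg("voice.ogg")  # decode the Telegram voice note
ogg_audio.export("voice.mp3", format="mp3")     # re-encode as MP3 for the Whisper API
```
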
18 changes: 13 additions & 5 deletions Pipfile.lock

Some generated files are not rendered by default.

4 changes: 3 additions & 1 deletion README.md
@@ -6,7 +6,7 @@
A [Telegram bot](https://core.telegram.org/bots/api) that integrates with OpenAI's _official_ [ChatGPT](https://openai.com/blog/chatgpt/) APIs to provide answers. Ready to use with minimal configuration required.

## Screenshots
![demo.pdf](https://github.com/n3d1117/chatgpt-telegram-bot/files/10876708/demo.pdf)
![demo.pdf](https://github.com/n3d1117/chatgpt-telegram-bot/files/10889253/demo.pdf)

## Features
- [x] Support markdown in answers
@@ -20,6 +20,7 @@ A [Telegram bot](https://core.telegram.org/bots/api) that integrates with OpenAI
- [x] (NEW!) See token usage after each answer
- [x] (NEW!) Multi-chat support
- [x] (NEW!) Image generation using DALL·E via the `/image` command
- [x] (NEW!) Transcribe audio messages using Whisper (may require [ffmpeg](https://ffmpeg.org))

## Additional Features - help needed!
- [ ] Group chat support
@@ -117,6 +118,7 @@ docker-compose up
## Credits
- [ChatGPT](https://chat.openai.com/chat) from [OpenAI](https://openai.com)
- [python-telegram-bot](https://python-telegram-bot.org)
- [jiaaro/pydub](https://github.com/jiaaro/pydub)

## Disclaimer
This is a personal project and is not affiliated with OpenAI in any way.
11 changes: 11 additions & 0 deletions openai_helper.py
@@ -97,6 +97,17 @@ def generate_image(self, prompt: str) -> str:
logging.exception(e)
raise e

def transcribe(self, filename):
"""
Transcribes the audio file using the Whisper model.
"""
try:
with open(filename, "rb") as audio:
result = openai.Audio.transcribe("whisper-1", audio)
return result.text
except Exception as e:
logging.exception(e)
raise e

def reset_chat_history(self, chat_id):
"""
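
The new `transcribe` helper wraps the Whisper endpoint exposed by `openai==0.27.0`. A standalone sketch of the same call, assuming `OPENAI_API_KEY` is set in the environment and an illustrative `sample.mp3` file exists:

```python
# Standalone sketch of the Whisper call wrapped by OpenAIHelper.transcribe().
# Assumes openai==0.27.0, OPENAI_API_KEY in the environment, and a local
# sample.mp3 (illustrative filename).
import os

import openai

openai.api_key = os.environ["OPENAI_API_KEY"]

with open("sample.mp3", "rb") as audio:
    result = openai.Audio.transcribe("whisper-1", audio)

print(result.text)  # the transcription returned by the API
```
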
66 changes: 66 additions & 0 deletions telegram_bot.py
@@ -1,10 +1,12 @@
import logging
import os

import telegram.constants as constants
from telegram import Update
from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, MessageHandler, filters

from openai_helper import OpenAIHelper
from pydub import AudioSegment


class ChatGPT3TelegramBot:
@@ -43,6 +45,7 @@ async def reset(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
return

logging.info(f'Resetting the conversation for user {update.message.from_user.name}...')

chat_id = update.effective_chat.id
self.openai.reset_chat_history(chat_id=chat_id)
await context.bot.send_message(chat_id=chat_id, text='Done!')
@@ -56,6 +59,8 @@ async def image(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
await self.send_disallowed_message(update, context)
return

logging.info(f'New image generation request received from user {update.message.from_user.name}')

chat_id = update.effective_chat.id
image_query = update.message.text.replace('/image', '').strip()
if image_query == '':
@@ -77,6 +82,66 @@ async def image(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
text='Failed to generate image'
)

async def transcribe(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
"""
Transcribe audio messages.
"""
if not self.is_allowed(update):
logging.warning(f'User {update.message.from_user.name} is not allowed to transcribe audio messages')
await self.send_disallowed_message(update, context)
return

if not update.message.voice and not update.message.audio:
await context.bot.send_message(
chat_id=update.effective_chat.id,
reply_to_message_id=update.message.message_id,
text='Unsupported file type'
)
return

logging.info(f'New transcribe request received from user {update.message.from_user.name}')

chat_id = update.effective_chat.id
await context.bot.send_chat_action(chat_id=chat_id, action=constants.ChatAction.TYPING)
filename = update.message.voice.file_unique_id if update.message.voice else update.message.audio.file_unique_id
filename_ogg = f'{filename}.ogg'
filename_mp3 = f'{filename}.mp3'

try:
if update.message.voice:
audio_file = await context.bot.get_file(update.message.voice.file_id)
await audio_file.download_to_drive(filename_ogg)
ogg_audio = AudioSegment.from_ogg(filename_ogg)
ogg_audio.export(filename_mp3, format="mp3")

elif update.message.audio:
audio_file = await context.bot.get_file(update.message.audio.file_id)
await audio_file.download_to_drive(filename_mp3)

# Transcribe the audio file
transcript = self.openai.transcribe(filename_mp3)

# Send the transcript
await context.bot.send_message(
chat_id=chat_id,
reply_to_message_id=update.message.message_id,
text=transcript,
parse_mode=constants.ParseMode.MARKDOWN
)
except Exception:
await context.bot.send_message(
chat_id=chat_id,
reply_to_message_id=update.message.message_id,
text='Failed to transcribe audio'
)

finally:
# Cleanup files
if os.path.exists(filename_mp3):
os.remove(filename_mp3)
if os.path.exists(filename_ogg):
os.remove(filename_ogg)

async def prompt(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
"""
React to incoming messages and respond accordingly.
@@ -136,6 +201,7 @@ def run(self):
application.add_handler(CommandHandler('help', self.help))
application.add_handler(CommandHandler('image', self.image))
application.add_handler(CommandHandler('start', self.help))
application.add_handler(MessageHandler(filters.VOICE | filters.AUDIO, self.transcribe))
application.add_handler(MessageHandler(filters.TEXT & (~filters.COMMAND), self.prompt))

application.add_error_handler(self.error_handler)
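
With the new `MessageHandler`, voice and audio messages are routed to `transcribe`, while non-command text messages still go to `prompt`. A stripped-down sketch of that wiring, assuming `python-telegram-bot==20.1` and an illustrative `TELEGRAM_BOT_TOKEN` environment variable, with the allow-list check and OpenAI calls omitted:

```python
# Stripped-down sketch of the handler wiring; the real bot also checks an
# allow-list and calls OpenAIHelper. Assumes python-telegram-bot==20.1 and an
# illustrative TELEGRAM_BOT_TOKEN environment variable.
import os

from telegram import Update
from telegram.ext import ApplicationBuilder, ContextTypes, MessageHandler, filters


async def transcribe(update: Update, context: ContextTypes.DEFAULT_TYPE):
    # Placeholder: the real handler downloads the file, converts OGG to MP3
    # with pydub, and sends it to Whisper via OpenAIHelper.transcribe().
    await update.message.reply_text('Got a voice/audio message')


async def prompt(update: Update, context: ContextTypes.DEFAULT_TYPE):
    await update.message.reply_text('Got a text message')


if __name__ == '__main__':
    application = ApplicationBuilder().token(os.environ['TELEGRAM_BOT_TOKEN']).build()
    # Voice notes and audio files go to the transcribe handler...
    application.add_handler(MessageHandler(filters.VOICE | filters.AUDIO, transcribe))
    # ...while plain (non-command) text keeps going to the prompt handler.
    application.add_handler(MessageHandler(filters.TEXT & (~filters.COMMAND), prompt))
    application.run_polling()
```
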
