diff --git a/.gitignore b/.gitignore
index 11010835..2bd7f502 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 /__pycache__
 /.idea
 .env
+.DS_Store
diff --git a/Pipfile b/Pipfile
index 6b39d179..0b032258 100644
--- a/Pipfile
+++ b/Pipfile
@@ -7,6 +7,7 @@ name = "pypi"
 requests = "*"
 python-telegram-bot = "==20.1"
 openai = "==0.27.0"
+pydub = "==0.25.1"
 python-dotenv = "*"
 
 [dev-packages]
diff --git a/Pipfile.lock b/Pipfile.lock
index a0d5db29..a03b4f3c 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "30100f4525fcc93a8d88c282ae4624029444c4a0cdb683d869b4d5e52247aa1d"
+            "sha256": "3c6e3cca04e7c3e92b21cf609f3ed45032f55fdb7a4b211bbfec93d090e7e162"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -469,6 +469,14 @@
             "index": "pypi",
             "version": "==0.27.0"
         },
+        "pydub": {
+            "hashes": [
+                "sha256:65617e33033874b59d87db603aa1ed450633288aefead953b30bded59cb599a6",
+                "sha256:980a33ce9949cab2a569606b65674d748ecbca4f0796887fd6f46173a7b0d30f"
+            ],
+            "index": "pypi",
+            "version": "==0.25.1"
+        },
         "python-dotenv": {
             "hashes": [
                 "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba",
@@ -513,11 +521,11 @@
         },
         "tqdm": {
             "hashes": [
-                "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4",
-                "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"
+                "sha256:1871fb68a86b8fb3b59ca4cdd3dcccbc7e6d613eeed31f4c332531977b89beb5",
+                "sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671"
             ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==4.64.1"
+            "markers": "python_version >= '3.7'",
+            "version": "==4.65.0"
         },
         "urllib3": {
             "hashes": [
diff --git a/README.md b/README.md
index 74555553..78d5f52a 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 A [Telegram bot](https://core.telegram.org/bots/api) that integrates with OpenAI's _official_ [ChatGPT](https://openai.com/blog/chatgpt/) APIs to provide answers. Ready to use with minimal configuration required.
 
 ## Screenshots
-![demo.pdf](https://github.com/n3d1117/chatgpt-telegram-bot/files/10876708/demo.pdf)
+![demo.pdf](https://github.com/n3d1117/chatgpt-telegram-bot/files/10889253/demo.pdf)
 
 ## Features
 - [x] Support markdown in answers
@@ -20,6 +20,7 @@ A [Telegram bot](https://core.telegram.org/bots/api) that integrates with OpenAI
 - [x] (NEW!) See token usage after each answer
 - [x] (NEW!) Multi-chat support
 - [x] (NEW!) Image generation using DALL·E via the `/image` command
+- [x] (NEW!) Transcribe audio messages using Whisper (may require [ffmpeg](https://ffmpeg.org))
 
 ## Additional Features - help needed!
 - [ ] Group chat support
@@ -117,6 +118,7 @@ docker-compose up
 ## Credits
 - [ChatGPT](https://chat.openai.com/chat) from [OpenAI](https://openai.com)
 - [python-telegram-bot](https://python-telegram-bot.org)
+- [jiaaro/pydub](https://github.com/jiaaro/pydub)
 
 ## Disclaimer
 This is a personal project and is not affiliated with OpenAI in any way.
diff --git a/openai_helper.py b/openai_helper.py
index 0f95b17e..69808661 100644
--- a/openai_helper.py
+++ b/openai_helper.py
@@ -97,6 +97,17 @@ def generate_image(self, prompt: str) -> str:
             logging.exception(e)
             raise e
 
+    def transcribe(self, filename):
+        """
+        Transcribes the audio file using the Whisper model.
+ """ + try: + with open(filename, "rb") as audio: + result = openai.Audio.transcribe("whisper-1", audio) + return result.text + except Exception as e: + logging.exception(e) + raise e def reset_chat_history(self, chat_id): """ diff --git a/telegram_bot.py b/telegram_bot.py index b1a3952d..40a47ebc 100644 --- a/telegram_bot.py +++ b/telegram_bot.py @@ -1,10 +1,12 @@ import logging +import os import telegram.constants as constants from telegram import Update from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, MessageHandler, filters from openai_helper import OpenAIHelper +from pydub import AudioSegment class ChatGPT3TelegramBot: @@ -43,6 +45,7 @@ async def reset(self, update: Update, context: ContextTypes.DEFAULT_TYPE): return logging.info(f'Resetting the conversation for user {update.message.from_user.name}...') + chat_id = update.effective_chat.id self.openai.reset_chat_history(chat_id=chat_id) await context.bot.send_message(chat_id=chat_id, text='Done!') @@ -56,6 +59,8 @@ async def image(self, update: Update, context: ContextTypes.DEFAULT_TYPE): await self.send_disallowed_message(update, context) return + logging.info(f'New image generation request received from user {update.message.from_user.name}') + chat_id = update.effective_chat.id image_query = update.message.text.replace('/image', '').strip() if image_query == '': @@ -77,6 +82,66 @@ async def image(self, update: Update, context: ContextTypes.DEFAULT_TYPE): text='Failed to generate image' ) + async def transcribe(self, update: Update, context: ContextTypes.DEFAULT_TYPE): + """ + Transcribe audio messages. + """ + if not self.is_allowed(update): + logging.warning(f'User {update.message.from_user.name} is not allowed to transcribe audio messages') + await self.send_disallowed_message(update, context) + return + + if not update.message.voice and not update.message.audio: + await context.bot.send_message( + chat_id=update.effective_chat.id, + reply_to_message_id=update.message.message_id, + text='Unsupported file type' + ) + return + + logging.info(f'New transcribe request received from user {update.message.from_user.name}') + + chat_id = update.effective_chat.id + await context.bot.send_chat_action(chat_id=chat_id, action=constants.ChatAction.TYPING) + filename = update.message.voice.file_unique_id if update.message.voice else update.message.audio.file_unique_id + filename_ogg = f'{filename}.ogg' + filename_mp3 = f'{filename}.mp3' + + try: + if update.message.voice: + audio_file = await context.bot.get_file(update.message.voice.file_id) + await audio_file.download_to_drive(filename_ogg) + ogg_audio = AudioSegment.from_ogg(filename_ogg) + ogg_audio.export(filename_mp3, format="mp3") + + elif update.message.audio: + audio_file = await context.bot.get_file(update.message.audio.file_id) + await audio_file.download_to_drive(filename_mp3) + + # Transcribe the audio file + transcript = self.openai.transcribe(filename_mp3) + + # Send the transcript + await context.bot.send_message( + chat_id=chat_id, + reply_to_message_id=update.message.message_id, + text=transcript, + parse_mode=constants.ParseMode.MARKDOWN + ) + except: + await context.bot.send_message( + chat_id=chat_id, + reply_to_message_id=update.message.message_id, + text='Failed to transcribe text' + ) + + finally: + # Cleanup files + if os.path.exists(filename_mp3): + os.remove(filename_mp3) + if os.path.exists(filename_ogg): + os.remove(filename_ogg) + async def prompt(self, update: Update, context: ContextTypes.DEFAULT_TYPE): """ React to incoming messages 
@@ -136,6 +201,7 @@ def run(self):
         application.add_handler(CommandHandler('help', self.help))
         application.add_handler(CommandHandler('image', self.image))
         application.add_handler(CommandHandler('start', self.help))
+        application.add_handler(MessageHandler(filters.VOICE | filters.AUDIO, self.transcribe))
         application.add_handler(MessageHandler(filters.TEXT & (~filters.COMMAND), self.prompt))
 
         application.add_error_handler(self.error_handler)
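
For reviewers, here is a minimal standalone sketch of the voice-note pipeline this diff wires into the bot: pydub (which shells out to ffmpeg, hence the README note) converts a Telegram OGG/Opus voice note to MP3, which is then sent to the Whisper endpoint via the pinned `openai==0.27.0` API. The helper name `transcribe_ogg` and the input file `voice_note.ogg` are hypothetical, for illustration only; it assumes ffmpeg is on PATH and `OPENAI_API_KEY` is set in the environment.

```python
import os

import openai
from pydub import AudioSegment

# Assumption: the API key is exported in the environment,
# as the bot does via python-dotenv.
openai.api_key = os.environ["OPENAI_API_KEY"]


def transcribe_ogg(filename_ogg: str) -> str:
    """Convert an OGG/Opus voice note to MP3 and transcribe it with Whisper."""
    filename_mp3 = os.path.splitext(filename_ogg)[0] + '.mp3'
    try:
        # pydub decodes the OGG container through ffmpeg and re-encodes
        # as MP3, one of the formats the Whisper API accepts
        AudioSegment.from_ogg(filename_ogg).export(filename_mp3, format="mp3")
        with open(filename_mp3, "rb") as audio:
            result = openai.Audio.transcribe("whisper-1", audio)
        return result.text
    finally:
        # Mirror the handler's cleanup: never leave the temporary MP3 behind
        if os.path.exists(filename_mp3):
            os.remove(filename_mp3)


if __name__ == '__main__':
    # 'voice_note.ogg' is a placeholder; in the bot the file is downloaded
    # from Telegram via context.bot.get_file(...).download_to_drive(...)
    print(transcribe_ogg('voice_note.ogg'))
```

Note the design choice reflected in both the sketch and the handler: voice notes are converted to MP3 before upload because Telegram sends them as OGG/Opus, which the Whisper endpoint of this API version does not reliably accept, while regular audio attachments are passed through unconverted.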