Use elevenlabs (#5)

* add firestore_sdk and session_manager * save user chats on firestore * pass down session_id for a deterministic workflow * handle conversion of chat object to/from a dict * remove references to langchain * reuse a previously downloaded audiofile if it's processable * render audiocast metadata on share page * cleanup * temp remove audio_enchancement * sanitize audiocast transcript * add elevenlabs client * add __text_to_speech_elevenlabs; cleanup * use dry in text_to_speech * only lint on python versions 3.11 and 3.12 * add write permission to deploy job for marocchino/sticky-pull-request-comment * use eleven_multilingual_v2 model for improved stability, accuracy and quality
nwaughachukwuma · Oct 31, 2024 · e8b0f4f · e8b0f4f
1 parent 82db371
commit e8b0f4f
Show file tree

Hide file tree

Showing 11 changed files with 219 additions and 64 deletions.
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -48,8 +48,9 @@ jobs:
     timeout-minutes: 5
     steps:
       - uses: actions/checkout@v4
-      - id: setup-python
-        uses: actions/setup-python@v5
+        with:
+          fetch-depth: 0
+      - uses: actions/setup-python@v5
         with:
           python-version: "3.12"
           cache: "pip" # caching pip dependencies
@@ -67,8 +68,12 @@ jobs:
     runs-on: ubuntu-latest
     needs: [prepare, lint]
     timeout-minutes: 10
+    permissions: 
+      pull-requests: write 
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       - uses: actions/setup-python@v5
         with:
           python-version: "3.12"
@@ -104,9 +109,9 @@ jobs:
       - run: curl -f "${{ steps.deploy.outputs.url }}"
       - uses: marocchino/sticky-pull-request-comment@v2
         with:
-          header: app
+          header: audiora
           message: |
-            app: ${{ steps.deploy.outputs.url }} (${{ github.event.pull_request.head.sha }})
+            audiora: ${{ steps.deploy.outputs.url }} (${{ github.event.pull_request.head.sha }})
 
   promote:
     runs-on: ubuntu-latest

diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10", "3.11", "3.12"]
+        python-version: ["3.11", "3.12"]
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}

diff --git a/pages/audiocast.py b/pages/audiocast.py
@@ -6,6 +6,7 @@
 
 from src.env_var import APP_URL
 from src.utils.main_utils import get_audiocast
+from src.utils.render_audiocast import parse_ai_script
 
 
 def navigate_to_home():
@@ -34,7 +35,7 @@ async def render_audiocast_page():
 
                 # Transcript
                 with st.expander("Show Transcript"):
-                    st.write(audiocast["script"])
+                    st.markdown(parse_ai_script(audiocast["script"]))
 
                 # Metadata
                 st.sidebar.subheader("Audiocast Source")

diff --git a/requirements.txt b/requirements.txt
@@ -4,6 +4,7 @@ asyncio
 
 openai
 anthropic
+elevenlabs
 
 pyperclip
 python-multipart

diff --git a/src/services/elevenlabs_client.py b/src/services/elevenlabs_client.py
@@ -0,0 +1,11 @@
+from elevenlabs.client import ElevenLabs
+
+from src.env_var import ELEVENLABS_API_KEY
+
+client = ElevenLabs(
+    api_key=ELEVENLABS_API_KEY,
+)
+
+
+def get_elevenlabs_client():  # accessor for the shared ElevenLabs client of this module
+    return client  # module-level instance, built at import time with ELEVENLABS_API_KEY
diff --git a/src/utils/audio_manager.py b/src/utils/audio_manager.py
@@ -4,16 +4,16 @@
 import re
 import uuid
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 from src.utils.audio_manager_utils import (
     AudioManagerConfig,
     AudioManagerSpeechGenerator,
     ContentSplitter,
-    openai_voices,
 )
 from src.utils.audio_synthesizer import AudioSynthesizer
 from src.utils.clean_tss_markup import clean_tss_markup
+from src.utils.generate_speech_utils import elevenlabs_voices, openai_voices
 
 logger = logging.getLogger(__name__)
 
@@ -42,7 +42,6 @@ async def generate_speech(self, audio_script: str):
         """
         output_file = f"{self.config.outdir_base}/{str(uuid.uuid4())}.mp3"
         await self.text_to_speech(audio_script, output_file)
-
         return output_file
 
     async def text_to_speech(self, audio_script: str, output_file: str):
@@ -56,33 +55,48 @@ async def text_to_speech(self, audio_script: str, output_file: str):
         """
         tags = self._get_tags(audio_script)
         audio_script = clean_tss_markup(audio_script, tags)
+        nway_content = self.split_content(audio_script, tags)
+
+        print(f"nway_content: {nway_content}")
 
         if self.config.tts_provider == "openai":
-            return await self.__text_to_speech_openai(audio_script, output_file, tags)
+            audio_files = await self.__text_to_speech_openai(nway_content, tags)
+        elif self.config.tts_provider == "elevenlabs":
+            audio_files = await self.__text_to_speech_elevenlabs(nway_content, tags)
         else:
             raise Exception("Invalid TTS model specified")
 
+        if not audio_files:
+            raise Exception("No audio files were generated")
+
+        await self.__finalize(audio_files, output_file)
+        logger.info(f"Audio saved to {output_file}")
+
     async def __text_to_speech_openai(
-        self, audio_script: str, output_file: str, tags: List[str]
-    ):
+        self, nway_content: List[Tuple[str, str]], tags: List[str]
+    ) -> List[str]:
         try:
-            nway_content = self.split_content(audio_script, tags)
-            print(f"nway_content: {nway_content}")
-
             jobs = self._prepare_speech_jobs(
                 nway_content, tags, openai_voices, self.config.temp_audio_dir
             )
 
-            audio_files = await self._process_speech_jobs(jobs)
-            if not audio_files:
-                raise Exception("No audio files were generated")
-
-            await self.__finalize(audio_files, output_file)
-            logger.info(f"Audio saved to {output_file}")
-
+            return await self._process_speech_jobs(jobs, provider="openai")
         except Exception as e:
             raise Exception(f"Error converting text to speech with OpenAI: {str(e)}")
 
+    async def __text_to_speech_elevenlabs(  # ElevenLabs counterpart of __text_to_speech_openai
+        self, nway_content: List[Tuple[str, str]], tags: List[str]
+    ) -> List[str]:  # whatever _process_speech_jobs returns for provider="elevenlabs"
+        try:
+            jobs = self._prepare_speech_jobs(  # build the SpeechJob list using the ElevenLabs voice names
+                nway_content, tags, elevenlabs_voices, self.config.temp_audio_dir
+            )
+            return await self._process_speech_jobs(jobs, provider="elevenlabs")
+        except Exception as e:  # re-raise any failure with a provider-specific message
+            raise Exception(
+                f"Error converting text to speech with Elevenlabs: {str(e)}"
+            )
+
     async def __finalize(
         self, audio_files: List[str], output_file: str, enhance_audio=False
     ) -> None:

diff --git a/src/utils/audio_manager_utils.py b/src/utils/audio_manager_utils.py
@@ -6,26 +6,20 @@
 from functools import partial
 from itertools import cycle, islice
 from pathlib import Path
-from typing import Any, List, Literal, Optional, Tuple
+from typing import Any, List, Optional, Tuple
 
-from src.services.openai_client import get_openai
-
-OpenaiVoice = Literal["onyx", "shimmer", "echo", "nova", "alloy"]
-openai_voices: List[OpenaiVoice] = ["onyx", "shimmer", "echo", "nova", "alloy"]
-
-
-@dataclass
-class SpeechJob:
-    content: str
-    voice: OpenaiVoice
-    output_file: str
-    tag: str
-    index: int
+from src.utils.generate_speech_utils import (
+    ElevenLabsVoice,
+    GenerateSpeech,
+    OpenaiVoice,
+    SpeechJob,
+    TTSProvider,
+)
 
 
 @dataclass
 class AudioManagerConfig:
-    tts_provider: Optional[Literal["openai"]] = "openai"
+    tts_provider: Optional[TTSProvider] = "openai"
     temp_audio_dir: str = field(default_factory=lambda: "/tmp/audiocast")
     outdir_base: str = field(default_factory=lambda: "/tmp/audiocast/output")
 
@@ -48,7 +42,7 @@ def _prepare_speech_jobs(
         self,
         nway_content: List[Tuple[str, str]],
         tags: List[str],
-        voices: List[Any],
+        voices: List[OpenaiVoice] | List[ElevenLabsVoice],
         temp_audio_dir: str,
     ):
         jobs: List[SpeechJob] = []
@@ -73,27 +67,14 @@ def _prepare_speech_jobs(
 
         return jobs
 
-    def _generate_speech(self, job: SpeechJob) -> str:
-        try:
-            response = get_openai().audio.speech.create(
-                input=job.content,
-                model="tts-1-hd",
-                voice=job.voice,
-            )
-
-            with open(job.output_file, "wb") as file:
-                file.write(response.content)
-
-            print(f"Generated speech for tag {job.tag} at index {job.index}")
-            return job.output_file
-        except Exception as e:
-            print(f"Failed to generate speech for tag {job.tag}: {str(e)}")
-            return ""
-
-    async def _process_speech_jobs(self, jobs: List[SpeechJob]) -> List[str]:
+    async def _process_speech_jobs(
+        self, jobs: List[SpeechJob], provider: TTSProvider
+    ) -> List[str]:
         loop = asyncio.get_event_loop()
         tasks = [
-            loop.run_in_executor(self.executor, partial(self._generate_speech, job))
+            loop.run_in_executor(
+                self.executor, partial(GenerateSpeech(provider).run, job)
+            )
             for job in jobs
         ]
 

diff --git a/src/utils/decorators.py b/src/utils/decorators.py
@@ -0,0 +1,36 @@
+import asyncio
+from functools import wraps
+from time import time
+
+
+def process_time():
+    """Decorator factory: the returned decorator wraps a sync or async function and prints its wall-clock execution time."""
+
+    def decorator(func):
+        if asyncio.iscoroutinefunction(func):  # coroutine functions need a wrapper that awaits them
+
+            @wraps(func)
+            async def async_wrapper(*args, **kwargs):
+                start_time = time()
+                response = await func(*args, **kwargs)
+
+                time_diff = f"{(time() - start_time):.2f}s"  # elapsed seconds, two decimals
+                print(f"Execution time for {func.__name__}: {time_diff}")
+
+                return response
+
+            return async_wrapper
+
+        @wraps(func)
+        def wrapper(*args, **kwargs):  # synchronous counterpart of async_wrapper
+            start_time = time()
+            response = func(*args, **kwargs)
+
+            time_diff = f"{(time() - start_time):.2f}s"
+            print(f"Execution time for {func.__name__}: {time_diff}")  # not reached when func raises
+
+            return response
+
+        return wrapper
+
+    return decorator
diff --git a/src/utils/generate_speech_utils.py b/src/utils/generate_speech_utils.py
@@ -0,0 +1,102 @@
+from dataclasses import dataclass
+from io import BytesIO
+from typing import Dict, List, Literal
+
+from src.services.elevenlabs_client import get_elevenlabs_client
+from src.services.openai_client import get_openai
+from src.utils.decorators import process_time
+
+TTSProvider = Literal["openai", "elevenlabs"]
+
+OpenaiVoice = Literal["onyx", "shimmer", "echo", "nova", "alloy"]
+openai_voices: List[OpenaiVoice] = ["onyx", "shimmer", "echo", "nova", "alloy"]
+
+ElevenLabsVoice = Literal[
+    "Adam", "Sarah", "Laura", "Charlie", "George", "Charlotte", "Liam"
+]
+elevenlabs_voices: List[ElevenLabsVoice] = [
+    "Adam",
+    "Sarah",
+    "Laura",
+    "Charlie",
+    "George",
+    "Charlotte",
+    "Liam",
+]
+
+elevenlabs_voice_to_id: Dict[ElevenLabsVoice, str] = {
+    "Adam": "pNInz6obpgDQGcFmaJgB",
+    "Sarah": "EXAVITQu4vr4xnSDxMaL",
+    "Laura": "FGY2WhTYpPnrIDTdsKH5",
+    "Charlie": "IKne3meq5aSn9XLyUdCD",
+    "George": "JBFqnCBsd6RMkjVDRZzb",
+    "Charlotte": "XB0fDUnXU5powFXDhCwa",
+    "Liam": "TX3LPaxmHKxFdv7VOQHJ",
+}
+
+
+@dataclass
+class SpeechJob:  # one unit of text-to-speech work consumed by GenerateSpeech.run
+    content: str  # text passed to the TTS provider as its input
+    voice: OpenaiVoice | ElevenLabsVoice  # voice name; checked against the provider's voice list before use
+    output_file: str  # path the generated bytes are written to (opened in "wb" mode)
+    tag: str  # label printed in the success and failure log lines
+    index: int  # position printed in the success log line
+
+
+class GenerateSpeech:  # sends a SpeechJob to OpenAI or ElevenLabs and writes the result to disk
+    provider: TTSProvider  # "openai" or "elevenlabs"
+
+    def __init__(self, provider: TTSProvider):
+        self.provider = provider
+
+    def run(self, job: SpeechJob):
+        """Synthesize job.content with the configured provider, write the bytes to job.output_file and return that path; returns an empty string on any error."""
+        try:
+            if self.provider == "elevenlabs":
+                content = self.__use_elevenlabs(job)
+            else:  # every provider value other than "elevenlabs" is routed to OpenAI
+                content = self.__use_openai(job)
+
+            with open(job.output_file, "wb") as file:
+                file.write(content)
+
+            print(f"Generated speech for tag {job.tag} at index {job.index}")
+            return job.output_file
+        except Exception as e:  # best-effort: log and signal failure with "" instead of raising
+            print(f"Failed to generate speech for tag: {job.tag}. Error: {str(e)}")
+            return ""
+
+    @process_time()  # prints how long the provider call took
+    def __use_openai(self, job: SpeechJob):  # returns response.content, which run() writes to the output file
+        if job.voice not in openai_voices:
+            raise ValueError("Wrong voice specification for openai tts")
+
+        response = get_openai().audio.speech.create(
+            input=job.content, model="tts-1-hd", voice=job.voice
+        )
+        return response.content
+
+    @process_time()  # prints how long the provider call and chunk collection took
+    def __use_elevenlabs(self, job: SpeechJob):  # returns the concatenation of the chunks yielded by generate()
+        if job.voice not in elevenlabs_voices:
+            raise ValueError("Wrong voice specification for elevenlabs tts")
+        # response = get_elevenlabs_client().text_to_speech.convert(
+        #     model_id="eleven_turbo_v2_5", # use the turbo model for low latency
+        #     text=job.content,
+        #     voice_id=elevenlabs_voice_to_id[job.voice],
+        #     output_format="mp3_22050_32",
+        # )
+        response = get_elevenlabs_client().generate(
+            model="eleven_multilingual_v2",
+            text=job.content,
+            voice=job.voice,  # voice name passed as-is; elevenlabs_voice_to_id is referenced only by the disabled call above
+        )
+
+        buffer = BytesIO()
+        for chunk in response:  # response is consumed chunk by chunk
+            if chunk:  # skip empty chunks
+                buffer.write(chunk)
+
+        buffer.seek(0)
+        return buffer.getvalue()  # getvalue() returns the full buffer contents regardless of position