Skip to content

Commit

Permalink
Use elevenlabs (#5)
Browse files Browse the repository at this point in the history
* add firestore_sdk and session_manager

* save user chats on firestore

* pass down session_id for a deterministic workflow

* handle conversion of chat object to/from a dict

* remove references to langchain

* reuse a previously downloaded audiofile if it's processable

* render audiocast metadata on share page

* cleanup

* temp remove audio_enhancement

* sanitize audiocast transcript

* add elevenlabs client

* add __text_to_speech_elevenlabs; cleanup

* use dry in text_to_speech

* only lint on python versions 3.11 and 3.12

* add write permission to deploy job for marocchino/sticky-pull-request-comment

* use eleven_multilingual_v2 model for improved stability, accuracy and quality
  • Loading branch information
nwaughachukwuma authored Oct 31, 2024
1 parent 82db371 commit e8b0f4f
Show file tree
Hide file tree
Showing 11 changed files with 219 additions and 64 deletions.
13 changes: 9 additions & 4 deletions .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ jobs:
timeout-minutes: 5
steps:
- uses: actions/checkout@v4
- id: setup-python
uses: actions/setup-python@v5
with:
fetch-depth: 0
- uses: actions/setup-python@v5
with:
python-version: "3.12"
cache: "pip" # caching pip dependencies
Expand All @@ -67,8 +68,12 @@ jobs:
runs-on: ubuntu-latest
needs: [prepare, lint]
timeout-minutes: 10
permissions:
pull-requests: write
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- uses: actions/setup-python@v5
with:
python-version: "3.12"
Expand Down Expand Up @@ -104,9 +109,9 @@ jobs:
- run: curl -f "${{ steps.deploy.outputs.url }}"
- uses: marocchino/sticky-pull-request-comment@v2
with:
header: app
header: audiora
message: |
app: ${{ steps.deploy.outputs.url }} (${{ github.event.pull_request.head.sha }})
audiora: ${{ steps.deploy.outputs.url }} (${{ github.event.pull_request.head.sha }})
promote:
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ruff.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10", "3.11", "3.12"]
python-version: ["3.11", "3.12"]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
Expand Down
3 changes: 2 additions & 1 deletion pages/audiocast.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from src.env_var import APP_URL
from src.utils.main_utils import get_audiocast
from src.utils.render_audiocast import parse_ai_script


def navigate_to_home():
Expand Down Expand Up @@ -34,7 +35,7 @@ async def render_audiocast_page():

# Transcript
with st.expander("Show Transcript"):
st.write(audiocast["script"])
st.markdown(parse_ai_script(audiocast["script"]))

# Metadata
st.sidebar.subheader("Audiocast Source")
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ asyncio

openai
anthropic
elevenlabs

pyperclip
python-multipart
Expand Down
11 changes: 11 additions & 0 deletions src/services/elevenlabs_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from elevenlabs.client import ElevenLabs

from src.env_var import ELEVENLABS_API_KEY

# Module-level ElevenLabs client, constructed once when this module is first
# imported, using the API key exposed by src.env_var.
client = ElevenLabs(
    api_key=ELEVENLABS_API_KEY,
)


def get_elevenlabs_client():
    """Return the shared module-level ElevenLabs client instance."""
    return client
46 changes: 30 additions & 16 deletions src/utils/audio_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@
import re
import uuid
from pathlib import Path
from typing import List, Optional
from typing import List, Optional, Tuple

from src.utils.audio_manager_utils import (
AudioManagerConfig,
AudioManagerSpeechGenerator,
ContentSplitter,
openai_voices,
)
from src.utils.audio_synthesizer import AudioSynthesizer
from src.utils.clean_tss_markup import clean_tss_markup
from src.utils.generate_speech_utils import elevenlabs_voices, openai_voices

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -42,7 +42,6 @@ async def generate_speech(self, audio_script: str):
"""
output_file = f"{self.config.outdir_base}/{str(uuid.uuid4())}.mp3"
await self.text_to_speech(audio_script, output_file)

return output_file

async def text_to_speech(self, audio_script: str, output_file: str):
Expand All @@ -56,33 +55,48 @@ async def text_to_speech(self, audio_script: str, output_file: str):
"""
tags = self._get_tags(audio_script)
audio_script = clean_tss_markup(audio_script, tags)
nway_content = self.split_content(audio_script, tags)

print(f"nway_content: {nway_content}")

if self.config.tts_provider == "openai":
return await self.__text_to_speech_openai(audio_script, output_file, tags)
audio_files = await self.__text_to_speech_openai(nway_content, tags)
elif self.config.tts_provider == "elevenlabs":
audio_files = await self.__text_to_speech_elevenlabs(nway_content, tags)
else:
raise Exception("Invalid TTS model specified")

if not audio_files:
raise Exception("No audio files were generated")

await self.__finalize(audio_files, output_file)
logger.info(f"Audio saved to {output_file}")

async def __text_to_speech_openai(
self, audio_script: str, output_file: str, tags: List[str]
):
self, nway_content: List[Tuple[str, str]], tags: List[str]
) -> List[str]:
try:
nway_content = self.split_content(audio_script, tags)
print(f"nway_content: {nway_content}")

jobs = self._prepare_speech_jobs(
nway_content, tags, openai_voices, self.config.temp_audio_dir
)

audio_files = await self._process_speech_jobs(jobs)
if not audio_files:
raise Exception("No audio files were generated")

await self.__finalize(audio_files, output_file)
logger.info(f"Audio saved to {output_file}")

return await self._process_speech_jobs(jobs, provider="openai")
except Exception as e:
raise Exception(f"Error converting text to speech with OpenAI: {str(e)}")

async def __text_to_speech_elevenlabs(
    self, nway_content: List[Tuple[str, str]], tags: List[str]
) -> List[str]:
    """Generate the per-segment audio files using the ElevenLabs voices.

    Args:
        nway_content: The split script as a list of two-string tuples, as
            produced by ``self.split_content``.
        tags: The tags extracted from the audio script.

    Returns:
        Whatever ``self._process_speech_jobs`` yields for the prepared jobs
        (annotated as a list of file paths).

    Raises:
        Exception: If preparing or processing the speech jobs fails; the
            original error is attached as the explicit cause.
    """
    try:
        jobs = self._prepare_speech_jobs(
            nway_content, tags, elevenlabs_voices, self.config.temp_audio_dir
        )
        return await self._process_speech_jobs(jobs, provider="elevenlabs")
    except Exception as e:
        # Chain explicitly so the traceback shows the underlying failure as
        # the direct cause instead of an incidental "during handling" context.
        raise Exception(
            f"Error converting text to speech with Elevenlabs: {str(e)}"
        ) from e

async def __finalize(
self, audio_files: List[str], output_file: str, enhance_audio=False
) -> None:
Expand Down
51 changes: 16 additions & 35 deletions src/utils/audio_manager_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,20 @@
from functools import partial
from itertools import cycle, islice
from pathlib import Path
from typing import Any, List, Literal, Optional, Tuple
from typing import Any, List, Optional, Tuple

from src.services.openai_client import get_openai

OpenaiVoice = Literal["onyx", "shimmer", "echo", "nova", "alloy"]
openai_voices: List[OpenaiVoice] = ["onyx", "shimmer", "echo", "nova", "alloy"]


@dataclass
class SpeechJob:
content: str
voice: OpenaiVoice
output_file: str
tag: str
index: int
from src.utils.generate_speech_utils import (
ElevenLabsVoice,
GenerateSpeech,
OpenaiVoice,
SpeechJob,
TTSProvider,
)


@dataclass
class AudioManagerConfig:
tts_provider: Optional[Literal["openai"]] = "openai"
tts_provider: Optional[TTSProvider] = "openai"
temp_audio_dir: str = field(default_factory=lambda: "/tmp/audiocast")
outdir_base: str = field(default_factory=lambda: "/tmp/audiocast/output")

Expand All @@ -48,7 +42,7 @@ def _prepare_speech_jobs(
self,
nway_content: List[Tuple[str, str]],
tags: List[str],
voices: List[Any],
voices: List[OpenaiVoice] | List[ElevenLabsVoice],
temp_audio_dir: str,
):
jobs: List[SpeechJob] = []
Expand All @@ -73,27 +67,14 @@ def _prepare_speech_jobs(

return jobs

def _generate_speech(self, job: SpeechJob) -> str:
try:
response = get_openai().audio.speech.create(
input=job.content,
model="tts-1-hd",
voice=job.voice,
)

with open(job.output_file, "wb") as file:
file.write(response.content)

print(f"Generated speech for tag {job.tag} at index {job.index}")
return job.output_file
except Exception as e:
print(f"Failed to generate speech for tag {job.tag}: {str(e)}")
return ""

async def _process_speech_jobs(self, jobs: List[SpeechJob]) -> List[str]:
async def _process_speech_jobs(
self, jobs: List[SpeechJob], provider: TTSProvider
) -> List[str]:
loop = asyncio.get_event_loop()
tasks = [
loop.run_in_executor(self.executor, partial(self._generate_speech, job))
loop.run_in_executor(
self.executor, partial(GenerateSpeech(provider).run, job)
)
for job in jobs
]

Expand Down
36 changes: 36 additions & 0 deletions src/utils/decorators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import asyncio
from functools import wraps
from time import time


def process_time():
    """Build a decorator that prints how long each call to a function takes.

    The decorator works for both regular functions and coroutine functions;
    coroutine functions are awaited inside an async wrapper so the measured
    time covers the awaited work. The elapsed time is printed only when the
    wrapped call returns normally.

    Returns:
        A decorator that wraps ``func`` and prints
        ``Execution time for <func.__name__>: <seconds>s`` after each call,
        passing the wrapped function's return value through unchanged.
    """
    # perf_counter is monotonic and high-resolution, so the measured duration
    # cannot be skewed by system clock adjustments the way time() can.
    # Imported locally so the block is self-contained.
    from time import perf_counter

    def decorator(func):
        def report(started_at):
            # Shared by the sync and async wrappers to keep the output identical.
            time_diff = f"{(perf_counter() - started_at):.2f}s"
            print(f"Execution time for {func.__name__}: {time_diff}")

        if asyncio.iscoroutinefunction(func):

            @wraps(func)
            async def async_wrapper(*args, **kwargs):
                start_time = perf_counter()
                response = await func(*args, **kwargs)
                report(start_time)
                return response

            return async_wrapper

        @wraps(func)
        def wrapper(*args, **kwargs):
            start_time = perf_counter()
            response = func(*args, **kwargs)
            report(start_time)
            return response

        return wrapper

    return decorator
102 changes: 102 additions & 0 deletions src/utils/generate_speech_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from dataclasses import dataclass
from io import BytesIO
from typing import Dict, List, Literal

from src.services.elevenlabs_client import get_elevenlabs_client
from src.services.openai_client import get_openai
from src.utils.decorators import process_time

# Names of the text-to-speech backends a speech job can be routed to.
TTSProvider = Literal["openai", "elevenlabs"]

# Voice names accepted for the OpenAI backend. The list mirrors the Literal so
# it can be used for runtime membership checks.
OpenaiVoice = Literal["onyx", "shimmer", "echo", "nova", "alloy"]
openai_voices: List[OpenaiVoice] = ["onyx", "shimmer", "echo", "nova", "alloy"]

# Voice names accepted for the ElevenLabs backend, again as both a static
# type and a runtime list.
ElevenLabsVoice = Literal[
    "Adam", "Sarah", "Laura", "Charlie", "George", "Charlotte", "Liam"
]
elevenlabs_voices: List[ElevenLabsVoice] = [
    "Adam",
    "Sarah",
    "Laura",
    "Charlie",
    "George",
    "Charlotte",
    "Liam",
]

# Lookup from ElevenLabs voice name to a voice id string.
# NOTE(review): in this module it is only referenced from commented-out code
# (the `text_to_speech.convert` variant) - confirm whether it is still needed.
elevenlabs_voice_to_id: Dict[ElevenLabsVoice, str] = {
    "Adam": "pNInz6obpgDQGcFmaJgB",
    "Sarah": "EXAVITQu4vr4xnSDxMaL",
    "Laura": "FGY2WhTYpPnrIDTdsKH5",
    "Charlie": "IKne3meq5aSn9XLyUdCD",
    "George": "JBFqnCBsd6RMkjVDRZzb",
    "Charlotte": "XB0fDUnXU5powFXDhCwa",
    "Liam": "TX3LPaxmHKxFdv7VOQHJ",
}


@dataclass
class SpeechJob:
    """One unit of text-to-speech work: a piece of text, a voice and a target file."""

    # Text handed to the TTS provider as the speech input.
    content: str
    # Voice name; must belong to the voice list of the provider that runs the job.
    voice: OpenaiVoice | ElevenLabsVoice
    # Path the generated audio bytes are written to (opened in binary write mode).
    output_file: str
    # Tag this content belongs to; reported in the progress/failure messages.
    tag: str
    # Position of the job; reported in the progress message.
    index: int


class GenerateSpeech:
    """Turn a single SpeechJob into an audio file using one TTS provider."""

    provider: TTSProvider

    def __init__(self, provider: TTSProvider):
        self.provider = provider

    def run(self, job: SpeechJob):
        """Generate speech using the specified provider

        The audio bytes returned by the provider are written to
        ``job.output_file``.

        Returns:
            ``job.output_file`` on success, or an empty string when anything
            fails (the error is printed rather than raised).
        """
        try:
            # Anything other than "elevenlabs" falls back to the OpenAI path.
            synthesize = (
                self.__use_elevenlabs
                if self.provider == "elevenlabs"
                else self.__use_openai
            )
            audio_bytes = synthesize(job)

            with open(job.output_file, "wb") as out:
                out.write(audio_bytes)

            print(f"Generated speech for tag {job.tag} at index {job.index}")
            return job.output_file
        except Exception as e:
            print(f"Failed to generate speech for tag: {job.tag}. Error: {str(e)}")
            return ""

    @process_time()
    def __use_openai(self, job: SpeechJob):
        """Return the audio bytes for ``job`` from the OpenAI speech endpoint.

        Raises:
            ValueError: If ``job.voice`` is not one of ``openai_voices``.
        """
        if job.voice not in openai_voices:
            raise ValueError("Wrong voice specification for openai tts")

        speech_api = get_openai().audio.speech
        result = speech_api.create(
            input=job.content,
            model="tts-1-hd",
            voice=job.voice,
        )
        return result.content

    @process_time()
    def __use_elevenlabs(self, job: SpeechJob):
        """Return the audio bytes for ``job`` from the ElevenLabs client.

        Raises:
            ValueError: If ``job.voice`` is not one of ``elevenlabs_voices``.
        """
        if job.voice not in elevenlabs_voices:
            raise ValueError("Wrong voice specification for elevenlabs tts")

        # NOTE: a previously tried alternative was `text_to_speech.convert`
        # with model_id="eleven_turbo_v2_5" (the turbo model, for low latency),
        # voice_id=elevenlabs_voice_to_id[job.voice] and
        # output_format="mp3_22050_32".
        audio_stream = get_elevenlabs_client().generate(
            model="eleven_multilingual_v2",
            text=job.content,
            voice=job.voice,
        )

        # The response is consumed chunk by chunk; falsy chunks are skipped.
        collected = BytesIO()
        for piece in filter(None, audio_stream):
            collected.write(piece)

        return collected.getvalue()
Loading

0 comments on commit e8b0f4f

Please sign in to comment.