From 8664cbbfec2a64400da8c21fa5fb9e9a7ba10afb Mon Sep 17 00:00:00 2001
From: Chukwuma Nwaugha
Date: Fri, 1 Nov 2024 20:37:25 +0000
Subject: [PATCH] Render audio spectrogram (#6)

* add firestore_sdk and session_manager
* save user chats on firestore
* pass down session_id for a deterministic workflow
* handle conversion of chat object to/from a dict
* remove references to langchain
* reuse a previously downloaded audio file if it's processable
* render audiocast metadata on share page
* cleanup
* temp remove audio_enchancement
* sanitize audiocast transcript
* add elevenlabs client
* add __text_to_speech_elevenlabs; cleanup
* use DRY in text_to_speech
* only lint on python versions 3.11 and 3.12
* add write permission to deploy job for marocchino/sticky-pull-request-comment
* use eleven_multilingual_v2 model for improved stability, accuracy and quality
* Refactor audiocast page to include waveform visualization
* put waveform viz in an expander
* cleanup
* move download_waveform_video internal to render_waveform
* allow toggling waveform visualizer
* save waveform to GCS
* reshuffle dependencies in requirements.txt
* add pycairo to deps
* fix reference to pyproject.toml
* add deps for cairo library
---
 Dockerfile                          | 11 ++--
 pages/audiocast.py                  | 59 +++++++++-----------
 pyprojec.toml => pyproject.toml     |  0
 requirements.txt                    | 10 +++-
 src/services/storage.py             | 24 ++++++---
 src/utils/audio_to_video.py         | 38 +++++++++++++
 src/utils/render_audiocast.py       | 34 ++----------
 src/utils/render_audiocast_utils.py | 73 +++++++++++++++++++++++++
 src/utils/waveform_utils.py         | 83 +++++++++++++++++++++++++++++
 9 files changed, 256 insertions(+), 76 deletions(-)
 rename pyprojec.toml => pyproject.toml (100%)
 create mode 100644 src/utils/audio_to_video.py
 create mode 100644 src/utils/render_audiocast_utils.py
 create mode 100644 src/utils/waveform_utils.py

diff --git a/Dockerfile b/Dockerfile
index fa87d0f..ae23133 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,9 +8,14 @@ ENV PYTHONDONTWRITEBYTECODE 1
 
 WORKDIR /app
 
-# Install FFmpeg and any other required dependencies
-RUN apt-get -yqq update && apt-get -yqq install build-essential ffmpeg && \
-    rm -rf /var/lib/apt/lists/*
+# Install FFmpeg, Cairo, and any other required dependencies
+RUN apt-get -yqq update && apt-get -yqq install \
+    build-essential \
+    ffmpeg \
+    libcairo2-dev \
+    pkg-config \
+    python3-dev \
+    && rm -rf /var/lib/apt/lists/*
 
 COPY . ./
diff --git a/pages/audiocast.py b/pages/audiocast.py
index 326964b..0307cc1 100644
--- a/pages/audiocast.py
+++ b/pages/audiocast.py
@@ -1,17 +1,15 @@
 import asyncio
-from pathlib import Path
+from typing import cast
 
 import pyperclip
 import streamlit as st
 
-from src.env_var import APP_URL
 from src.utils.main_utils import get_audiocast
-from src.utils.render_audiocast import parse_ai_script
-
-
-def navigate_to_home():
-    main_script = str(Path(__file__).parent.parent / "app.py")
-    st.switch_page(main_script)
+from src.utils.render_audiocast_utils import (
+    GenerateAudiocastDict,
+    navigate_to_home,
+    render_audiocast_handler,
+)
 
 
 async def render_audiocast_page():
@@ -23,40 +21,31 @@ async def render_audiocast_page():
     # Display audiocast content
     st.title("🎧 Audiora")
     st.subheader("Share Page ")
-
-    st.markdown(f"#### Viewing audiocast: {session_id}")
+    st.markdown(f"##### Viewing audiocast: _{session_id}_")
 
     try:
         with st.spinner("Loading audiocast..."):
-            audiocast = get_audiocast(session_id)
-
-        # Audio player
-        st.audio(audiocast["url"])
-
-        # Transcript
-        with st.expander("Show Transcript"):
-            st.markdown(parse_ai_script(audiocast["script"]))
+            audiocast = cast(GenerateAudiocastDict, get_audiocast(session_id))
 
-        # Metadata
-        st.sidebar.subheader("Audiocast Source")
-        st.sidebar.markdown(audiocast["source_content"])
+        share_url = render_audiocast_handler(session_id, audiocast)
 
-        share_url = f"{APP_URL}/audiocast?session_id={session_id}"
-        st.text_input("Share this audiocast:", share_url)
+        share_col, restart_row = st.columns(2, vertical_alignment="bottom")
 
-        share_col, restart_row = st.columns(2, vertical_alignment="bottom")
+        with share_col:
+            if st.button("Copy Share link", use_container_width=True):
+                pyperclip.copy(share_url)
+                st.session_state.show_copy_success = True
 
-        with share_col:
-            if st.button("Copy Share link", use_container_width=True):
-                pyperclip.copy(share_url)
-                st.session_state.show_copy_success = True
+        with restart_row:
+            if st.button("Create your Audiocast", use_container_width=True):
+                navigate_to_home()
 
-        with restart_row:
-            if st.button("Create your Audiocast", use_container_width=True):
-                navigate_to_home()
+        if st.session_state.get("show_copy_success", False):
+            st.session_state.show_copy_success = False
+            st.success("Share link copied successfully!", icon="✅")
 
-        if audiocast["created_at"]:
-            st.markdown(f"> Created: {audiocast["created_at"]}")
+        if audiocast["created_at"]:
+            st.markdown(f"> Created: {audiocast["created_at"]}")
     except Exception as e:
         st.error(f"Error loading audiocast: {str(e)}")
@@ -67,8 +56,8 @@ async def render_audiocast_page():
 
     st.markdown("---")
 
-    cola, _ = st.columns([3, 5])
-    with cola:
+    col1, _ = st.columns([3, 5])
+    with col1:
         if st.button("← Back to Home", use_container_width=True):
             navigate_to_home()
 
diff --git a/pyprojec.toml b/pyproject.toml
similarity index 100%
rename from pyprojec.toml
rename to pyproject.toml
diff --git a/requirements.txt b/requirements.txt
index cf5d21f..834ebf4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
+pydantic
+
 streamlit
 httpx
 asyncio
@@ -6,12 +8,15 @@ openai
 anthropic
 elevenlabs
 
-pyperclip
 python-multipart
 python-slugify
 python-dotenv
-pydub
 
+ffmpeg-python
+pydub
+pyperclip
+seewav
+pycairo
 
 firebase-admin
 google-auth
@@ -19,4 +24,5 @@ google-cloud-storage
 google-api-python-client
 google-generativeai
 
+watchdog
 ruff
\ No newline at end of file
diff --git a/src/services/storage.py b/src/services/storage.py
index 1e14bca..8710062 100644
--- a/src/services/storage.py
+++ b/src/services/storage.py
@@ -20,13 +20,6 @@ def listBlobs(prefix):
     return [blob for blob in blobs]
 
 
-def check_file_exists(root_path: str, filename: str):
-    """check if a file exists in the bucket"""
-    blobname = f"{root_path}/{filename}"
-    blobs = listBlobs(prefix=root_path)
-    return any(blob.name == blobname for blob in blobs)
-
-
 @dataclass
 class UploadItemParams:
     content_type: str
@@ -35,6 +28,12 @@ class UploadItemParams:
 
 
 class StorageManager:
+    def check_blob_exists(self, root_path: str, filename: str):
+        """check if a file exists in the bucket"""
+        blobname = f"{root_path}/{filename}"
+        blobs = listBlobs(prefix=root_path)
+        return any(blob.name == blobname for blob in blobs)
+
     def upload_to_gcs(
         self, item: str | Path | BytesIO, blobname: str, params: UploadItemParams
     ):
@@ -66,6 +65,17 @@ def upload_audio_to_gcs(self, tmp_audio_path: str, filename=str(uuid4())):
 
         return f"gs://{BUCKET_NAME}/{blobname}"
 
+    def upload_video_to_gcs(self, tmp_video_path: str, filename=str(uuid4())):
+        """upload video file to GCS"""
+        blobname = f"{BLOB_BASE_URI}/{filename}"
+        self.upload_to_gcs(
+            Path(tmp_video_path),
+            blobname,
+            UploadItemParams(content_type="video/mp4"),
+        )
+
+        return f"gs://{BUCKET_NAME}/{blobname}"
+
     def download_from_gcs(self, filename: str):
         """
         Download any item on GCS to disk
diff --git a/src/utils/audio_to_video.py b/src/utils/audio_to_video.py
new file mode 100644
index 0000000..4ae3fe8
--- /dev/null
+++ b/src/utils/audio_to_video.py
@@ -0,0 +1,38 @@
+import os
+import subprocess
+
+
+def create_video_from_audio(audio_path: str, image_path: str, output_path: str):
+    """Create a video with audio and spectrogram overlay."""
+    cmd = [
+        "ffmpeg",
+        "-y",
+        "-loop",
+        "1",
+        "-i",
+        image_path,
+        "-i",
+        audio_path,
+        "-c:v",
+        "libx264",
+        "-tune",
+        "stillimage",
+        "-c:a",
+        "aac",
+        "-b:a",
+        "192k",
+        "-pix_fmt",
+        "yuv420p",
+        "-shortest",
+        output_path,
+    ]
+
+    try:
+        subprocess.run(cmd, check=True)
+        os.remove(image_path)  # Clean up temporary spectrogram
+        return True
+    except subprocess.CalledProcessError as e:
+        print(f"Error during video creation: {str(e)}")
+        return False
+    except Exception as e:
+        print(f"Error during video creation: {str(e)}")
diff --git a/src/utils/render_audiocast.py b/src/utils/render_audiocast.py
index 556070a..08227cf 100644
--- a/src/utils/render_audiocast.py
+++ b/src/utils/render_audiocast.py
@@ -1,25 +1,13 @@
-import re
-from typing import TypedDict
-
 import pyperclip
 import streamlit as st
 
-from src.env_var import APP_URL
 from src.utils.main_utils import get_audiocast
+from src.utils.render_audiocast_utils import (
+    GenerateAudiocastDict,
+    render_audiocast_handler,
+)
 from src.utils.session_state import reset_session
 
 
-class GenerateAudiocastDict(TypedDict):
-    url: str
-    script: str
-    source_content: str
-    created_at: str | None
-
-
-def parse_ai_script(ai_script: str):
-    matches = re.findall(r"<(Speaker\d+)>(.*?)</Speaker\d+>", ai_script, re.DOTALL)
-    return "\n\n".join([f"**{speaker}**: {content}" for speaker, content in matches])
-
-
 def render_audiocast(session_id: str):
     """
     Render the audiocast based on the user's preferences
@@ -28,19 +16,7 @@ def render_audiocast(session_id: str):
     st.markdown("#### Your Audiocast")
     current_audiocast: GenerateAudiocastDict = st.session_state.current_audiocast
 
-    # Audio player
-    st.audio(current_audiocast["url"])
-
-    # Transcript
-    with st.expander("Show Transcript"):
-        st.markdown(parse_ai_script(current_audiocast["script"]))
-
-    # Metadata
-    st.sidebar.subheader("Audiocast Source")
-    st.sidebar.markdown(current_audiocast["source_content"])
-
-    share_url = f"{APP_URL}/audiocast?session_id={session_id}"
f"{APP_URL}/audiocast?session_id={session_id}" - st.text_input("Share this audiocast:", share_url) + share_url = render_audiocast_handler(session_id, current_audiocast) share_col, restart_row = st.columns(2, vertical_alignment="bottom") diff --git a/src/utils/render_audiocast_utils.py b/src/utils/render_audiocast_utils.py new file mode 100644 index 0000000..857a2ee --- /dev/null +++ b/src/utils/render_audiocast_utils.py @@ -0,0 +1,73 @@ +import re +from pathlib import Path +from typing import TypedDict + +import streamlit as st + +from src.env_var import APP_URL +from src.utils.waveform_utils import render_waveform + + +def navigate_to_home(): + main_script = str(Path(__file__).parent.parent / "app.py") + st.switch_page(main_script) + + +def parse_ai_script(ai_script: str): + matches = re.findall(r"<(Speaker\d+)>(.*?)", ai_script, re.DOTALL) + return "\n\n".join([f"**{speaker}**: {content}" for speaker, content in matches]) + + +class GenerateAudiocastDict(TypedDict): + url: str + script: str + source_content: str + created_at: str | None + + +def render_audiocast_handler(session_id: str, audiocast: GenerateAudiocastDict): + # Audio player + st.audio(audiocast["url"]) + + st.markdown("---") + + col1, _ = st.columns([4, 1]) + with col1: + + def toggle_show_waveform(): + st.session_state.show_waveform = not st.session_state.get("show_waveform") + + button_label = ( + "Hide Waveform Visualization" + if st.session_state.get("show_waveform") + else "Show Waveform Visualization" + ) + + st.button( + button_label, + on_click=toggle_show_waveform, + use_container_width=True, + ) + + if st.session_state.get("show_waveform"): + try: + render_waveform(session_id, audiocast["url"]) + except Exception as e: + st.error(f"Error rendering waveform: {str(e)}") + + st.markdown("---") + + # Transcript + with st.expander("Show Transcript"): + st.markdown(parse_ai_script(audiocast["script"])) + + st.markdown("---") + + # Metadata + st.sidebar.subheader("Audiocast Source") + st.sidebar.markdown(audiocast["source_content"]) + + share_url = f"{APP_URL}/audiocast?session_id={session_id}" + st.text_input("Share this audiocast:", share_url) + + return share_url diff --git a/src/utils/waveform_utils.py b/src/utils/waveform_utils.py new file mode 100644 index 0000000..9da3f1c --- /dev/null +++ b/src/utils/waveform_utils.py @@ -0,0 +1,83 @@ +import os +import tempfile +from pathlib import Path + +import streamlit as st +from pydub import AudioSegment +from seewav import visualize + +from src.services.storage import BLOB_BASE_URI, StorageManager + + +def save_waveform_video_to_gcs(session_id: str, video_path: str): + """Ingest waveform visualization to GCS.""" + full_path = StorageManager().upload_video_to_gcs(video_path, f"{session_id}.mp4") + return full_path + + +def generate_waveform_video(output_path: Path, audio_path: str) -> Path: + """Generate waveform video from audio file using SeeWav.""" + with tempfile.TemporaryDirectory() as temp_dir: + visualize( + audio=Path(audio_path), + tmp=Path(temp_dir), + out=output_path, + bars=60, + speed=4, + time=0.4, + # rate=60, + size=(120, 68), + fg_color=(0.0, 1.0, 0.6), # Bright green. 
+            bg_color=(0.05, 0.05, 0.05),  # Near black
+        )
+        return output_path
+
+
+def render_waveform(session_id: str, audio_path: str):
+    """Render waveform visualization from audio file."""
+    tmp_directory = Path("/tmp/audiora/waveforms")
+    tmp_directory.mkdir(parents=True, exist_ok=True)
+    tmp_vid_path = tmp_directory / f"{session_id}.mp4"
+
+    video_path = None
+    if os.path.exists(tmp_vid_path):
+        try:
+            mp4_version = AudioSegment.from_file(str(tmp_vid_path), "mp4")
+            if mp4_version.duration_seconds > 0:
+                video_path = tmp_vid_path
+        except Exception:
+            os.remove(tmp_vid_path)
+    else:
+        blobname = f"{session_id}.mp4"
+        exists = StorageManager().check_blob_exists(BLOB_BASE_URI, blobname)
+        if exists:
+            video_path = StorageManager().download_from_gcs(blobname)
+
+    try:
+        if not video_path:
+            with st.spinner("Generating waveform visualization..."):
+                video_path = generate_waveform_video(tmp_vid_path, audio_path)
+                save_waveform_video_to_gcs(session_id, str(video_path))
+
+        # st.video(str(video_path), autoplay=True)
+        with open(video_path, "rb") as video_file:
+            video_bytes = video_file.read()
+            st.video(video_bytes, autoplay=True)
+
+        download_waveform_video(str(video_path))
+    except Exception as e:
+        st.error(f"Error generating visualization: {str(e)}")
+
+
+def download_waveform_video(video_path: str):
+    """Download video with waveform."""
+    gen_video, _ = st.columns(2)
+    with gen_video:
+        with open(video_path, "rb") as f:
+            st.download_button(
+                label="Download Video with waveform",
+                data=f,
+                file_name="audio_visualization.mp4",
+                mime="video/mp4",
+                use_container_width=True,
+            )
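
For orientation, a minimal usage sketch of the waveform pipeline introduced above: generate a seewav video for an audiocast, then persist it via the new video upload path. The session id, audio path, and output directory are hypothetical placeholders, and configured GCS credentials are assumed; this is not part of the patch itself.

    # sketch: generate a waveform video and upload it to GCS
    from pathlib import Path

    from src.utils.waveform_utils import generate_waveform_video, save_waveform_video_to_gcs

    session_id = "demo-session"                           # hypothetical session id
    audio_path = "/tmp/audiora/demo-session.mp3"          # hypothetical local audio file
    output_path = Path("/tmp/audiora/waveforms") / f"{session_id}.mp4"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Render the bar-style waveform video with seewav, then store it as video/mp4
    video_path = generate_waveform_video(output_path, audio_path)
    gcs_uri = save_waveform_video_to_gcs(session_id, str(video_path))
    print(gcs_uri)  # e.g. gs://<bucket>/<blob-base>/demo-session.mp4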