Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow uploading attachments #24

Merged
merged 34 commits into from
Dec 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
9a9190c
create an improved chatbox item with widgets
nwaughachukwuma Dec 5, 2024
6b2767a
cleanup examples and example_card for use in chatbox_and_widget
nwaughachukwuma Dec 5, 2024
d94cc60
hide chatbox on the home page
nwaughachukwuma Dec 5, 2024
ac29351
simplify the landing page with chat_box_and_widget
nwaughachukwuma Dec 6, 2024
613fb04
enhance ChatBoxAndWidget with search validation and additional text
nwaughachukwuma Dec 6, 2024
26b8637
allow overriding tools slot in chat_box_and_widget
nwaughachukwuma Dec 6, 2024
5441345
add logic to upload and preview file attachments
nwaughachukwuma Dec 6, 2024
8e6b0aa
improve chat_box attachment and attachment preview components
nwaughachukwuma Dec 6, 2024
88fc35f
Merge remote-tracking branch 'origin/main' into allow-uploading-attac…
nwaughachukwuma Dec 6, 2024
15c5d0b
add endpoint to store file uploads
nwaughachukwuma Dec 6, 2024
ef63b94
use the correct response_model type
nwaughachukwuma Dec 6, 2024
460a380
refactor file upload handling to check for existing blobs and return …
nwaughachukwuma Dec 6, 2024
da9fdf1
refactor /store-file-upload endpoint to only accept single file
nwaughachukwuma Dec 6, 2024
1fb3834
refactor ChatBoxAttachment to manage uploaded files with a writable s…
nwaughachukwuma Dec 6, 2024
b9c690e
refactor ChatBoxAttachment and ChatBoxAndWidget to use attachments co…
nwaughachukwuma Dec 6, 2024
bbb41e9
render loading state and file_icon
nwaughachukwuma Dec 6, 2024
80ac879
add logic and endpoint to summarize custom sources
nwaughachukwuma Dec 6, 2024
3c94868
cleanup
nwaughachukwuma Dec 6, 2024
07a7b4b
add caching for attachments summary in chat endpoint and update syste…
nwaughachukwuma Dec 6, 2024
15e9faf
refactor attachment handling to use session-specific context and impr…
nwaughachukwuma Dec 6, 2024
26449de
fix bug and cleanup
nwaughachukwuma Dec 6, 2024
d10c2a5
keep local audio_sources in sync with db
nwaughachukwuma Dec 6, 2024
85087e7
save attachments in background tasks as link custom sources
nwaughachukwuma Dec 6, 2024
6336800
refactor custom source management to utilize FieldFilter for querying…
nwaughachukwuma Dec 6, 2024
e87e417
add GCS URL resolution and blob name extraction in storage manager
nwaughachukwuma Dec 7, 2024
e69c488
refactor audio source management and cleanup session context handling
nwaughachukwuma Dec 7, 2024
938750b
add retry decorator, create decorators dir
nwaughachukwuma Dec 7, 2024
a574075
use a backoff in get_signed_url_endpoint
nwaughachukwuma Dec 7, 2024
b4d37e6
use the new retry_decorator in get_signed_url_endpoint
nwaughachukwuma Dec 7, 2024
596ea23
default to openai for tts on dev env
nwaughachukwuma Dec 7, 2024
dbcc7e8
store file uploads as plain text when preserve is not required
nwaughachukwuma Dec 7, 2024
cc1141a
override loading message when generating first response with attachments
nwaughachukwuma Dec 7, 2024
1270122
add auto_resize logic to textarea
nwaughachukwuma Dec 7, 2024
97ab1d5
move attachment preview to the top
nwaughachukwuma Dec 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions api/src/env_var.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@

# Search credentials read with subscript access, so a missing variable fails
# immediately (KeyError, assuming `environ` is `os.environ` — import not shown here).
# NOTE(review): "CSE" presumably stands for Google Custom Search Engine — confirm.
CSE_ID = environ["CSE_ID"]
# Note the asymmetric name: the value comes from GOOGLE_API_KEY, not CSE_API_KEY.
CSE_API_KEY = environ["GOOGLE_API_KEY"]

# True only when ENV is exactly "prod"; an unset ENV defaults to "dev".
PROD_ENV = environ.get("ENV", "dev") == "prod"
89 changes: 68 additions & 21 deletions api/src/main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import asyncio
from io import BytesIO
from time import time
from typing import Any, Callable, Generator

Expand All @@ -7,7 +7,7 @@
from fastapi.responses import JSONResponse, StreamingResponse
from fastapi_utilities import add_timer_middleware

from .services.storage import StorageManager
from .services.storage import BLOB_BASE_URI, StorageManager, UploadItemParams
from .utils.chat_request import chat_request
from .utils.chat_utils import (
ContentCategory,
Expand All @@ -24,14 +24,18 @@
GetCustomSourcesRequest,
generate_custom_source,
)
from .utils.custom_sources.manage_attachments import ManageAttachments
from .utils.custom_sources.read_content import ReadContent
from .utils.custom_sources.save_copied_source import CopiedPasteSourceRequest, save_copied_source
from .utils.custom_sources.save_uploaded_sources import UploadedFiles
from .utils.decorators.retry_decorator import RetryConfig, retry
from .utils.detect_content_category import DetectContentCategoryRequest, detect_content_category
from .utils.generate_audiocast import GenerateAudioCastRequest, GenerateAudiocastException, generate_audiocast
from .utils.generate_audiocast_source import GenerateAudiocastSource, generate_audiocast_source
from .utils.get_audiocast import get_audiocast
from .utils.get_session_title import GetSessionTitleModel, get_session_title
from .utils.session_manager import SessionManager, SessionModel
from .utils.summarize_custom_sources import SummarizeCustomSourcesRequest, summarize_custom_sources

# FastAPI application object; the route decorators in this module attach to it.
app = FastAPI(title="Audiora", version="1.0.0")

Expand Down Expand Up @@ -69,26 +73,35 @@ def root():


@app.post("/chat/{session_id}", response_model=Generator[str, Any, None])
def chat_endpoint(
async def chat_endpoint(
session_id: str,
request: SessionChatRequest,
background_tasks: BackgroundTasks,
):
"""Chat endpoint"""
category = request.contentCategory
attachments = request.attachments

db = SessionManager(session_id, category)
db._add_chat(request.chatItem)

attachment_manager = ManageAttachments(session_id)
sources_summary = await attachment_manager.get_attachments_summary(db, attachments)

def on_finish(text: str):
background_tasks.add_task(db._update, {"status": "collating"})
background_tasks.add_task(
db._add_chat,
SessionChatItem(role="assistant", content=text),
)

if attachments:
background_tasks.add_task(attachment_manager.store_attachments, attachments)

response = chat_request(
content_category=category,
previous_messages=db._get_chats(),
reference_material=sources_summary,
on_finish=on_finish,
)

Expand Down Expand Up @@ -128,27 +141,22 @@ async def get_signed_url_endpoint(blobname: str):
"""
Get signed URL for generated audiocast
"""
retry_count = 0
max_retries = 3
errors: list[str] = []

while retry_count < max_retries:
try:
url = StorageManager().get_signed_url(blobname=blobname)
return JSONResponse(
content=url,
headers={
"Content-Type": "application/json",
"Cache-Control": "public, max-age=86390, immutable",
},
)
except Exception as e:
errors.append(str(e))
@retry(RetryConfig(max_retries=3, delay=5, backoff=1.5))
def handler() -> str | None:
return StorageManager().get_signed_url(blobname=blobname)

await asyncio.sleep(5)
retry_count += 1
url = handler()
if not url:
raise HTTPException(status_code=500, detail="Failed to get signed URL")

raise HTTPException(status_code=500, detail="".join(errors))
return JSONResponse(
content=url,
headers={
"Content-Type": "application/json",
"Cache-Control": "public, max-age=86390, immutable",
},
)


@app.post("/get-session-title", response_model=str)
Expand Down Expand Up @@ -216,3 +224,42 @@ async def detect_category_endpoint(request: DetectContentCategoryRequest):
Detect category of a given content
"""
return await detect_content_category(request.content)


@app.post("/store-file-upload", response_model=str)
async def store_file_upload(file: UploadFile, filename: str = Form(...), preserve: bool = Form(False)):
    """
    Persist a file sent by the frontend and return where it is stored.

    If a blob named `filename` is already present, its gs:// URL is returned
    and nothing is uploaded. Otherwise the file is read through `ReadContent`
    and uploaded under `BLOB_BASE_URI`; the upload call's result is returned.
    """
    print(f"Storing file: {filename}. Preserve: {preserve}")

    storage_manager = StorageManager()
    if storage_manager.check_blob_exists(filename):
        return storage_manager.get_gcs_url(filename)

    payload = await ReadContent()._read_file(file, preserve=preserve)

    # Raw uploads keep the client-reported MIME type (with a generic binary
    # fallback); everything else is stored as plain text.
    if preserve or isinstance(payload, BytesIO):
        mime_type = file.content_type or "application/octet-stream"
    else:
        mime_type = "text/plain"

    upload_params = UploadItemParams(
        cache_control="public, max-age=31536000",
        content_type=mime_type,
    )
    return storage_manager.upload_to_gcs(
        item=payload,
        blobname=f"{BLOB_BASE_URI}/{filename}",
        params=upload_params,
    )


@app.post("/summarize-custom-sources", response_model=str)
async def summarize_custom_sources_endpoint(request: SummarizeCustomSourcesRequest):
    """
    Return the summary produced for the source URLs carried in the request
    """
    source_urls = request.sourceURLs
    summary = await summarize_custom_sources(source_urls)
    return summary
15 changes: 15 additions & 0 deletions api/src/services/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ class UploadItemParams:


class StorageManager:
bucket_name = BUCKET_NAME

def check_blob_exists(self, filename: str, root_path=BLOB_BASE_URI):
"""check if a file exists in the bucket"""
blobname = f"{root_path}/{filename}"
Expand Down Expand Up @@ -123,3 +125,16 @@ def get_signed_url(self, blobname, expiration=datetime.timedelta(days=1)):
expiration=expiration,
method="GET",
)

def get_gcs_url(self, filename: str):
    """get the gs:// URL of a file stored under BLOB_BASE_URI in the bucket

    The bucket is read from `self.bucket_name`, the same attribute
    `get_blobname_from_url` strips, so the two methods stay inverses of
    each other.
    """
    blobname = f"{BLOB_BASE_URI}/{filename}"
    return f"gs://{self.bucket_name}/{blobname}"

def get_blob(self, blobname: str):
    """return the bucket's blob handle for the given blob name"""
    blob = bucket.blob(blobname)
    return blob

def get_blobname_from_url(self, url: str):
    """get blobname from a gs:// URL of this bucket

    Only the leading `gs://<bucket>/` prefix is stripped. A URL that does not
    start with that prefix is returned unchanged, and a later occurrence of
    the same text inside the blob name is left intact.
    """
    prefix = f"gs://{self.bucket_name}/"
    # removeprefix rather than replace: replace would also delete the prefix
    # text wherever it reappears inside the blob name.
    return url.removeprefix(prefix)
14 changes: 9 additions & 5 deletions api/src/utils/chat_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,22 @@
from src.utils.chat_utils import ContentCategory, SessionChatItem


def get_system_message(content_category: ContentCategory):
def get_system_message(content_category: ContentCategory, reference_material: str | None = None):
return f"""
1. You're a super-intelligent AI. Your task is to understand what audiocast a user wants to listen to.
2. You will steer the conversation providing eliciting questions until you have enough context.
3. Keep the conversation exchange short, say 3-5 back and forth i.e., questions and answers.
3. If the user provides a reference material, steer the conversation based on it until you have enough context to understand what audiocast the user wants.
4. Keep the conversation exchange short, say 3-5 back and forth i.e., questions and answers.
4. As soon as you have enough context and the user's request is clear terminate the conversation by saying "Ok, thanks for clarifying! You want to listen to [Best case summary of user request so far]. Please click the button below to start generating the audiocast."
6. If the user's request remains unclear after 5 responses for clarity, terminate the conversation by saying "Your request is not very specific but from what I understand, you want to listen to [Best case summary of user request so far]. Please click the button below to start generating the audiocast."

{"REFERENCE MATERIAL: " + reference_material if reference_material else ""}

GENERAL IDEA AND WORKFLOW:
1. A user comes to you with a request for an audiocast of type {content_category}.
2. You need to ask the user questions (elicitation) to understand what kind of audiocast they want to listen to.
3. Once you have enough context, within 3-5 exchanges, you should terminate the conversation.
2. The request can include a reference material: a high-level description of the audiocast they want.
3. You will ask the user questions (elicitation) to understand what kind of audiocast they want to listen to.
4. Once you have enough context, within 3-5 exchanges, you should terminate the conversation.

IMPORTANT NOTES:
1. Your task is to understand the user's request only by eliciting questions.
Expand All @@ -28,12 +31,13 @@ def get_system_message(content_category: ContentCategory):
def chat_request(
content_category: ContentCategory,
previous_messages: List[SessionChatItem],
reference_material: Optional[str] = None,
on_finish: Optional[Callable[[str], Any]] = None,
):
response_stream = get_openai().chat.completions.create(
model="gpt-4o",
messages=[
{"role": "system", "content": get_system_message(content_category)},
{"role": "system", "content": get_system_message(content_category, reference_material)},
*[
{"role": "user", "content": msg.content}
if msg.role == "user"
Expand Down
3 changes: 2 additions & 1 deletion api/src/utils/chat_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import uuid
from typing import Dict, List, Literal
from typing import Dict, List, Literal, Optional

from pydantic import BaseModel, Field

Expand Down Expand Up @@ -57,3 +57,4 @@ class SessionChatItem(BaseModel):
class SessionChatRequest(BaseModel):
    # Request body of the chat endpoint.
    # Category used when building the session and the chat prompt.
    contentCategory: ContentCategory
    # The chat message being added to the session.
    chatItem: SessionChatItem
    # Optional attachment references; downstream code treats each entry as a
    # URL (presumably gs:// or http(s) — confirm against the frontend).
    attachments: Optional[List[str]] = None
17 changes: 17 additions & 0 deletions api/src/utils/custom_sources/base_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Literal, Optional, TypedDict, cast

from google.cloud.firestore_v1 import DocumentReference
from google.cloud.firestore_v1.base_query import FieldFilter
from pydantic import BaseModel

from src.services.firestore_sdk import (
Expand Down Expand Up @@ -103,3 +104,19 @@ def _get_custom_sources(self) -> list[CustomSourceModelDict]:

def _delete_custom_source(self, source_id: str):
return self._get_doc_ref(source_id).delete()

def _get_custom_source_by_url(self, url: str):
self._check_document()
try:
session_ref = self._get_collection(self.collection).document(self.doc_id)

query = session_ref.collection(self.sub_collection).where(filter=FieldFilter("url", "==", url))

docs = query.get()

for doc in docs:
if doc.exists:
return cast(CustomSourceModel, self._safe_to_dict(doc.to_dict()))
except Exception as e:
print(f"Error getting custom sources for Session: {self.doc_id}", e)
return None
17 changes: 14 additions & 3 deletions api/src/utils/custom_sources/extract_url_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from bs4 import BeautifulSoup, Tag
from pydantic import BaseModel

from src.utils.decorators import process_time
from src.services.storage import StorageManager
from src.utils.decorators.base import process_time

from .base_utils import SourceContent
from .read_content import ReadContent
Expand Down Expand Up @@ -42,14 +43,24 @@ def _extract_html(self, content: bytes) -> tuple[str, dict]:

return self._clean_text(text_content), metadata

def _resolve_gcs_url(self, url) -> str:
if url.startswith("gs://"):
storage_manager = StorageManager()
blobame = storage_manager.get_blobname_from_url(url)
return storage_manager.get_signed_url(blobame)

return url

@process_time()
def _extract(self, url: str) -> SourceContent:
parsed_url = urlparse(url)
resolved_url = self._resolve_gcs_url(url)

parsed_url = urlparse(resolved_url)
if not parsed_url.scheme or not parsed_url.netloc:
raise ValueError("Invalid URL provided")

try:
response = httpx.get(url)
response = httpx.get(resolved_url)
response.raise_for_status()
content_type = response.headers.get("content-type", "").lower()

Expand Down
11 changes: 9 additions & 2 deletions api/src/utils/custom_sources/generate_url_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ class DeleteCustomSourcesRequest(BaseModel):
sourceId: str


def generate_custom_source(request: GenerateCustomSourceRequest, background_tasks: BackgroundTasks):
def generate_custom_source(
request: GenerateCustomSourceRequest,
background_tasks: BackgroundTasks | None = None,
):
extractor = ExtractURLContent()
content = extractor._extract(request.url)

Expand All @@ -33,5 +36,9 @@ def save_to_firestore():
manager = CustomSourceManager(request.sessionId)
manager._set_custom_source(custom_source)

background_tasks.add_task(save_to_firestore)
if background_tasks:
background_tasks.add_task(save_to_firestore)
else:
save_to_firestore()

return content.model_dump()
47 changes: 47 additions & 0 deletions api/src/utils/custom_sources/manage_attachments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import asyncio

from src.utils.decorators.base import use_cache_manager
from src.utils.make_seed import get_hash
from src.utils.session_manager import SessionManager
from src.utils.summarize_custom_sources import summarize_custom_sources

from .base_utils import CustomSourceManager
from .generate_url_source import GenerateCustomSourceRequest, generate_custom_source


class ManageAttachments:
    """Summarize and persist the attachments of one chat session."""

    def __init__(self, session_id: str):
        self.session_id = session_id

    async def get_attachments_summary(self, db: SessionManager, attachments: list[str] | None):
        """
        Summarize the attachments and record the summary on the session.

        Returns None when there are no attachments. Otherwise the summary is
        produced through a handler wrapped in `use_cache_manager`, keyed on a
        hash of the case-insensitively sorted attachment list, so the same
        set of attachments maps to the same key regardless of order.
        """
        if not attachments:
            return None

        # Sort a copy: the caller's list (the request payload) is left untouched.
        ordered = sorted(attachments, key=str.lower)

        @use_cache_manager(get_hash(ordered))
        async def handler():
            summary = await summarize_custom_sources(ordered)
            db._update_source(summary)
            return summary

        return await handler()

    async def store_attachments(self, attachments: list[str]):
        """
        Store attachments as custom sources of type links

        URLs that already have a custom source in this session are skipped.
        Failures for individual URLs are printed and do not stop the others;
        the method always returns True.
        """
        cs_manager = CustomSourceManager(self.session_id)

        def _store(url: str):
            if cs_manager._get_custom_source_by_url(url):
                return None
            request = GenerateCustomSourceRequest(url=url, sessionId=self.session_id)
            return generate_custom_source(request)

        # Drop duplicate URLs (order kept) so parallel workers cannot both
        # create a source for the same URL.
        unique_urls = list(dict.fromkeys(attachments))

        # The lookup and generation calls are synchronous; run them in worker
        # threads so they overlap and do not block the event loop.
        results = await asyncio.gather(
            *(asyncio.to_thread(_store, url) for url in unique_urls),
            return_exceptions=True,
        )

        for url, result in zip(unique_urls, results):
            if isinstance(result, BaseException):
                print(f"Error storing attachment {url} for Session: {self.session_id}", result)

        return True
Loading
Loading