Skip to content

Commit

Permalink
store file uploads as plain text when preserve is not required
Browse files Browse the repository at this point in the history
  • Loading branch information
nwaughachukwuma committed Dec 7, 2024
1 parent 596ea23 commit dbcc7e8
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 10 deletions.
21 changes: 14 additions & 7 deletions api/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
generate_custom_source,
)
from .utils.custom_sources.manage_attachments import ManageAttachments
from .utils.custom_sources.read_content import ReadContent
from .utils.custom_sources.save_copied_source import CopiedPasteSourceRequest, save_copied_source
from .utils.custom_sources.save_uploaded_sources import UploadedFiles
from .utils.decorators.retry_decorator import RetryConfig, retry
Expand Down Expand Up @@ -142,7 +143,7 @@ async def get_signed_url_endpoint(blobname: str):
"""

@retry(RetryConfig(max_retries=3, delay=5, backoff=1.5))
def handler():
def handler() -> str | None:
return StorageManager().get_signed_url(blobname=blobname)

url = handler()
Expand Down Expand Up @@ -226,24 +227,30 @@ async def detect_category_endpoint(request: DetectContentCategoryRequest):


@app.post("/store-file-upload", response_model=str)
async def store_file_upload(file: UploadFile, filename: str = Form(...)):
async def store_file_upload(file: UploadFile, filename: str = Form(...), preserve: bool = Form(False)):
"""
Store file uploaded from the frontend
"""
storage_manager = StorageManager()

file_obj = BytesIO(await file.read())
print(f"Storing file: {filename}. Preserve: {preserve}")

storage_manager = StorageManager()
file_exists = storage_manager.check_blob_exists(filename)
if file_exists:
return storage_manager.get_gcs_url(filename)

file_content = await ReadContent()._read_file(file, preserve=preserve)
content_type = (
file.content_type or "application/octet-stream"
if preserve or isinstance(file_content, BytesIO)
else "text/plain"
)

result = storage_manager.upload_to_gcs(
item=file_obj,
item=file_content,
blobname=f"{BLOB_BASE_URI}/{filename}",
params=UploadItemParams(
cache_control="public, max-age=31536000",
content_type=file.content_type or "application/octet-stream",
content_type=content_type,
),
)

Expand Down
16 changes: 16 additions & 0 deletions api/src/utils/custom_sources/read_content.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from io import BytesIO

from fastapi import UploadFile
from pypdf import PdfReader


Expand All @@ -19,3 +20,18 @@ def _read_pdf(self, content: bytes) -> tuple[str, PdfReader]:

def _read_txt(self, content: bytes) -> str:
    """
    Decode the raw bytes of a text file into a string.

    Args:
        content: The file's bytes, expected to be UTF-8 encoded.

    Returns:
        The decoded text. A leading UTF-8 byte-order mark, if present, is
        dropped instead of surfacing as a stray "\ufeff" character.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    # "utf-8-sig" decodes exactly like "utf-8" except that it skips a leading BOM.
    return content.decode("utf-8-sig")

async def _read_file(self, file: UploadFile, preserve: bool) -> str | BytesIO:
    """
    Read an uploaded file and return its extracted text or its raw bytes.

    Args:
        file: The upload to read; its entire content is read in one call.
        preserve: When true, skip text extraction and return the bytes untouched.

    Returns:
        A `str` with the extracted text for "application/pdf" and "text/plain"
        uploads; otherwise a `BytesIO` wrapping the original bytes. A `BytesIO`
        is always returned when `preserve` is true.
    """
    file_bytes = await file.read()

    if preserve:
        return BytesIO(file_bytes)

    if file.content_type == "application/pdf":
        text_content, _ = self._read_pdf(file_bytes)
        return text_content

    if file.content_type == "text/plain":
        try:
            return self._read_txt(file_bytes)
        except UnicodeDecodeError:
            # A "text/plain" label does not guarantee decodable bytes. Hand the
            # upload back unchanged, the same way unrecognised content types
            # are handled below, instead of raising.
            return BytesIO(file_bytes)

    # Any other content type is passed through as raw bytes.
    return BytesIO(file_bytes)
6 changes: 3 additions & 3 deletions api/src/utils/decorators/retry_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@
from dataclasses import dataclass
from functools import wraps
from time import sleep
from typing import Any
from typing import Any, Optional


@dataclass
class RetryConfig:
max_retries: int = 3
delay: float = 1.0
backoff: float | None = None
backoff: Optional[float] = None


def retry(retry_config: RetryConfig | None, default_return: Any = None) -> Any:
def retry(retry_config: RetryConfig | None, default_return: Any = None):
"""
Retry logic for async functions with exponential backoff.
"""
Expand Down

0 comments on commit dbcc7e8

Please sign in to comment.