# Patch summary (text ahead of the first "diff --git" header is commentary).
#
# api/src/main.py
#   * Imports ReadContent.
#   * Annotates the nested `handler` in get_signed_url_endpoint as
#     returning `str | None`.
#   * POST /store-file-upload gains a `preserve` form field (default False)
#     and prints the filename and the flag.
#   * The upload body is no longer read up front; it is read through
#     `ReadContent()._read_file(file, preserve=preserve)` only after
#     `check_blob_exists(filename)` comes back falsy.
#   * The uploaded content type is `file.content_type`, falling back to
#     "application/octet-stream", when `preserve` is set or the content is
#     a BytesIO; otherwise it is "text/plain". The expression parses as
#     `(A or B) if cond else C`.
#
# api/src/utils/custom_sources/read_content.py
#   * Imports UploadFile and adds the async `_read_file(file, preserve)`:
#     it awaits `file.read()`, returns the bytes wrapped in BytesIO when
#     `preserve` is true, returns extracted text for "application/pdf"
#     (first element of `_read_pdf`) and "text/plain" (`_read_txt`), and
#     returns BytesIO for every other content type.
#
# api/src/utils/decorators/retry_decorator.py
#   * Imports Optional, changes `backoff` from `float | None` to
#     `Optional[float]`, and drops the `-> Any` annotation from `retry`.
#
# NOTE(review): `_read_file` can return `str`, so `upload_to_gcs(item=...)`
#   now receives `str` on the PDF/plain-text path - confirm it accepts that.
# NOTE(review): the existence check uses `filename` while the upload uses
#   f"{BLOB_BASE_URI}/{filename}" - verify both name the same blob.
# NOTE(review): when the blob already exists the endpoint returns early, so
#   `preserve` has no effect on that path - confirm this is intended.
# NOTE(review): `_read_file` is underscore-prefixed but is called from
#   main.py - confirm the intended visibility.
# NOTE(review): `backoff` moves to `Optional[float]` while `retry` keeps
#   `RetryConfig | None` - confirm which spelling the project wants.
# NOTE(review): context-line indentation below assumes 4-space Python
#   indentation - confirm against the target tree before applying.
diff --git a/api/src/main.py b/api/src/main.py
index bae9123..a2a4187 100644
--- a/api/src/main.py
+++ b/api/src/main.py
@@ -25,6 +25,7 @@
     generate_custom_source,
 )
 from .utils.custom_sources.manage_attachments import ManageAttachments
+from .utils.custom_sources.read_content import ReadContent
 from .utils.custom_sources.save_copied_source import CopiedPasteSourceRequest, save_copied_source
 from .utils.custom_sources.save_uploaded_sources import UploadedFiles
 from .utils.decorators.retry_decorator import RetryConfig, retry
@@ -142,7 +143,7 @@ async def get_signed_url_endpoint(blobname: str):
     """
 
     @retry(RetryConfig(max_retries=3, delay=5, backoff=1.5))
-    def handler():
+    def handler() -> str | None:
         return StorageManager().get_signed_url(blobname=blobname)
 
     url = handler()
@@ -226,24 +227,30 @@ async def detect_category_endpoint(request: DetectContentCategoryRequest):
 
 
 @app.post("/store-file-upload", response_model=str)
-async def store_file_upload(file: UploadFile, filename: str = Form(...)):
+async def store_file_upload(file: UploadFile, filename: str = Form(...), preserve: bool = Form(False)):
     """
     Store file uploaded from the frontend
     """
-    storage_manager = StorageManager()
-
-    file_obj = BytesIO(await file.read())
+    print(f"Storing file: {filename}. Preserve: {preserve}")
 
+    storage_manager = StorageManager()
     file_exists = storage_manager.check_blob_exists(filename)
     if file_exists:
         return storage_manager.get_gcs_url(filename)
 
+    file_content = await ReadContent()._read_file(file, preserve=preserve)
+    content_type = (
+        file.content_type or "application/octet-stream"
+        if preserve or isinstance(file_content, BytesIO)
+        else "text/plain"
+    )
+
     result = storage_manager.upload_to_gcs(
-        item=file_obj,
+        item=file_content,
         blobname=f"{BLOB_BASE_URI}/{filename}",
         params=UploadItemParams(
             cache_control="public, max-age=31536000",
-            content_type=file.content_type or "application/octet-stream",
+            content_type=content_type,
         ),
     )
 
diff --git a/api/src/utils/custom_sources/read_content.py b/api/src/utils/custom_sources/read_content.py
index 295fb9d..8065bc8 100644
--- a/api/src/utils/custom_sources/read_content.py
+++ b/api/src/utils/custom_sources/read_content.py
@@ -1,5 +1,6 @@
 from io import BytesIO
 
+from fastapi import UploadFile
 from pypdf import PdfReader
 
 
@@ -19,3 +20,18 @@ def _read_pdf(self, content: bytes) -> tuple[str, PdfReader]:
 
     def _read_txt(self, content: bytes) -> str:
         return content.decode()
+
+    async def _read_file(self, file: UploadFile, preserve: bool):
+        file_bytes = await file.read()
+
+        if preserve:
+            return BytesIO(file_bytes)
+
+        if file.content_type == "application/pdf":
+            text_content, _ = self._read_pdf(file_bytes)
+        elif file.content_type == "text/plain":
+            text_content = self._read_txt(file_bytes)
+        else:
+            return BytesIO(file_bytes)
+
+        return text_content
diff --git a/api/src/utils/decorators/retry_decorator.py b/api/src/utils/decorators/retry_decorator.py
index b48eb82..25b0016 100644
--- a/api/src/utils/decorators/retry_decorator.py
+++ b/api/src/utils/decorators/retry_decorator.py
@@ -2,17 +2,17 @@
 from dataclasses import dataclass
 from functools import wraps
 from time import sleep
-from typing import Any
+from typing import Any, Optional
 
 
 @dataclass
 class RetryConfig:
     max_retries: int = 3
     delay: float = 1.0
-    backoff: float | None = None
+    backoff: Optional[float] = None
 
 
-def retry(retry_config: RetryConfig | None, default_return: Any = None) -> Any:
+def retry(retry_config: RetryConfig | None, default_return: Any = None):
     """
     Retry logic for async functions with exponential backoff.
     """