diff --git a/api/src/utils/extract_url_content.py b/api/src/utils/extract_url_content.py index 02c97b4..087e3cc 100644 --- a/api/src/utils/extract_url_content.py +++ b/api/src/utils/extract_url_content.py @@ -1,4 +1,3 @@ -import re from io import BytesIO from urllib.parse import urlparse from uuid import uuid4 @@ -40,22 +39,28 @@ async def __aexit__(self): return await self.__exit__() def _clean_text(self, text: str) -> str: - """replace all whitespace sequences with a single space""" - return re.compile(r"\s+").sub(" ", text).strip() + """TODO: write text cleaning logic""" + return text.strip() async def _extract_pdf(self, content: bytes) -> tuple[str, dict]: pdf_reader = PdfReader(BytesIO(content)) - text_content = " ".join(page.extract_text() for page in pdf_reader.pages) - metadata = {"pages": len(pdf_reader.pages), "info": pdf_reader.metadata or {}} - return self._clean_text(text_content), metadata + pages: list[str] = [] + for page in pdf_reader.pages: + text = page.extract_text() + # Split into paragraphs and clean + paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()] + pages.append("\n\n".join(paragraphs)) + + metadata = {**(pdf_reader.metadata or {}), "pages": pdf_reader.get_num_pages()} + return self._clean_text("\n\n".join(pages)), metadata async def _extract_html(self, content: bytes) -> tuple[str, dict]: soup = BeautifulSoup(content, "lxml") for element in soup(["script", "style", "nav", "footer"]): element.decompose() - text_content = soup.get_text(separator=" ", strip=True) + text_content = soup.get_text(separator="\n\n", strip=True) descr_tag = soup.find("meta", {"name": "description"}) metadata = { "title": soup.title.string if soup.title else "", diff --git a/app/src/lib/components/custom-source/CustomSources.svelte b/app/src/lib/components/custom-source/CustomSources.svelte index 4fd60e9..ad908b4 100644 --- a/app/src/lib/components/custom-source/CustomSources.svelte +++ b/app/src/lib/components/custom-source/CustomSources.svelte @@ -1,14 +1,14 @@ @@ -28,12 +28,10 @@ -
- {#if source.content_type === 'application/pdf'} - - {:else} - {source.content} - {/if} +
+
diff --git a/app/src/lib/components/custom-source/RenderAudioSource.svelte b/app/src/lib/components/custom-source/RenderAudioSource.svelte index d1ea5e0..5a523b2 100644 --- a/app/src/lib/components/custom-source/RenderAudioSource.svelte +++ b/app/src/lib/components/custom-source/RenderAudioSource.svelte @@ -13,7 +13,7 @@ const { addSource } = getCustomSources(); - let snapPoints = [0.75, 0.9]; + let snapPoints = [0.75, 0.95]; let activeSnapPoint = snapPoints[0]; let fetchingSource = false; @@ -101,7 +101,7 @@ AI-generated Source
{#await parse(audioSource) then parsedContent} {@html parsedContent} diff --git a/app/src/lib/components/custom-source/RenderPDFContent.svelte b/app/src/lib/components/custom-source/RenderPDFContent.svelte deleted file mode 100644 index 172b25f..0000000 --- a/app/src/lib/components/custom-source/RenderPDFContent.svelte +++ /dev/null @@ -1,10 +0,0 @@ - - -
- {#each paragraphs as paragraph, idx (idx)} - {paragraph} - {/each} -
diff --git a/app/src/lib/components/custom-source/RenderWebContent.svelte b/app/src/lib/components/custom-source/RenderWebContent.svelte new file mode 100644 index 0000000..7b88fe6 --- /dev/null +++ b/app/src/lib/components/custom-source/RenderWebContent.svelte @@ -0,0 +1,20 @@ + + +
+ {#each paragraphs as paragraph, idx (idx)} +
+ {paragraph} +
+ {/each} +
+ +