Skip to content

Commit

Permalink
retain formatting in web and pdf content; improve page content rendering
Browse files Browse the repository at this point in the history
  • Loading branch information
nwaughachukwuma committed Nov 21, 2024
1 parent d961077 commit 4dce53e
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 27 deletions.
19 changes: 12 additions & 7 deletions api/src/utils/extract_url_content.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import re
from io import BytesIO
from urllib.parse import urlparse
from uuid import uuid4
@@ -40,22 +39,28 @@ async def __aexit__(self):
return await self.__exit__()

def _clean_text(self, text: str) -> str:
"""replace all whitespace sequences with a single space"""
return re.compile(r"\s+").sub(" ", text).strip()
"""TODO: write text cleaning logic"""
return text.strip()

async def _extract_pdf(self, content: bytes) -> tuple[str, dict]:
pdf_reader = PdfReader(BytesIO(content))
text_content = " ".join(page.extract_text() for page in pdf_reader.pages)
metadata = {"pages": len(pdf_reader.pages), "info": pdf_reader.metadata or {}}

return self._clean_text(text_content), metadata
pages: list[str] = []
for page in pdf_reader.pages:
text = page.extract_text()
# Split into paragraphs and clean
paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
pages.append("\n\n".join(paragraphs))

metadata = {**(pdf_reader.metadata or {}), "pages": pdf_reader.get_num_pages()}
return self._clean_text("\n\n".join(pages)), metadata

async def _extract_html(self, content: bytes) -> tuple[str, dict]:
soup = BeautifulSoup(content, "lxml")
for element in soup(["script", "style", "nav", "footer"]):
element.decompose()

text_content = soup.get_text(separator=" ", strip=True)
text_content = soup.get_text(separator="\n\n", strip=True)
descr_tag = soup.find("meta", {"name": "description"})
metadata = {
"title": soup.title.string if soup.title else "",
Expand Down
14 changes: 6 additions & 8 deletions app/src/lib/components/custom-source/CustomSources.svelte
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
<script lang="ts">
import { getCustomSources } from '@/stores/customSources.svelte';
import * as Accordion from '../ui/accordion';
import RenderPdfContent from './RenderPDFContent.svelte';
import RenderWebContent from './RenderWebContent.svelte';
const { sources$ } = getCustomSources();
$: sources = $sources$;
function truncate(str: string, n: number) {
return str.length > n ? str.substr(0, n - 1) + '...' : str;
return str.length > n ? str.substring(0, n - 1) + '...' : str;
}
</script>

@@ -28,12 +28,10 @@
</div>
</Accordion.Trigger>
<Accordion.Content>
<div class="flex w-full flex-col gap-y-3 p-2 bg-gray-900/70 text-gray-300">
{#if source.content_type === 'application/pdf'}
<RenderPdfContent content={source.content} />
{:else}
{source.content}
{/if}
<div
class="flex w-full max-h-96 overflow-y-auto flex-col gap-y-3 p-2 bg-gray-900/70 text-gray-300"
>
<RenderWebContent content={source.content} />
</div>
</Accordion.Content>
</Accordion.Item>
Expand Down
4 changes: 2 additions & 2 deletions app/src/lib/components/custom-source/RenderAudioSource.svelte
Original file line number Diff line number Diff line change
@@ -13,7 +13,7 @@
const { addSource } = getCustomSources();
let snapPoints = [0.75, 0.9];
let snapPoints = [0.75, 0.95];
let activeSnapPoint = snapPoints[0];
let fetchingSource = false;
@@ -101,7 +101,7 @@
<Accordion.Trigger>AI-generated Source</Accordion.Trigger>
<Accordion.Content>
<article
class="prose text-gray-300 flex p-2 flex-col gap-y-3 bg-gray-900/70 text-gray-30"
class="prose max-h-96 overflow-y-auto text-gray-300 flex p-2 flex-col gap-y-3 bg-gray-900/70 text-gray-30"
>
{#await parse(audioSource) then parsedContent}
{@html parsedContent}
Expand Down
10 changes: 0 additions & 10 deletions app/src/lib/components/custom-source/RenderPDFContent.svelte

This file was deleted.

20 changes: 20 additions & 0 deletions app/src/lib/components/custom-source/RenderWebContent.svelte
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<script lang="ts">
export let content: string;
$: paragraphs = content.split('\n\n').filter((p) => p.trim());
</script>

<section class="w-full flex flex-col gap-y-3">
{#each paragraphs as paragraph, idx (idx)}
<article class="pb-2 border-b leading-relaxed border-b-gray-500/10">
{paragraph}
</article>
{/each}
</section>

<style>
article :global(code) {
background-color: rgba(62, 3, 3, 0.1);
padding: 0.25rem 0.5rem;
border-radius: 0.25rem;
}
</style>

0 comments on commit 4dce53e

Please sign in to comment.