Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Saving image previews #373

Merged
merged 18 commits into from
Jan 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ data/channel_data.json
# Exclude script-generated files
scripts/onboarding_pr_message.md

# Exclude OG previews (build process)
static/previews

# Svelte
.DS_Store
node_modules
Expand Down
2 changes: 2 additions & 0 deletions requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,5 @@ python-dotenv
PyYAML
tqdm
validators
httpx
Pillow
33 changes: 22 additions & 11 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile
Expand All @@ -8,6 +8,8 @@ aiohttp==3.8.4
# via -r requirements.in
aiosignal==1.3.1
# via aiohttp
anyio==4.2.0
# via httpx
async-timeout==4.0.2
# via aiohttp
attrs==22.2.0
Expand All @@ -24,7 +26,10 @@ build==0.10.0
cachetools==5.2.0
# via google-auth
certifi==2022.12.7
# via requests
# via
# httpcore
# httpx
# requests
cfgv==3.3.1
# via pre-commit
charset-normalizer==2.1.1
Expand All @@ -39,8 +44,6 @@ decorator==5.1.1
# via validators
distlib==0.3.6
# via virtualenv
exceptiongroup==1.1.0
# via pytest
filelock==3.9.0
# via virtualenv
frozenlist==1.3.3
Expand All @@ -60,14 +63,22 @@ google-auth-httplib2==0.1.0
# via google-api-python-client
googleapis-common-protos==1.57.0
# via google-api-core
h11==0.14.0
# via httpcore
httpcore==1.0.2
# via httpx
httplib2==0.21.0
# via
# google-api-python-client
# google-auth-httplib2
httpx==0.26.0
# via -r requirements.in
identify==2.5.17
# via pre-commit
idna==3.4
# via
# anyio
# httpx
# requests
# yarl
iniconfig==2.0.0
Expand Down Expand Up @@ -96,7 +107,9 @@ pandas==1.5.3
# via -r requirements.in
pathspec==0.10.3
# via black
pip-tools==6.12.2
pillow==10.1.0
# via -r requirements.in
pip-tools==6.13.0
# via -r requirements.in
platformdirs==2.6.0
# via
Expand Down Expand Up @@ -143,14 +156,12 @@ six==1.16.0
# google-auth
# google-auth-httplib2
# python-dateutil
sniffio==1.3.0
# via
# anyio
# httpx
soupsieve==2.3.2.post1
# via beautifulsoup4
tomli==2.0.1
# via
# black
# build
# pyproject-hooks
# pytest
tqdm==4.64.1
# via -r requirements.in
uritemplate==4.1.1
Expand Down
55 changes: 46 additions & 9 deletions scripts/get_og_previews.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import hashlib
from io import BytesIO
from pathlib import Path
from urllib.request import Request, urlopen

Expand All @@ -6,8 +8,11 @@
from loguru import logger
from tqdm import tqdm
import validators
import httpx
from PIL import Image

resources_file = Path("data") / "resources.yml"
RESOURCES_FILE = Path("data") / "resources.yml"
PREVIEW_PATH = Path("static") / "previews"


# Somewhat copying the homework from this article
Expand Down Expand Up @@ -55,29 +60,61 @@ def get_og_preview(url):
# TODO: Maybe add additional behaviour to check if the image is large enough
soup = get_page(url)
image_url = get_og_image(soup)

return image_url


def save_image_as_webp(binary_image: bytes, path: Path, file_stem: str) -> Path:
    """Decode raw image bytes and write them to ``path`` as a WebP file.

    Args:
        binary_image: Encoded image data in any format Pillow can open.
        path: Directory the file is written into (it is not created here).
        file_stem: File name without extension; ``.webp`` is appended.

    Returns:
        The full path of the written ``.webp`` file.

    Raises:
        OSError: If the bytes cannot be identified as an image (Pillow's
            ``UnidentifiedImageError`` is an ``OSError`` subclass) or the
            file cannot be written.
    """
    full_path = path / f"{file_stem}.webp"
    # The context manager releases the decoder's resources once saving is done
    with Image.open(BytesIO(binary_image)) as img:
        img.save(full_path, "webp")
    return full_path


def write_image_to_file(url: str, folder_path: Path) -> Path | None:
    """Download the image at ``url`` and store it as WebP in ``folder_path``.

    The file name is a short hash of the URL, so the same URL always maps
    to the same file.

    Args:
        url: Address of the image to download.
        folder_path: Directory the ``.webp`` file is written into.

    Returns:
        Path of the saved file, or ``None`` if the image could not be
        downloaded, decoded or written.
    """
    # 4 digest bytes -> 8 hex characters, used as a stable file name
    file_stem = hashlib.shake_128(url.encode("utf-8")).hexdigest(4)

    try:
        # Add Mozilla header to prevent getting blocked for scraping
        r = httpx.get(url, headers={"User-agent": "Mozilla/5.0"}, follow_redirects=True)
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # Report network failures the same way as a missing image (None)
        # instead of letting one bad URL raise out of this function
        logger.error(f"Couldn't download image at {url}: {e}")
        return None

    # Site had no actual image in their og_image url, so no point saving it
    if r.status_code != 200:
        logger.error(f"Couldn't find any image at {url}")
        return None

    try:
        return save_image_as_webp(r.content, folder_path, file_stem)
    except OSError as e:
        # Covers Pillow's UnidentifiedImageError (an OSError subclass) for
        # responses that are not images, as well as file write failures
        logger.error(f"Couldn't save image from {url}: {e}")
        return None


def main():
with resources_file.open() as f:
# Ensure the path for our previews actually exists
PREVIEW_PATH.mkdir(parents=True, exist_ok=True)

with RESOURCES_FILE.open() as f:
resources = yaml.safe_load(f)
logger.success("Read in `resources.yml` file.")

for resource in resources:
logger.info(f"Getting OG preview for {resource['url']}")

try:
image = get_og_preview(resource["url"])
image_url = get_og_preview(resource["url"])
except Exception as e:
logger.error(e)
image = None
image_url = None

if image_url is None:
continue

# Skip if URL is not valid format
if not validators.url(image_url):
continue

if image:
# Check if image is valid URL
if validators.url(image):
resource["og_preview"] = image
file_path = write_image_to_file(image_url, PREVIEW_PATH)
if file_path is None:
continue
resource["og_preview"] = file_path.name

with resources_file.open("w") as f:
with RESOURCES_FILE.open("w") as f:
yaml.dump(resources, f)

logger.success("Wrote OG previews to `resources.yml` file.")
Expand Down
2 changes: 1 addition & 1 deletion src/routes/resources/+page.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
let displayedResources = resources
let filterByTags: Resource[]
let tagLogicAnd: boolean = true // Whether all the selected tags must match the resource (vs any of the selected tags)
// TODO: make this a user preference

let tagLogic: FilterLogic
$: tagLogic = tagLogicAnd ? "and" : "or"

Expand Down
2 changes: 1 addition & 1 deletion src/routes/resources/ListItem.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
loading="lazy"
class="object-cover rounded-t-lg object-center h-48 w-full"
alt="Website preview"
src={resource.og_preview}
src="{base}/previews/{resource.og_preview}"
/>
{:else}
<div
Expand Down
Loading