Skip to content

Commit

Permalink
Merge pull request #373 from Lyrete/image-dl
Browse files Browse the repository at this point in the history
feat: Saving image previews
  • Loading branch information
VeckoTheGecko committed Jan 1, 2024
2 parents 5877467 + ad69ab2 commit 2a58326
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 22 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ data/channel_data.json
# Exclude script-generated files
scripts/onboarding_pr_message.md

# Exclude OG previews (build process)
static/previews

# Svelte
.DS_Store
node_modules
Expand Down
2 changes: 2 additions & 0 deletions requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,5 @@ python-dotenv
PyYAML
tqdm
validators
httpx
Pillow
33 changes: 22 additions & 11 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile
Expand All @@ -8,6 +8,8 @@ aiohttp==3.8.4
# via -r requirements.in
aiosignal==1.3.1
# via aiohttp
anyio==4.2.0
# via httpx
async-timeout==4.0.2
# via aiohttp
attrs==22.2.0
Expand All @@ -24,7 +26,10 @@ build==0.10.0
cachetools==5.2.0
# via google-auth
certifi==2022.12.7
# via requests
# via
# httpcore
# httpx
# requests
cfgv==3.3.1
# via pre-commit
charset-normalizer==2.1.1
Expand All @@ -39,8 +44,6 @@ decorator==5.1.1
# via validators
distlib==0.3.6
# via virtualenv
exceptiongroup==1.1.0
# via pytest
filelock==3.9.0
# via virtualenv
frozenlist==1.3.3
Expand All @@ -60,14 +63,22 @@ google-auth-httplib2==0.1.0
# via google-api-python-client
googleapis-common-protos==1.57.0
# via google-api-core
h11==0.14.0
# via httpcore
httpcore==1.0.2
# via httpx
httplib2==0.21.0
# via
# google-api-python-client
# google-auth-httplib2
httpx==0.26.0
# via -r requirements.in
identify==2.5.17
# via pre-commit
idna==3.4
# via
# anyio
# httpx
# requests
# yarl
iniconfig==2.0.0
Expand Down Expand Up @@ -96,7 +107,9 @@ pandas==1.5.3
# via -r requirements.in
pathspec==0.10.3
# via black
pip-tools==6.12.2
pillow==10.1.0
# via -r requirements.in
pip-tools==6.13.0
# via -r requirements.in
platformdirs==2.6.0
# via
Expand Down Expand Up @@ -143,14 +156,12 @@ six==1.16.0
# google-auth
# google-auth-httplib2
# python-dateutil
sniffio==1.3.0
# via
# anyio
# httpx
soupsieve==2.3.2.post1
# via beautifulsoup4
tomli==2.0.1
# via
# black
# build
# pyproject-hooks
# pytest
tqdm==4.64.1
# via -r requirements.in
uritemplate==4.1.1
Expand Down
55 changes: 46 additions & 9 deletions scripts/get_og_previews.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import hashlib
from io import BytesIO
from pathlib import Path
from urllib.request import Request, urlopen

Expand All @@ -6,8 +8,11 @@
from loguru import logger
from tqdm import tqdm
import validators
import httpx
from PIL import Image

resources_file = Path("data") / "resources.yml"
RESOURCES_FILE = Path("data") / "resources.yml"
PREVIEW_PATH = Path("static") / "previews"


# Somewhat copying the homework from this article
Expand Down Expand Up @@ -55,29 +60,61 @@ def get_og_preview(url):
# TODO: Maybe add additional behaviour to check if the image is large enough
soup = get_page(url)
image_url = get_og_image(soup)

return image_url


def save_image_as_webp(binary_image: bytes, path: Path, file_stem: str) -> Path:
    """Decode raw image bytes and write them to ``path`` as a WebP file.

    Args:
        binary_image: Encoded image data in any format Pillow can open.
        path: Directory the file is written into.
        file_stem: File name without extension; ``.webp`` is appended.

    Returns:
        The path of the file that was written.
    """
    destination = path / f"{file_stem}.webp"
    # Wrap the bytes in a file-like object so Pillow can read them.
    Image.open(BytesIO(binary_image)).save(destination, format="webp")
    return destination


def write_image_to_file(url: str, folder_path: Path) -> Path | None:
    """Download the image at ``url`` and store it as WebP in ``folder_path``.

    The file name is a short hash of the URL, so the same URL always maps to
    the same file name.

    Args:
        url: Address of the image to download.
        folder_path: Directory the converted image is written into. It is
            not created here.

    Returns:
        The path of the written ``.webp`` file, or ``None`` when the image
        could not be downloaded or decoded. Failures are logged instead of
        raised so that a single bad URL does not abort the caller.
    """
    file_stem = hashlib.shake_128(url.encode("utf-8")).hexdigest(4)

    try:
        # Add Mozilla header to prevent getting blocked for scraping
        r = httpx.get(
            url, headers={"User-agent": "Mozilla/5.0"}, follow_redirects=True
        )
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # Timeouts, connection failures, redirect loops, malformed URLs, ...
        logger.error(f"Request for {url} failed: {e}")
        return None

    # Site had no actual image in their og_image url, so no point saving it
    if r.status_code != 200:
        logger.error(f"Couldn't find any image at {url}")
        return None

    try:
        return save_image_as_webp(r.content, folder_path, file_stem)
    except OSError as e:
        # A 200 response is not necessarily an image (e.g. an HTML error
        # page). Pillow reports undecodable data with an OSError subclass,
        # and failed writes are OSError too.
        logger.error(f"Couldn't save image from {url}: {e}")
        return None


# Entry point of the script: reads `resources.yml`, looks up the OG preview
# image of every resource, and writes the updated resource list back to the
# same file.
# NOTE(review): this view shows removed and added lines of the commit side by
# side without +/- markers (e.g. `resources_file` vs `RESOURCES_FILE`,
# `image` vs `image_url`). The comments below describe the
# `RESOURCES_FILE` / `image_url` variant.
def main():
with resources_file.open() as f:
# Ensure the path for our previews actually exists
PREVIEW_PATH.mkdir(parents=True, exist_ok=True)

with RESOURCES_FILE.open() as f:
resources = yaml.safe_load(f)
logger.success("Read in `resources.yml` file.")

for resource in resources:
logger.info(f"Getting OG preview for {resource['url']}")

# A failed lookup is logged and then treated like "no preview found".
try:
image = get_og_preview(resource["url"])
image_url = get_og_preview(resource["url"])
except Exception as e:
logger.error(e)
image = None
image_url = None

if image_url is None:
continue

# Skip if URL is not valid format
if not validators.url(image_url):
continue

if image:
# Check if image is valid URL
if validators.url(image):
resource["og_preview"] = image
# Download the image into PREVIEW_PATH; a `None` result means nothing was
# saved, so the resource is left without an `og_preview` entry.
file_path = write_image_to_file(image_url, PREVIEW_PATH)
if file_path is None:
continue
# Only the file name (not the full path) is stored in the resource entry.
resource["og_preview"] = file_path.name

# Persist the updated resource list back to the YAML file it was read from.
with resources_file.open("w") as f:
with RESOURCES_FILE.open("w") as f:
yaml.dump(resources, f)

logger.success("Wrote OG previews to `resources.yml` file.")
Expand Down
2 changes: 1 addition & 1 deletion src/routes/resources/+page.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
let displayedResources = resources
let filterByTags: Resource[]
let tagLogicAnd: boolean = true // Whether all the selected tags must match the resource (vs any of the selected tags)
// TODO: make this a user preference
let tagLogic: FilterLogic
$: tagLogic = tagLogicAnd ? "and" : "or"
Expand Down
2 changes: 1 addition & 1 deletion src/routes/resources/ListItem.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
loading="lazy"
class="object-cover rounded-t-lg object-center h-48 w-full"
alt="Website preview"
src={resource.og_preview}
src="{base}/previews/{resource.og_preview}"
/>
{:else}
<div
Expand Down

0 comments on commit 2a58326

Please sign in to comment.