-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwiki_main_picture_download.py
78 lines (69 loc) · 3.02 KB
/
wiki_main_picture_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import asyncio
from collections.abc import Sequence
import logging
from pathlib import Path
import aiofiles
import httpx
from bs4 import BeautifulSoup, element
def get_wiki_origin_picture_url(img: "element.Tag") -> str:
    """Return the full-resolution image URL for a Wikimedia thumbnail tag.

    Args:
        img: The ``<img>`` tag (anything supporting ``img["src"]``).

    Returns:
        The original (non-thumbnail) image URL. A ``src`` that is not a
        Commons thumbnail URL is returned unchanged.
    """
    url = img["src"]
    thumb_prefix = "https://upload.wikimedia.org/wikipedia/commons/thumb/"
    # Thumb URLs look like .../commons/thumb/a/ab/File.jpg/220px-File.jpg:
    # drop the "thumb/" path segment and the trailing size component.
    # Guard on the prefix so non-thumbnail URLs don't lose their filename
    # (the old unconditional rfind-slice truncated them).
    if url.startswith(thumb_prefix):
        url = url.replace(
            thumb_prefix,
            "https://upload.wikimedia.org/wikipedia/commons/",
            1,
        )
        url = url[: url.rfind("/")]
    return url
async def download_wiki_main_picture(
    url: str, client: httpx.AsyncClient
) -> tuple[bytes, bytes]:
    """Fetch a Wikimedia Commons page and download its main picture.

    Follows a category soft-redirect if present, then looks for an image
    first in the "wdinfobox" table and, failing that, in the category
    media section.

    Args:
        url: Page URL; ``http://`` is upgraded to ``https://``.
        client: Shared async HTTP client used for all requests.

    Returns:
        A ``(page_html_bytes, image_bytes)`` tuple. The HTML is that of
        the final page fetched (the redirect target, if one was followed).

    Raises:
        ValueError: If no image can be located on the page.
        httpx.HTTPStatusError: If any HTTP request returns an error status.
    """
    https_url = url.replace("http://", "https://")
    html_response = await client.get(url=https_url)
    html_response.raise_for_status()
    soup = BeautifulSoup(html_response.text, "html.parser")
    # Category pages may carry a soft redirect; follow its first link
    # and re-parse the target page.
    if (
        alt_page := soup.find("div", {"class": "category-redirect-header"})
    ) is not None:
        # NOTE(review): hrefs usually start with "/", producing
        # "commons.wikimedia.org//wiki/..." — servers tolerate the double
        # slash, but consider urljoin; confirm against real redirects.
        alt_page_url = "https://commons.wikimedia.org/" + alt_page.a["href"]
        html_response = await client.get(url=alt_page_url)
        html_response.raise_for_status()
        soup = BeautifulSoup(html_response.text, "html.parser")
    # Preferred source: the first <img> inside the wdinfobox table.
    # NOTE(review): tab.tbody.tr.td assumes that exact nesting exists —
    # a malformed table would raise AttributeError rather than fall through.
    tab = soup.body.find("table", {"id": "wdinfobox"})
    img = None
    if tab is not None and (tab_img := tab.tbody.tr.td.img) is not None:
        img = tab_img
    # Fallback: first image in the category media gallery.
    elif (media := soup.body.find("div", {"id": "mw-category-media"})) is not None and (
        media_img := media.find("img")
    ) is not None:
        img = media_img
    else:
        raise ValueError(f"No image found in {url}")
    # Resolve the thumbnail to the full-resolution original and fetch it.
    img_url = get_wiki_origin_picture_url(img)
    img_response = await client.get(img_url)
    img_response.raise_for_status()
    return html_response.content, img_response.content
async def download_images(id_urls: Sequence[tuple[str, str]], dir: Path) -> None:
    """Download the main picture for each (landmark_id, url) pair.

    Saves each page's HTML under ``dir/html`` and its image under
    ``dir/img``. Failures never abort the batch: they are logged and a
    per-landmark report is written to ``dir/error/<id>.log``.

    Args:
        id_urls: Pairs of (landmark id, Wikimedia Commons page URL).
        dir: Output root directory; subdirectories are created as needed.
    """
    dir_error = dir / "error"
    dir_html = dir / "html"
    dir_img = dir / "img"
    dir_html.mkdir(exist_ok=True, parents=True)
    dir_img.mkdir(exist_ok=True, parents=True)
    dir_error.mkdir(exist_ok=True, parents=True)
    async with httpx.AsyncClient(http2=True) as client:

        async def download_image(landmark_id: str, url: str) -> None:
            # One landmark: fetch, then persist HTML and image side by side.
            try:
                html, img = await download_wiki_main_picture(url, client)
                # Bug fix: write into the dedicated html/img subdirectories.
                # They were created above but the files previously landed in
                # the root `dir`, leaving both directories empty.
                async with aiofiles.open(dir_html / f"{landmark_id}.html", "wb") as f:
                    await f.write(html)
                async with aiofiles.open(dir_img / f"{landmark_id}.jpg", "wb") as f:
                    await f.write(img)
                logging.info(f"Downloaded {landmark_id}")
            except Exception as e:
                # Best-effort batch: record the failure and let other
                # tasks continue.
                logging.error(f"Error downloading {landmark_id}: {e}")
                async with aiofiles.open(dir_error / f"{landmark_id}.log", "w") as f:
                    await f.write(f"{landmark_id}: {url}\n\n{e}")

        async with asyncio.TaskGroup() as tg:
            for landmark_id, url in id_urls:
                tg.create_task(download_image(str(landmark_id).strip(), url))
                # Throttle task creation so requests are spaced out.
                await asyncio.sleep(0.5)
    logging.info("Finished downloading images")