-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwiki_main_picture_download.py
78 lines (69 loc) · 3.02 KB
/
wiki_main_picture_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import asyncio
from collections.abc import Sequence
import logging
from pathlib import Path
import aiofiles
import httpx
from bs4 import BeautifulSoup, element
def get_wiki_origin_picture_url(img: "element.Tag") -> str:
    """Return the full-resolution image URL for a Wikimedia thumbnail tag.

    Args:
        img: The ``<img>`` tag (anything supporting ``img["src"]``).

    Returns:
        The original (non-thumbnail) image URL. A ``src`` that is not a
        Commons thumbnail URL is returned unchanged.
    """
    url = img["src"]
    thumb_prefix = "https://upload.wikimedia.org/wikipedia/commons/thumb/"
    # Thumb URLs look like .../commons/thumb/a/ab/File.jpg/220px-File.jpg:
    # drop the "thumb/" path segment and the trailing size component.
    # Guard on the prefix so non-thumbnail URLs don't lose their filename
    # (the old unconditional rfind-slice truncated them).
    if url.startswith(thumb_prefix):
        url = url.replace(
            thumb_prefix,
            "https://upload.wikimedia.org/wikipedia/commons/",
            1,
        )
        url = url[: url.rfind("/")]
    return url
async def download_wiki_main_picture(
    url: str, client: httpx.AsyncClient
) -> tuple[bytes, bytes]:
    """Fetch a Wikimedia Commons page and download its main picture.

    Follows a category soft-redirect if present, then looks for an image
    first in the "wdinfobox" table and, failing that, in the category
    media section.

    Args:
        url: Page URL; ``http://`` is upgraded to ``https://``.
        client: Shared async HTTP client used for all requests.

    Returns:
        A ``(page_html_bytes, image_bytes)`` tuple. The HTML is that of
        the final page fetched (the redirect target, if one was followed).

    Raises:
        ValueError: If no image can be located on the page.
        httpx.HTTPStatusError: If any HTTP request returns an error status.
    """
    https_url = url.replace("http://", "https://")
    html_response = await client.get(url=https_url)
    html_response.raise_for_status()
    soup = BeautifulSoup(html_response.text, "html.parser")
    # Category pages may carry a soft redirect; follow its first link
    # and re-parse the target page.
    if (
        alt_page := soup.find("div", {"class": "category-redirect-header"})
    ) is not None:
        # NOTE(review): hrefs usually start with "/", producing
        # "commons.wikimedia.org//wiki/..." — servers tolerate the double
        # slash, but consider urljoin; confirm against real redirects.
        alt_page_url = "https://commons.wikimedia.org/" + alt_page.a["href"]
        html_response = await client.get(url=alt_page_url)
        html_response.raise_for_status()
        soup = BeautifulSoup(html_response.text, "html.parser")
    # Preferred source: the first <img> inside the wdinfobox table.
    # NOTE(review): tab.tbody.tr.td assumes that exact nesting exists —
    # a malformed table would raise AttributeError rather than fall through.
    tab = soup.body.find("table", {"id": "wdinfobox"})
    img = None
    if tab is not None and (tab_img := tab.tbody.tr.td.img) is not None:
        img = tab_img
    # Fallback: first image in the category media gallery.
    elif (media := soup.body.find("div", {"id": "mw-category-media"})) is not None and (
        media_img := media.find("img")
    ) is not None:
        img = media_img
    else:
        raise ValueError(f"No image found in {url}")
    # Resolve the thumbnail to the full-resolution original and fetch it.
    img_url = get_wiki_origin_picture_url(img)
    img_response = await client.get(img_url)
    img_response.raise_for_status()
    return html_response.content, img_response.content
async def download_images(id_urls: Sequence[tuple[str, str]], dir: Path) -> None:
    """Download the main picture for each (landmark_id, url) pair.

    Saves each page's HTML under ``dir/html`` and its image under
    ``dir/img``. Failures never abort the batch: they are logged and a
    per-landmark report is written to ``dir/error/<id>.log``.

    Args:
        id_urls: Pairs of (landmark id, Wikimedia Commons page URL).
        dir: Output root directory; subdirectories are created as needed.
    """
    dir_error = dir / "error"
    dir_html = dir / "html"
    dir_img = dir / "img"
    dir_html.mkdir(exist_ok=True, parents=True)
    dir_img.mkdir(exist_ok=True, parents=True)
    dir_error.mkdir(exist_ok=True, parents=True)
    async with httpx.AsyncClient(http2=True) as client:

        async def download_image(landmark_id: str, url: str) -> None:
            # One landmark: fetch, then persist HTML and image side by side.
            try:
                html, img = await download_wiki_main_picture(url, client)
                # Bug fix: write into the dedicated html/img subdirectories.
                # They were created above but the files previously landed in
                # the root `dir`, leaving both directories empty.
                async with aiofiles.open(dir_html / f"{landmark_id}.html", "wb") as f:
                    await f.write(html)
                async with aiofiles.open(dir_img / f"{landmark_id}.jpg", "wb") as f:
                    await f.write(img)
                logging.info(f"Downloaded {landmark_id}")
            except Exception as e:
                # Best-effort batch: record the failure and let other
                # tasks continue.
                logging.error(f"Error downloading {landmark_id}: {e}")
                async with aiofiles.open(dir_error / f"{landmark_id}.log", "w") as f:
                    await f.write(f"{landmark_id}: {url}\n\n{e}")

        async with asyncio.TaskGroup() as tg:
            for landmark_id, url in id_urls:
                tg.create_task(download_image(str(landmark_id).strip(), url))
                # Throttle task creation so requests are spaced out.
                await asyncio.sleep(0.5)
    logging.info("Finished downloading images")