feat: Supports crawling classical Chinese and fixes the issue that Chinese translation or annotations are missing
palp1tate committed May 23, 2024
1 parent b69bb22 commit 4e7134b
Showing 2 changed files with 54 additions and 32 deletions.
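
Note: the diff below relies on a fetch_html helper defined elsewhere in shige.py and not touched by this commit. For readers following along, a minimal sketch of what such a helper might look like (the headers, timeout, and encoding handling here are assumptions, not the repository's actual code):

import requests

def fetch_html(url):
    # Hypothetical stand-in for the fetch_html helper used by fetch_poem_details;
    # the real shige.py version may differ (retries, headers, proxies).
    headers = {"User-Agent": "Mozilla/5.0"}  # assumed; bare clients are often blocked
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    resp.encoding = "utf-8"  # gushiwen.cn serves UTF-8
    return resp.text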
84 changes: 53 additions & 31 deletions shige.py
@@ -59,64 +59,83 @@ def fetch_poem_details(u):

     content_tag = soup.find("div", class_="contson")
     if content_tag:
-        poem_details["content"] = content_tag.get_text().strip().replace("\n", "").replace("\u3000", "")
+        poem_details["content"] = (
+            content_tag.get_text().strip().replace("\n", "").replace("\u3000", "")
+        )

     trans_annotation_tag = soup.find("div", class_="contyishang")
     trans_text = ""
     annotation_text = ""
     if trans_annotation_tag:
         p_tags = trans_annotation_tag.find_all("p")
-        total_text = "".join(p.get_text().strip() for p in p_tags).replace("\n", "").replace("\u3000", "")
+        total_text = (
+            "".join(p.get_text().strip() for p in p_tags)
+            .replace("\n", "")
+            .replace("\u3000", "")
+        )
         for p_tag in p_tags:
             read_more_div = None
             if "展开阅读全文 ∨" in total_text:
-                read_more_div = p_tag.find("a", text="展开阅读全文 ∨") if p_tag.find("a",
-                                                                               text="展开阅读全文 ∨") else read_more_div
+                read_more_div = (
+                    p_tag.find("a", text="展开阅读全文 ∨")
+                    if p_tag.find("a", text="展开阅读全文 ∨")
+                    else read_more_div
+                )
                 if read_more_div:
                     href_attr = read_more_div.get("href")
-                    match = re.search(
-                        r"fanyiShow\((\d+),'([A-Z0-9]+)'\)", href_attr
-                    )
+                    match = re.search(r"fanyiShow\((\d+),'([A-Z0-9]+)'\)", href_attr)
                     if match:
                         number = match.group(1)
                         string = match.group(2)
                         full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxfanyi.aspx?id={number}&idjm={string}"
-                        soup_ = BeautifulSoup(
-                            fetch_html(full_text_url), "html.parser"
+                        soup_ = BeautifulSoup(fetch_html(full_text_url), "html.parser")
+                        paragraphs = soup_.find("div", class_="contyishang").find_all(
+                            "p"
                         )
-                        paragraphs = soup_.find(
-                            "div", class_="contyishang"
-                        ).find_all("p")
                         full_text = (
                             "".join(p.get_text().strip() for p in paragraphs)
                             .replace("\n", "")
                             .replace("▲", "")
                             .replace("\u3000", "")
                         )
-                        match = re.compile(r"^译文(.*?)注释(.*)$", re.S).search(full_text)
+                        match = re.compile(r"^译文(.*?)注释(.*)$", re.S).search(
+                            full_text
+                        )
                         if match:
                             trans_text = match.group(1).strip()
                             annotation_text = match.group(2).strip()
                         else:
-                            match = re.compile(r"^韵译(.*?)意译(.*?)注释(.*)$", re.S).search(full_text)
+                            match = re.compile(
+                                r"^韵译(.*?)意译(.*?)注释(.*)$", re.S
+                            ).search(full_text)
                             if match:
                                 trans_text = (
-                                        "韵译:"
-                                        + match.group(1).strip()
-                                        + "意译:"
-                                        + match.group(2).strip()
+                                    "韵译:"
+                                    + match.group(1).strip()
+                                    + "意译:"
+                                    + match.group(2).strip()
                                 )
                                 annotation_text = match.group(3).strip()
                         break
             else:
                 if "译文" in p_tag.text:
-                    trans_text += p_tag.get_text().strip().replace("译文", "").replace("\n", "").replace(
-                        "展开阅读全文 ∨",
-                        "").replace(
-                        "\u3000", "")
+                    trans_text += (
+                        p_tag.get_text()
+                        .strip()
+                        .replace("译文", "")
+                        .replace("\n", "")
+                        .replace("展开阅读全文 ∨", "")
+                        .replace("\u3000", "")
+                    )
                 if "注释" in p_tag.text:
-                    annotation_text += p_tag.get_text().strip().replace("注释", "").replace("\n", "").replace(
-                        "展开阅读全文 ∨", "").replace("\u3000", "")
+                    annotation_text += (
+                        p_tag.get_text()
+                        .strip()
+                        .replace("注释", "")
+                        .replace("\n", "")
+                        .replace("展开阅读全文 ∨", "")
+                        .replace("\u3000", "")
+                    )
         poem_details["trans"] = trans_text
         poem_details["annotation"] = annotation_text

@@ -125,7 +144,10 @@ def fetch_poem_details(u):
     for div in appreciation_divs:
         label = ""
         if div.find("h2") and (
-            "赏析" in div.find("h2").text or "鉴赏" in div.find("h2").text or "简析" in div.find("h2").text):
+            "赏析" in div.find("h2").text
+            or "鉴赏" in div.find("h2").text
+            or "简析" in div.find("h2").text
+        ):
             label = div.find("h2").text
         if label:
             div_tuple_list.append((label, div))
@@ -144,9 +166,7 @@ def fetch_poem_details(u):
                 string = match.group(2)
                 full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxshangxi.aspx?id={number}&idjm={string}"
                 soup_ = BeautifulSoup(fetch_html(full_text_url), "html.parser")
-                paragraphs = soup_.find("div", class_="contyishang").find_all(
-                    "p"
-                )
+                paragraphs = soup_.find("div", class_="contyishang").find_all("p")
                 appreciation_text = (
                     "".join(p.get_text().strip() for p in paragraphs)
                     .replace("\n", "")
@@ -162,9 +182,11 @@ def fetch_poem_details(u):
     for div in background_divs:
         if div.find("h2") and "创作背景" in div.find("h2").text:
             background_paragraphs = div.find_all("p")
-            background_text = "".join(
-                p.get_text().strip() for p in background_paragraphs
-            ).replace("\n", "").replace("\u3000", "")
+            background_text = (
+                "".join(p.get_text().strip() for p in background_paragraphs)
+                .replace("\n", "")
+                .replace("\u3000", "")
+            )
             poem_details["background"] = background_text

     return poem_details
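
With both fixes in place, fetch_poem_details should populate every field even for classical poems. A quick smoke test (the key set is inferred from the assignments visible in this diff; the full file may set more):

from shige import fetch_poem_details

details = fetch_poem_details("https://so.gushiwen.cn/shiwenv_45c396367f59.aspx")
for key in ("content", "trans", "annotation", "background"):
    # Print a short prefix of each field to confirm none came back empty
    print(key, "->", details.get(key, "")[:40])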
2 changes: 1 addition & 1 deletion single_shige.py
@@ -1,6 +1,6 @@
 from shige import fetch_poem_details

-if __name__ == '__main__':
+if __name__ == "__main__":
     url = input(
         "Please enter the single poem URL(example:https://so.gushiwen.cn/shiwenv_45c396367f59.aspx): "
     )
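
Usage is unchanged: run python single_shige.py and paste a poem URL at the prompt; the quote style of the __main__ guard is the only edit here.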
