refactor: extract constants and improve error handling
Moved the regex patterns, the read-more marker text, and the HTML parser name to top-level constants for reuse. In utils.py, replaced exception raising with a printed error message and an empty-dict default return so configuration load errors are handled gracefully. Updated the file paths in the CSV scripts to use a top-level constant (the output file is now poems.csv rather than poem.csv).
palp1tate committed Jun 26, 2024
1 parent 3f6bd9a commit dadc400
Showing 4 changed files with 35 additions and 27 deletions.
43 changes: 23 additions & 20 deletions shige.py
@@ -2,6 +2,18 @@
import re
from bs4 import BeautifulSoup

+extra_text = "展开阅读全文 ∨"
+parser = "html.parser"
+regex_list = [
+    r"^韵译(.*?)意译(.*?)注释(.*?)$",
+    r"^直译(.*?)韵译(.*?)注释(.*?)$",
+    r"^译文(.*?)注释(.*?)$",
+    r"^译文(.*?)注解(.*?)$",
+    r"^韵译(.*?)注解(.*?)$",
+    r"^韵译(.*?)注释(.*?)$",
+]
+label_list = ["译文及注释", "注解及译文"]
+

def fetch_html(u):
try:
@@ -14,7 +26,7 @@ def fetch_html(u):


def extract_poem_urls(html_detail):
-soup = BeautifulSoup(html_detail, "html.parser")
+soup = BeautifulSoup(html_detail, parser)
poems = []
for a_tag in soup.find_all("a", href=True):
href = a_tag["href"]
@@ -61,7 +73,7 @@ def fetch_poem_details(u):
"background": "",
}

-soup = BeautifulSoup(fetch_html(u), "html.parser")
+soup = BeautifulSoup(fetch_html(u), parser)
title_tag = soup.find("h1")
if title_tag:
poem_details["name"] = title_tag.text.strip().replace("\n", "")
@@ -93,15 +105,6 @@ def fetch_poem_details(u):
trans_annotation_tags = soup.find_all("div", class_="contyishang")
trans_text = ""
annotation_text = ""
-regex_list = [
-    r"^韵译(.*?)意译(.*?)注释(.*?)$",
-    r"^直译(.*?)韵译(.*?)注释(.*?)$",
-    r"^译文(.*?)注释(.*?)$",
-    r"^译文(.*?)注解(.*?)$",
-    r"^韵译(.*?)注解(.*?)$",
-    r"^韵译(.*?)注释(.*?)$",
-]
-label_list = ["译文及注释", "注解及译文"]
for trans_annotation_tag in trans_annotation_tags:
for l in label_list:
if l in trans_annotation_tag.get_text():
@@ -114,15 +117,15 @@ def fetch_poem_details(u):
.replace("▲", "")
.strip()
)
if "展开阅读全文 ∨" in total_text:
a_tag = trans_annotation_tag.find("a", text="展开阅读全文 ∨")
if extra_text in total_text:
a_tag = trans_annotation_tag.find("a", text=extra_text)
href_attr = a_tag.get("href")
match = re.search(r"fanyiShow\((\d+),'([A-Z0-9]+)'\)", href_attr)
if match:
number = match.group(1)
string = match.group(2)
full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxfanyi.aspx?id={number}&idjm={string}"
-soup_ = BeautifulSoup(fetch_html(full_text_url), "html.parser")
+soup_ = BeautifulSoup(fetch_html(full_text_url), parser)
t_a_tag = soup_.find("div", class_="contyishang")
full_text = (
t_a_tag.get_text()
@@ -162,15 +165,15 @@ def fetch_poem_details(u):
.replace("▲", "")
.strip()
)
if "展开阅读全文 ∨" in total_text:
a_tag = trans_annotation_tag.find("a", text="展开阅读全文 ∨")
if extra_text in total_text:
a_tag = trans_annotation_tag.find("a", text=extra_text)
href_attr = a_tag.get("href")
match = re.search(r"fanyiShow\((\d+),'([A-Z0-9]+)'\)", href_attr)
if match:
number = match.group(1)
string = match.group(2)
full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxfanyi.aspx?id={number}&idjm={string}"
-soup_ = BeautifulSoup(fetch_html(full_text_url), "html.parser")
+soup_ = BeautifulSoup(fetch_html(full_text_url), parser)
t_a_tag = soup_.find("div", class_="contyishang")
full_text = (
t_a_tag.get_text()
@@ -209,16 +212,16 @@ def fetch_poem_details(u):
appreciation_text = "".join(
p.get_text().strip() for p in appreciation_paragraphs
).replace("\n", "")
if "展开阅读全文 ∨" in appreciation_text:
read_more_div = div.find("a", text="展开阅读全文 ∨")
if extra_text in appreciation_text:
read_more_div = div.find("a", text=extra_text)
if read_more_div:
href_attr = read_more_div.get("href")
match = re.search(r"shangxiShow\((\d+),'([A-Z0-9]+)'\)", href_attr)
if match:
number = match.group(1)
string = match.group(2)
full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxshangxi.aspx?id={number}&idjm={string}"
-soup_ = BeautifulSoup(fetch_html(full_text_url), "html.parser")
+soup_ = BeautifulSoup(fetch_html(full_text_url), parser)
paragraphs = soup_.find("div", class_="contyishang").find_all("p")
appreciation_text = (
"".join(p.get_text().strip() for p in paragraphs)
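The hunks above elide the code that consumes regex_list. As a rough sketch only — not the repository's actual matching logic — this is how such patterns are typically applied; the helper name split_trans_annotation is hypothetical, and re.S is needed so the lazy (.*?) groups can span line breaks:

import re

regex_list = [
    r"^韵译(.*?)意译(.*?)注释(.*?)$",
    r"^直译(.*?)韵译(.*?)注释(.*?)$",
    r"^译文(.*?)注释(.*?)$",
    r"^译文(.*?)注解(.*?)$",
    r"^韵译(.*?)注解(.*?)$",
    r"^韵译(.*?)注释(.*?)$",
]

def split_trans_annotation(total_text):
    # Try each pattern in order. The three-group variants must come before
    # their two-group prefixes ("韵译...意译...注释" before "韵译...注释"),
    # which the ordering of regex_list above already guarantees.
    for pattern in regex_list:
        match = re.search(pattern, total_text, re.S)
        if match:
            # The last group is the annotation; everything before it is translation.
            *trans_parts, annotation = match.groups()
            return "".join(trans_parts).strip(), annotation.strip()
    return "", ""

For example, split_trans_annotation("译文床前明月光注释床：井栏") matches the third pattern and returns ("床前明月光", "床：井栏").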
8 changes: 5 additions & 3 deletions shige_csv.py
@@ -1,9 +1,11 @@
import os
from shige import fetch_html, extract_poem_urls, fetch_poem_details

+file_path = "poems.csv"
+
if __name__ == "__main__":
-if not os.path.exists("poem.csv"):
-    with open("poem.csv", "w", encoding="utf-8") as f:
+if not os.path.exists(file_path):
+    with open(file_path, "w", encoding="utf-8") as f:
f.write(
"name,author,dynasty,content,trans,annotation,appreciation,background\n"
)
@@ -19,7 +21,7 @@

for url in poem_urls:
details = fetch_poem_details(url)
-with open("poem.csv", "a", encoding="utf-8") as f:
+with open(file_path, "a", encoding="utf-8") as f:
print(f"Writing details for poem: {details['name']}")
for key in details:
details[key] = details[key].replace("\xa0", "")
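One caveat worth noting on the append loop above: if the elided write joins the field values with bare commas, any comma inside a poem's text will corrupt that CSV row. A minimal sketch of an equivalent append via the standard csv module, which quotes such fields — the append_poem helper is hypothetical, not part of this commit:

import csv

def append_poem(details, file_path="poems.csv"):
    # newline="" stops the csv module from emitting blank lines on Windows
    with open(file_path, "a", encoding="utf-8", newline="") as f:
        csv.writer(f).writerow(
            details[key].replace("\xa0", "") for key in details
        )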
8 changes: 5 additions & 3 deletions single_shige_csv.py
@@ -1,9 +1,11 @@
import os
from shige import fetch_poem_details

+file_path = "poems.csv"
+
if __name__ == "__main__":
-if not os.path.exists("poem.csv"):
-    with open("poem.csv", "w", encoding="utf-8") as f:
+if not os.path.exists(file_path):
+    with open(file_path, "w", encoding="utf-8") as f:
f.write(
"name,author,dynasty,content,trans,annotation,appreciation,background\n"
)
@@ -12,7 +14,7 @@
)

details = fetch_poem_details(url)
-with open("poem.csv", "a", encoding="utf-8") as f:
+with open(file_path, "a", encoding="utf-8") as f:
print(f"Writing details for poem: {details['name']}")
for key in details:
details[key] = details[key].replace("\xa0", "")
3 changes: 2 additions & 1 deletion utils.py
@@ -7,7 +7,8 @@ def load_configuration(file_path: str) -> dict:
with open(file_path, "r") as f:
return yaml.safe_load(f)
except Exception as exc:
-raise Exception(f"Failed to load configuration from {file_path}: {exc}")
+print(f"Error occurred while loading configuration: {exc}")
+return {}


def init_engine():
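With the utils.py change, load_configuration no longer raises on a missing file or malformed YAML; it prints the error and returns {}. Callers therefore need their own guard. A minimal sketch, assuming a fallback to defaults is acceptable (the config key shown is illustrative, not from this repo):

from utils import load_configuration

config = load_configuration("config.yaml")
if not config:
    # load_configuration now swallows the exception and returns {},
    # so a missing or malformed file must be handled here instead.
    config = {"db_url": "sqlite:///poems.db"}

The trade-off is that failures no longer stop the program; any caller that requires certain keys has to check for them explicitly.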
