From f5bab66934ff2e5fa76f880f27a17871752fbdb4 Mon Sep 17 00:00:00 2001
From: wxy <1939311091@qq.com>
Date: Fri, 24 May 2024 15:47:42 +0800
Subject: [PATCH] fix: Add extract_text function and revise fetch_poem_details

Added a new function extract_text for extracting translation and annotation texts. Revised fetch_poem_details function to improve extraction of content, translation, and annotation from poem details. The new implementation increases flexibility and handles different formats of translations and annotations more accurately.
---
 shige.py | 173 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 113 insertions(+), 60 deletions(-)

diff --git a/shige.py b/shige.py
index 1db4441..933c8a4 100644
--- a/shige.py
+++ b/shige.py
@@ -26,6 +26,29 @@ def extract_poem_urls(html_detail):
     return poems
 
 
+def extract_text(regex, total_text):
+    trans_text = ""
+    annotation_text = ""
+    flag = False
+    match = re.compile(regex, re.S).search(total_text)
+    if match:
+        flag = True
+        if regex == r"^韵译(.*?)意译(.*?)注释(.*?)$":
+            trans_text = (
+                "韵译：" + match.group(1).strip() + "意译：" + match.group(2).strip()
+            )
+            annotation_text = match.group(3).strip()
+        elif regex == r"^直译(.*?)韵译(.*?)注释(.*?)$":
+            trans_text = (
+                "直译：" + match.group(1).strip() + "韵译：" + match.group(2).strip()
+            )
+            annotation_text = match.group(3).strip()
+        else:
+            trans_text = match.group(1).strip()
+            annotation_text = match.group(2).strip()
+    return flag, trans_text, annotation_text
+
+
 def fetch_poem_details(u):
     poem_details = {
         "name": "",
@@ -60,82 +83,112 @@ def fetch_poem_details(u):
     content_tag = soup.find("div", class_="contson")
     if content_tag:
         poem_details["content"] = (
-            content_tag.get_text().strip().replace("\n", "").replace("\u3000", "")
+            content_tag.get_text()
+            .strip()
+            .replace("\n", "")
+            .replace("\u3000", "")
+            .replace(" ", "")
         )
 
-    trans_annotation_tag = soup.find("div", class_="contyishang")
+    trans_annotation_tags = soup.find_all("div", class_="contyishang")
     trans_text = ""
     annotation_text = ""
-    if trans_annotation_tag:
-        p_tags = trans_annotation_tag.find_all("p")
-        total_text = (
-            "".join(p.get_text().strip() for p in p_tags)
-            .replace("\n", "")
-            .replace("\u3000", "")
-        )
-        for p_tag in p_tags:
-            read_more_div = None
-            if "展开阅读全文 ∨" in total_text:
-                read_more_div = (
-                    p_tag.find("a", text="展开阅读全文 ∨")
-                    if p_tag.find("a", text="展开阅读全文 ∨")
-                    else read_more_div
+    regex_list = [
+        r"^韵译(.*?)意译(.*?)注释(.*?)$",
+        r"^直译(.*?)韵译(.*?)注释(.*?)$",
+        r"^译文(.*?)注释(.*?)$",
+        r"^译文(.*?)注解(.*?)$",
+        r"^韵译(.*?)注解(.*?)$",
+        r"^韵译(.*?)注释(.*?)$",
+    ]
+    label_list = ["译文及注释", "注解及译文"]
+    for trans_annotation_tag in trans_annotation_tags:
+        for l in label_list:
+            if l in trans_annotation_tag.get_text():
+                total_text = (
+                    trans_annotation_tag.get_text()
+                    .replace(l, "")
+                    .replace("\n", "")
+                    .replace("\u3000", "")
+                    .replace("\u200b", "")
+                    .replace("▲", "")
+                    .strip()
                 )
-                if read_more_div:
-                    href_attr = read_more_div.get("href")
+                if "展开阅读全文 ∨" in total_text:
+                    a_tag = trans_annotation_tag.find("a", text="展开阅读全文 ∨")
+                    href_attr = a_tag.get("href")
                     match = re.search(r"fanyiShow\((\d+),'([A-Z0-9]+)'\)", href_attr)
                     if match:
                         number = match.group(1)
                         string = match.group(2)
                         full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxfanyi.aspx?id={number}&idjm={string}"
                         soup_ = BeautifulSoup(fetch_html(full_text_url), "html.parser")
-                        paragraphs = soup_.find("div", class_="contyishang").find_all(
-                            "p"
-                        )
+                        t_a_tag = soup_.find("div", class_="contyishang")
                         full_text = (
-                            "".join(p.get_text().strip() for p in paragraphs)
+                            t_a_tag.get_text()
+                            .replace("译文及注释", "")
                             .replace("\n", "")
-                            .replace("▲", "")
                             .replace("\u3000", "")
+                            .replace("\u200b", "")
+                            .replace("▲", "")
+                            .replace(" ", "")
+                            .strip()
                         )
-                        match = re.compile(r"^译文(.*?)注释(.*)$", re.S).search(
-                            full_text
-                        )
-                        if match:
-                            trans_text = match.group(1).strip()
-                            annotation_text = match.group(2).strip()
-                        else:
-                            match = re.compile(
-                                r"^韵译(.*?)意译(.*?)注释(.*)$", re.S
-                            ).search(full_text)
-                            if match:
-                                trans_text = (
-                                    "韵译："
-                                    + match.group(1).strip()
-                                    + "意译："
-                                    + match.group(2).strip()
+                        flag = False
+                        for regex in regex_list:
+                            if flag:
+                                break
+                            else:
+                                flag, trans_text, annotation_text = extract_text(
+                                    regex, full_text
                                 )
-                                annotation_text = match.group(3).strip()
-                    break
-            else:
-                if "译文" in p_tag.text:
-                    trans_text += (
-                        p_tag.get_text()
-                        .strip()
-                        .replace("译文", "")
-                        .replace("\n", "")
-                        .replace("展开阅读全文 ∨", "")
-                        .replace("\u3000", "")
-                    )
-                if "注释" in p_tag.text:
-                    annotation_text += (
-                        p_tag.get_text()
-                        .strip()
-                        .replace("注释", "")
-                        .replace("\n", "")
-                        .replace("展开阅读全文 ∨", "")
-                        .replace("\u3000", "")
-                    )
+                else:
+                    flag = False
+                    for regex in regex_list:
+                        if flag:
+                            break
+                        else:
+                            flag, trans_text, annotation_text = extract_text(
+                                regex, total_text.replace(" ", "")
+                            )
+        if not annotation_text:
+            if "注释" in trans_annotation_tag.get_text():
+                total_text = (
+                    trans_annotation_tag.get_text()
+                    .replace("注释", "")
+                    .replace("\n", "")
+                    .replace("\u3000", "")
+                    .replace("\u200b", "")
+                    .replace("▲", "")
+                    .strip()
+                )
+                if "展开阅读全文 ∨" in total_text:
+                    a_tag = trans_annotation_tag.find("a", text="展开阅读全文 ∨")
+                    href_attr = a_tag.get("href")
+                    match = re.search(r"fanyiShow\((\d+),'([A-Z0-9]+)'\)", href_attr)
+                    if match:
+                        number = match.group(1)
+                        string = match.group(2)
+                        full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxfanyi.aspx?id={number}&idjm={string}"
+                        soup_ = BeautifulSoup(fetch_html(full_text_url), "html.parser")
+                        t_a_tag = soup_.find("div", class_="contyishang")
+                        full_text = (
+                            t_a_tag.get_text()
+                            .replace("注释", "")
+                            .replace("\n", "")
+                            .replace("\u3000", "")
+                            .replace("\u200b", "")
+                            .replace("▲", "")
+                            .replace(" ", "")
+                            .strip()
+                        )
+                        annotation_text = full_text
+                else:
+                    annotation_text = total_text.replace(" ", "")
+
+        if trans_text and annotation_text:
+            break
+
     poem_details["trans"] = trans_text
     poem_details["annotation"] = annotation_text