
Commit

feat: Supports crawling classical Chinese and fixes the issue that Chinese translation or annotations are missing
palp1tate committed May 23, 2024
1 parent 113c8df commit b69bb22
Showing 8 changed files with 250 additions and 210 deletions.
12 changes: 12 additions & 0 deletions .github/semantic.yml
@@ -0,0 +1,12 @@
# Always validate the PR title AND all the commits
titleAndCommits: true
# Require at least one commit to be valid
# this is only relevant when using commitsOnly: true or titleAndCommits: true,
# which validate all commits by default
anyCommit: true
# Allow use of Merge commits (eg on github: "Merge branch 'master' into feature/ride-unicorns")
# this is only relevant when using commitsOnly: true (or titleAndCommits: true)
allowMergeCommits: false
# Allow use of Revert commits (eg on github: "Revert "feat: ride unicorns"")
# this is only relevant when using commitsOnly: true (or titleAndCommits: true)
allowRevertCommits: false
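
For illustration, the gist of the Conventional Commits check this config turns on can be sketched in a few lines of Python. This is a rough approximation only; the semantic-pull-requests app applies richer rules, and the list of types below is the common Conventional Commits set, not copied from the app's source.

import re

# Rough approximation of the title/commit validation enabled above.
CONVENTIONAL = re.compile(
    r"^(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert)"
    r"(\([\w\-]+\))?!?: .+"
)

print(bool(CONVENTIONAL.match("feat: Supports crawling classical Chinese")))  # True
print(bool(CONVENTIONAL.match("Merge branch 'master' into feature/ride-unicorns")))  # False; allowMergeCommits is false anyway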
62 changes: 62 additions & 0 deletions .github/workflows/build.yml
@@ -0,0 +1,62 @@
name: CI/CD
on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master
jobs:
  black-check:
    name: Black Check
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v4.1.4
      - name: Set up Python 3.10
        uses: actions/setup-python@v5.1.0
        with:
          python-version: "3.10"
      - name: Install Black
        run: pip install black
      - name: Check Python files with Black
        run: black --check . --exclude ".*pb2.*\.py"
  install-dependencies:
    name: Install Python Dependencies
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v4.1.4
      - name: Set up Python 3.10
        uses: actions/setup-python@v5.1.0
        with:
          python-version: "3.10"
      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          if [ -f requirements.txt ]; then
            pip install -r requirements.txt
          fi
  release:
    name: Release
    runs-on: ubuntu-latest
    needs:
      - black-check
      - install-dependencies
    if: github.repository == 'palp1tate/fetch_gushiwen' && github.event_name == 'push'
    steps:
      - name: Checkout
        uses: actions/checkout@v4.1.4
        with:
          fetch-depth: -1
      - name: Setup Node.js
        uses: actions/setup-node@v4.0.2
        with:
          node-version: 20
      - name: Fetch Previous version
        id: get-previous-tag
        uses: actions-ecosystem/action-get-latest-tag@v1.6.0
      - name: Release
        run: yarn global add semantic-release@23.1.1 && semantic-release
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
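
Much of the shige.py diff below is Black normalizing quote style and re-wrapping long calls. As a quick sketch of what the `black --check` step enforces, Black can also be called as a library; format_str and Mode are real APIs, though Black's library interface is officially unstable.

import black

messy = "soup.find('div', class_='contson')"
# Black rewrites single quotes to double quotes and normalizes spacing.
print(black.format_str(messy, mode=black.Mode()), end="")
# -> soup.find("div", class_="contson")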
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@

.idea

*.csv
2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
beautifulsoup4==4.11.1
requests==2.32.2
207 changes: 128 additions & 79 deletions shige.py
@@ -3,30 +3,30 @@
 from bs4 import BeautifulSoup
 
 
-def fetch_html(url):
+def fetch_html(u):
     try:
-        response = requests.get(url)
+        response = requests.get(u)
         response.raise_for_status()
         return response.text
     except requests.RequestException as e:
         print(f"Error fetching HTML content: {e}")
         return None
 
 
-def extract_poem_urls(html_content):
-    soup = BeautifulSoup(html_content, 'html.parser')
-    poem_urls = []
-
-    for a_tag in soup.find_all('a', href=True):
-        href = a_tag['href']
-        if href.startswith("/shiwenv_"):
-            full_url = f"https://so.gushiwen.cn{href}"
-            poem_urls.append(full_url)
-
-    return poem_urls
+def extract_poem_urls(html_detail):
+    soup = BeautifulSoup(html_detail, "html.parser")
+    poems = []
+    for a_tag in soup.find_all("a", href=True):
+        href = a_tag["href"]
+        if href.startswith("/shiwenv_"):
+            poems.append(f"https://so.gushiwen.cn{href}")
+        elif href.startswith("https://so.gushiwen.cn/shiwenv_"):
+            poems.append(href)
+
+    return poems
 
 
-def fetch_poem_details(url):
+def fetch_poem_details(u):
     poem_details = {
         "name": "",
         "author": "",
@@ -35,96 +35,145 @@ def fetch_poem_details(url):
         "trans": "",
         "annotation": "",
         "appreciation": "",
-        "background": ""
+        "background": "",
     }
 
-    soup = BeautifulSoup(fetch_html(url), 'html.parser')
-    title_tag = soup.find('h1')
+    soup = BeautifulSoup(fetch_html(u), "html.parser")
+    title_tag = soup.find("h1")
     if title_tag:
         poem_details["name"] = title_tag.text.strip().replace("\n", "")
 
-    source_tag = soup.find('p', class_='source')
+    source_tag = soup.find("p", class_="source")
     if source_tag:
-        source_info = source_tag.find_all('a')
+        source_info = source_tag.find_all("a")
         if len(source_info) > 0:
             poem_details["author"] = source_info[0].text.strip().replace("\n", "")
-            poem_details["dynasty"] = source_info[1].text.strip().replace("\n", "").replace("〔", "").replace("〕", "")
-
-    content_tag = soup.find('div', class_='contson')
+            poem_details["dynasty"] = (
+                source_info[1]
+                .text.strip()
+                .replace("\n", "")
+                .replace("〔", "")
+                .replace("〕", "")
+                .replace("\u3000", "")
+            )
+
+    content_tag = soup.find("div", class_="contson")
     if content_tag:
-        poem_details["content"] = content_tag.get_text().strip().replace("\n", "")
+        poem_details["content"] = content_tag.get_text().strip().replace("\n", "").replace("\u3000", "")
 
-    trans_annotation_tag = soup.find('div', class_='contyishang')
+    trans_annotation_tag = soup.find("div", class_="contyishang")
+    trans_text = ""
+    annotation_text = ""
     if trans_annotation_tag:
-        p_tags = trans_annotation_tag.find_all('p')
+        p_tags = trans_annotation_tag.find_all("p")
+        total_text = "".join(p.get_text().strip() for p in p_tags).replace("\n", "").replace("\u3000", "")
         for p_tag in p_tags:
-            if '译文' in p_tag.text:
-                poem_details["trans"] = p_tag.get_text().strip().replace("译文", "").replace("\n", "").replace(
-                    "展开阅读全文 ∨", "")
-            elif '注释' in p_tag.text:
-                annotation_text = p_tag.get_text().strip().replace("注释", "").replace("\n", "")
-                if "展开阅读全文 ∨" in annotation_text:
-                    read_more_div = p_tag.find('a', text="展开阅读全文 ∨")
-                    if read_more_div:
-                        href_attr = read_more_div.get('href')
-                        match = re.search(r"fanyiShow\((\d+),'([A-Z0-9]+)'\)", href_attr)
-                        if match:
-                            number = match.group(1)
-                            string = match.group(2)
-                            full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxfanyi.aspx?id={number}&idjm={string}"
-                            soup_ = BeautifulSoup(fetch_html(full_text_url), 'html.parser')
-                            paragraphs = soup_.find('div', class_='contyishang').find_all('p')
-                            full_text = "".join(p.get_text().strip() for p in paragraphs).replace("\n", "").replace("▲", "")
-                            match = re.compile(r"^译文(.*?)注释(.*)$", re.S).search(full_text)
-                            if match:
-                                poem_details["trans"] = match.group(1).strip()
-                                annotation_text = match.group(2).strip()
-                            else:
-                                match = re.compile(r"^韵译(.*?)意译(.*?)注释(.*)$", re.S).search(full_text)
-                                if match:
-                                    poem_details["trans"] = "韵译:" + match.group(1).strip() + "意译:" + match.group(2).strip()
-                                    annotation_text = match.group(3).strip()
-                poem_details["annotation"] = annotation_text
+            read_more_div = None
+            if "展开阅读全文 ∨" in total_text:
+                read_more_div = p_tag.find("a", text="展开阅读全文 ∨") if p_tag.find("a", text="展开阅读全文 ∨") else read_more_div
+                if read_more_div:
+                    href_attr = read_more_div.get("href")
+                    match = re.search(
+                        r"fanyiShow\((\d+),'([A-Z0-9]+)'\)", href_attr
+                    )
+                    if match:
+                        number = match.group(1)
+                        string = match.group(2)
+                        full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxfanyi.aspx?id={number}&idjm={string}"
+                        soup_ = BeautifulSoup(
+                            fetch_html(full_text_url), "html.parser"
+                        )
+                        paragraphs = soup_.find(
+                            "div", class_="contyishang"
+                        ).find_all("p")
+                        full_text = (
+                            "".join(p.get_text().strip() for p in paragraphs)
+                            .replace("\n", "")
+                            .replace("▲", "")
+                            .replace("\u3000", "")
+                        )
+                        match = re.compile(r"^译文(.*?)注释(.*)$", re.S).search(full_text)
+                        if match:
+                            trans_text = match.group(1).strip()
+                            annotation_text = match.group(2).strip()
+                        else:
+                            match = re.compile(r"^韵译(.*?)意译(.*?)注释(.*)$", re.S).search(full_text)
+                            if match:
+                                trans_text = (
+                                    "韵译:"
+                                    + match.group(1).strip()
+                                    + "意译:"
+                                    + match.group(2).strip()
+                                )
+                                annotation_text = match.group(3).strip()
+                        break
+            else:
+                if "译文" in p_tag.text:
+                    trans_text += p_tag.get_text().strip().replace("译文", "").replace("\n", "").replace("展开阅读全文 ∨", "").replace("\u3000", "")
+                if "注释" in p_tag.text:
+                    annotation_text += p_tag.get_text().strip().replace("注释", "").replace("\n", "").replace("展开阅读全文 ∨", "").replace("\u3000", "")
+        poem_details["trans"] = trans_text
+        poem_details["annotation"] = annotation_text
 
-    appreciation_divs = soup.find_all('div', class_='contyishang')
-    for div in appreciation_divs:
-        if div.find('h2') and ('赏析' in div.find('h2').text or '鉴赏' in div.find('h2').text):
-            appreciation_paragraphs = div.find_all('p')
-            appreciation_text = "".join(p.get_text().strip() for p in appreciation_paragraphs).replace("\n", "")
-            if "展开阅读全文 ∨" in appreciation_text:
-                read_more_div = div.find('a', text="展开阅读全文 ∨")
-                if read_more_div:
-                    href_attr = read_more_div.get('href')
-                    match = re.search(r"shangxiShow\((\d+),'([A-Z0-9]+)'\)", href_attr)
-                    if match:
-                        number = match.group(1)
-                        string = match.group(2)
-                        full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxshangxi.aspx?id={number}&idjm={string}"
-                        soup_ = BeautifulSoup(fetch_html(full_text_url), 'html.parser')
-                        paragraphs = soup_.find('div', class_='contyishang').find_all('p')
-                        appreciation_text = "".join(p.get_text().strip() for p in paragraphs).replace("\n", "").replace("▲", "")
-            poem_details["appreciation"] += appreciation_text
+    appreciation_divs = soup.find_all("div", class_="contyishang")
+    div_tuple_list = []
+    for div in appreciation_divs:
+        label = ""
+        if div.find("h2") and (
+            "赏析" in div.find("h2").text or "鉴赏" in div.find("h2").text or "简析" in div.find("h2").text):
+            label = div.find("h2").text
+        if label:
+            div_tuple_list.append((label, div))
+    for label, div in div_tuple_list:
+        appreciation_paragraphs = div.find_all("p")
+        appreciation_text = "".join(
+            p.get_text().strip() for p in appreciation_paragraphs
+        ).replace("\n", "")
+        if "展开阅读全文 ∨" in appreciation_text:
+            read_more_div = div.find("a", text="展开阅读全文 ∨")
+            if read_more_div:
+                href_attr = read_more_div.get("href")
+                match = re.search(r"shangxiShow\((\d+),'([A-Z0-9]+)'\)", href_attr)
+                if match:
+                    number = match.group(1)
+                    string = match.group(2)
+                    full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxshangxi.aspx?id={number}&idjm={string}"
+                    soup_ = BeautifulSoup(fetch_html(full_text_url), "html.parser")
+                    paragraphs = soup_.find("div", class_="contyishang").find_all(
+                        "p"
+                    )
+                    appreciation_text = (
+                        "".join(p.get_text().strip() for p in paragraphs)
+                        .replace("\n", "")
+                        .replace("▲", "")
+                        .replace("\u3000", "")
+                    )
+        if len(div_tuple_list) == 1:
+            poem_details["appreciation"] = appreciation_text
+        elif len(div_tuple_list) > 1:
+            poem_details["appreciation"] += label + ":" + appreciation_text
 
-    background_divs = soup.find_all('div', class_='contyishang')
+    background_divs = soup.find_all("div", class_="contyishang")
     for div in background_divs:
-        if div.find('h2') and '创作背景' in div.find('h2').text:
-            background_paragraphs = div.find_all('p')
-            background_text = "".join(p.get_text().strip() for p in background_paragraphs).replace("\n", "")
+        if div.find("h2") and "创作背景" in div.find("h2").text:
+            background_paragraphs = div.find_all("p")
+            background_text = "".join(
+                p.get_text().strip() for p in background_paragraphs
+            ).replace("\n", "").replace("\u3000", "")
             poem_details["background"] = background_text
 
     return poem_details
 
 
 if __name__ == "__main__":
-    url = input("Please enter the URL(example:https://so.gushiwen.cn/gushi/tangshi.aspx): ")
+    url = input(
+        "Please enter the URL(example:https://so.gushiwen.cn/gushi/tangshi.aspx): "
+    )
     poem_urls = []
     html_content = fetch_html(url)
     if html_content:
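The tail of shige.py is truncated in this diff. For orientation, a minimal sketch of how the three functions compose into a crawl; the CSV output and the poems.csv filename are assumptions suggested by the *.csv entry in .gitignore, not code shown in this commit.

import csv

from shige import extract_poem_urls, fetch_html, fetch_poem_details

# Hypothetical driver; the real __main__ body is not shown in this diff.
list_url = "https://so.gushiwen.cn/gushi/tangshi.aspx"
html_content = fetch_html(list_url)
if html_content:
    with open("poems.csv", "w", newline="", encoding="utf-8-sig") as f:  # assumed filename
        writer = None
        for poem_url in extract_poem_urls(html_content):
            details = fetch_poem_details(poem_url)
            if writer is None:
                # Use the dict's own keys (name, author, dynasty, ...) as columns.
                writer = csv.DictWriter(f, fieldnames=list(details.keys()))
                writer.writeheader()
            writer.writerow(details)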
