
Commit

feat: Supports crawling classical Chinese and fixes the issue that Chinese translation or annotations are missing
palp1tate committed May 23, 2024
1 parent 113c8df commit b69bb22
Showing 8 changed files with 250 additions and 210 deletions.
12 changes: 12 additions & 0 deletions .github/semantic.yml
@@ -0,0 +1,12 @@
# Always validate the PR title AND all the commits
titleAndCommits: true
# Require at least one commit to be valid
# this is only relevant when using commitsOnly: true or titleAndCommits: true,
# which validate all commits by default
anyCommit: true
# Allow use of Merge commits (eg on github: "Merge branch 'master' into feature/ride-unicorns")
# this is only relevant when using commitsOnly: true (or titleAndCommits: true)
allowMergeCommits: false
# Allow use of Revert commits (eg on github: "Revert "feat: ride unicorns"")
# this is only relevant when using commitsOnly: true (or titleAndCommits: true)
allowRevertCommits: false
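
For illustration, the gist of the Conventional Commits check this config turns on can be sketched in a few lines of Python. This is a rough approximation only; the semantic-pull-requests app applies richer rules, and the list of types below is the common Conventional Commits set, not copied from the app's source.

import re

# Rough approximation of the title/commit validation enabled above.
CONVENTIONAL = re.compile(
    r"^(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert)"
    r"(\([\w\-]+\))?!?: .+"
)

print(bool(CONVENTIONAL.match("feat: Supports crawling classical Chinese")))  # True
print(bool(CONVENTIONAL.match("Merge branch 'master' into feature/ride-unicorns")))  # False; allowMergeCommits is false anyway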
62 changes: 62 additions & 0 deletions .github/workflows/build.yml
@@ -0,0 +1,62 @@
name: CI/CD
on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master
jobs:
  black-check:
    name: Black Check
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v4.1.4
      - name: Set up Python 3.10
        uses: actions/setup-python@v5.1.0
        with:
          python-version: "3.10"
      - name: Install Black
        run: pip install black
      - name: Check Python files with Black
        run: black --check . --exclude ".*pb2.*\.py"
  install-dependencies:
    name: Install Python Dependencies
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v4.1.4
      - name: Set up Python 3.10
        uses: actions/setup-python@v5.1.0
        with:
          python-version: "3.10"
      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          if [ -f requirements.txt ]; then
            pip install -r requirements.txt
          fi
  release:
    name: Release
    runs-on: ubuntu-latest
    needs:
      - black-check
      - install-dependencies
    if: github.repository == 'palp1tate/fetch_gushiwen' && github.event_name == 'push'
    steps:
      - name: Checkout
        uses: actions/checkout@v4.1.4
        with:
          fetch-depth: -1
      - name: Setup Node.js
        uses: actions/setup-node@v4.0.2
        with:
          node-version: 20
      - name: Fetch Previous version
        id: get-previous-tag
        uses: actions-ecosystem/action-get-latest-tag@v1.6.0
      - name: Release
        run: yarn global add semantic-release@23.1.1 && semantic-release
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
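
Much of the shige.py diff below is Black normalizing quote style and re-wrapping long calls. As a quick sketch of what the `black --check` step enforces, Black can also be called as a library; format_str and Mode are real APIs, though Black's library interface is officially unstable.

import black

messy = "soup.find('div', class_='contson')"
# Black rewrites single quotes to double quotes and normalizes spacing.
print(black.format_str(messy, mode=black.Mode()), end="")
# -> soup.find("div", class_="contson")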
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@

.idea

*.csv
2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
beautifulsoup4==4.11.1
requests==2.32.2
207 changes: 128 additions & 79 deletions shige.py
@@ -3,30 +3,30 @@
 from bs4 import BeautifulSoup
 
 
-def fetch_html(url):
+def fetch_html(u):
     try:
-        response = requests.get(url)
+        response = requests.get(u)
         response.raise_for_status()
         return response.text
     except requests.RequestException as e:
         print(f"Error fetching HTML content: {e}")
         return None
 
 
-def extract_poem_urls(html_content):
-    soup = BeautifulSoup(html_content, 'html.parser')
-    poem_urls = []
-
-    for a_tag in soup.find_all('a', href=True):
-        href = a_tag['href']
-        if href.startswith("/shiwenv_"):
-            full_url = f"https://so.gushiwen.cn{href}"
-            poem_urls.append(full_url)
-
-    return poem_urls
+def extract_poem_urls(html_detail):
+    soup = BeautifulSoup(html_detail, "html.parser")
+    poems = []
+    for a_tag in soup.find_all("a", href=True):
+        href = a_tag["href"]
+        if href.startswith("/shiwenv_"):
+            poems.append(f"https://so.gushiwen.cn{href}")
+        elif href.startswith("https://so.gushiwen.cn/shiwenv_"):
+            poems.append(href)
+
+    return poems
 
 
-def fetch_poem_details(url):
+def fetch_poem_details(u):
     poem_details = {
         "name": "",
         "author": "",
@@ -35,96 +35,145 @@ def fetch_poem_details(url):
         "trans": "",
         "annotation": "",
         "appreciation": "",
-        "background": ""
+        "background": "",
     }
 
-    soup = BeautifulSoup(fetch_html(url), 'html.parser')
-    title_tag = soup.find('h1')
+    soup = BeautifulSoup(fetch_html(u), "html.parser")
+    title_tag = soup.find("h1")
     if title_tag:
         poem_details["name"] = title_tag.text.strip().replace("\n", "")
 
-    source_tag = soup.find('p', class_='source')
+    source_tag = soup.find("p", class_="source")
     if source_tag:
-        source_info = source_tag.find_all('a')
+        source_info = source_tag.find_all("a")
         if len(source_info) > 0:
             poem_details["author"] = source_info[0].text.strip().replace("\n", "")
-            poem_details["dynasty"] = source_info[1].text.strip().replace("\n", "").replace("〔", "").replace("〕", "")
-
-    content_tag = soup.find('div', class_='contson')
+            poem_details["dynasty"] = (
+                source_info[1]
+                .text.strip()
+                .replace("\n", "")
+                .replace("〔", "")
+                .replace("〕", "")
+                .replace("\u3000", "")
+            )
+
+    content_tag = soup.find("div", class_="contson")
     if content_tag:
-        poem_details["content"] = content_tag.get_text().strip().replace("\n", "")
+        poem_details["content"] = content_tag.get_text().strip().replace("\n", "").replace("\u3000", "")
 
-    trans_annotation_tag = soup.find('div', class_='contyishang')
+    trans_annotation_tag = soup.find("div", class_="contyishang")
+    trans_text = ""
+    annotation_text = ""
     if trans_annotation_tag:
-        p_tags = trans_annotation_tag.find_all('p')
+        p_tags = trans_annotation_tag.find_all("p")
+        total_text = "".join(p.get_text().strip() for p in p_tags).replace("\n", "").replace("\u3000", "")
         for p_tag in p_tags:
-            if '译文' in p_tag.text:
-                poem_details["trans"] = p_tag.get_text().strip().replace("译文", "").replace("\n", "").replace(
-                    "展开阅读全文 ∨", "")
-            elif '注释' in p_tag.text:
-                annotation_text = p_tag.get_text().strip().replace("注释", "").replace("\n", "")
-                if "展开阅读全文 ∨" in annotation_text:
-                    read_more_div = p_tag.find('a', text="展开阅读全文 ∨")
-                    if read_more_div:
-                        href_attr = read_more_div.get('href')
-                        match = re.search(r"fanyiShow\((\d+),'([A-Z0-9]+)'\)", href_attr)
-                        if match:
-                            number = match.group(1)
-                            string = match.group(2)
-                            full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxfanyi.aspx?id={number}&idjm={string}"
-                            soup_ = BeautifulSoup(fetch_html(full_text_url), 'html.parser')
-                            paragraphs = soup_.find('div', class_='contyishang').find_all('p')
-                            full_text = "".join(p.get_text().strip() for p in paragraphs).replace("\n", "").replace("▲", "")
-                            match = re.compile(r"^译文(.*?)注释(.*)$", re.S).search(full_text)
-                            if match:
-                                poem_details["trans"] = match.group(1).strip()
-                                annotation_text = match.group(2).strip()
-                            else:
-                                match = re.compile(r"^韵译(.*?)意译(.*?)注释(.*)$", re.S).search(full_text)
-                                if match:
-                                    poem_details["trans"] = "韵译:" + match.group(1).strip() + "意译:" + match.group(2).strip()
-                                    annotation_text = match.group(3).strip()
-                poem_details["annotation"] = annotation_text
+            read_more_div = None
+            if "展开阅读全文 ∨" in total_text:
+                read_more_div = p_tag.find("a", text="展开阅读全文 ∨") if p_tag.find("a", text="展开阅读全文 ∨") else read_more_div
+                if read_more_div:
+                    href_attr = read_more_div.get("href")
+                    match = re.search(
+                        r"fanyiShow\((\d+),'([A-Z0-9]+)'\)", href_attr
+                    )
+                    if match:
+                        number = match.group(1)
+                        string = match.group(2)
+                        full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxfanyi.aspx?id={number}&idjm={string}"
+                        soup_ = BeautifulSoup(
+                            fetch_html(full_text_url), "html.parser"
+                        )
+                        paragraphs = soup_.find(
+                            "div", class_="contyishang"
+                        ).find_all("p")
+                        full_text = (
+                            "".join(p.get_text().strip() for p in paragraphs)
+                            .replace("\n", "")
+                            .replace("▲", "")
+                            .replace("\u3000", "")
+                        )
+                        match = re.compile(r"^译文(.*?)注释(.*)$", re.S).search(full_text)
+                        if match:
+                            trans_text = match.group(1).strip()
+                            annotation_text = match.group(2).strip()
+                        else:
+                            match = re.compile(r"^韵译(.*?)意译(.*?)注释(.*)$", re.S).search(full_text)
+                            if match:
+                                trans_text = (
+                                    "韵译:"
+                                    + match.group(1).strip()
+                                    + "意译:"
+                                    + match.group(2).strip()
+                                )
+                                annotation_text = match.group(3).strip()
+                        break
+            else:
+                if "译文" in p_tag.text:
+                    trans_text += p_tag.get_text().strip().replace("译文", "").replace("\n", "").replace("展开阅读全文 ∨", "").replace("\u3000", "")
+                if "注释" in p_tag.text:
+                    annotation_text += p_tag.get_text().strip().replace("注释", "").replace("\n", "").replace("展开阅读全文 ∨", "").replace("\u3000", "")
+        poem_details["trans"] = trans_text
+        poem_details["annotation"] = annotation_text
 
-    appreciation_divs = soup.find_all('div', class_='contyishang')
-    for div in appreciation_divs:
-        if div.find('h2') and ('赏析' in div.find('h2').text or '鉴赏' in div.find('h2').text):
-            appreciation_paragraphs = div.find_all('p')
-            appreciation_text = "".join(p.get_text().strip() for p in appreciation_paragraphs).replace("\n", "")
-            if "展开阅读全文 ∨" in appreciation_text:
-                read_more_div = div.find('a', text="展开阅读全文 ∨")
-                if read_more_div:
-                    href_attr = read_more_div.get('href')
-                    match = re.search(r"shangxiShow\((\d+),'([A-Z0-9]+)'\)", href_attr)
-                    if match:
-                        number = match.group(1)
-                        string = match.group(2)
-                        full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxshangxi.aspx?id={number}&idjm={string}"
-                        soup_ = BeautifulSoup(fetch_html(full_text_url), 'html.parser')
-                        paragraphs = soup_.find('div', class_='contyishang').find_all('p')
-                        appreciation_text = "".join(p.get_text().strip() for p in paragraphs).replace("\n", "").replace("▲", "")
-            poem_details["appreciation"] += appreciation_text
+    appreciation_divs = soup.find_all("div", class_="contyishang")
+    div_tuple_list = []
+    for div in appreciation_divs:
+        label = ""
+        if div.find("h2") and (
+            "赏析" in div.find("h2").text or "鉴赏" in div.find("h2").text or "简析" in div.find("h2").text):
+            label = div.find("h2").text
+        if label:
+            div_tuple_list.append((label, div))
+    for label, div in div_tuple_list:
+        appreciation_paragraphs = div.find_all("p")
+        appreciation_text = "".join(
+            p.get_text().strip() for p in appreciation_paragraphs
+        ).replace("\n", "")
+        if "展开阅读全文 ∨" in appreciation_text:
+            read_more_div = div.find("a", text="展开阅读全文 ∨")
+            if read_more_div:
+                href_attr = read_more_div.get("href")
+                match = re.search(r"shangxiShow\((\d+),'([A-Z0-9]+)'\)", href_attr)
+                if match:
+                    number = match.group(1)
+                    string = match.group(2)
+                    full_text_url = f"https://so.gushiwen.cn/nocdn/ajaxshangxi.aspx?id={number}&idjm={string}"
+                    soup_ = BeautifulSoup(fetch_html(full_text_url), "html.parser")
+                    paragraphs = soup_.find("div", class_="contyishang").find_all(
+                        "p"
+                    )
+                    appreciation_text = (
+                        "".join(p.get_text().strip() for p in paragraphs)
+                        .replace("\n", "")
+                        .replace("▲", "")
+                        .replace("\u3000", "")
+                    )
+        if len(div_tuple_list) == 1:
+            poem_details["appreciation"] = appreciation_text
+        elif len(div_tuple_list) > 1:
+            poem_details["appreciation"] += label + ":" + appreciation_text
 
-    background_divs = soup.find_all('div', class_='contyishang')
+    background_divs = soup.find_all("div", class_="contyishang")
     for div in background_divs:
-        if div.find('h2') and '创作背景' in div.find('h2').text:
-            background_paragraphs = div.find_all('p')
-            background_text = "".join(p.get_text().strip() for p in background_paragraphs).replace("\n", "")
+        if div.find("h2") and "创作背景" in div.find("h2").text:
+            background_paragraphs = div.find_all("p")
+            background_text = "".join(
+                p.get_text().strip() for p in background_paragraphs
+            ).replace("\n", "").replace("\u3000", "")
             poem_details["background"] = background_text
 
     return poem_details
 
 
 if __name__ == "__main__":
-    url = input("Please enter the URL(example:https://so.gushiwen.cn/gushi/tangshi.aspx): ")
+    url = input(
+        "Please enter the URL(example:https://so.gushiwen.cn/gushi/tangshi.aspx): "
+    )
     poem_urls = []
     html_content = fetch_html(url)
     if html_content:
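The tail of shige.py is truncated in this diff. For orientation, a minimal sketch of how the three functions compose into a crawl; the CSV output and the poems.csv filename are assumptions suggested by the *.csv entry in .gitignore, not code shown in this commit.

import csv

from shige import extract_poem_urls, fetch_html, fetch_poem_details

# Hypothetical driver; the real __main__ body is not shown in this diff.
list_url = "https://so.gushiwen.cn/gushi/tangshi.aspx"
html_content = fetch_html(list_url)
if html_content:
    with open("poems.csv", "w", newline="", encoding="utf-8-sig") as f:  # assumed filename
        writer = None
        for poem_url in extract_poem_urls(html_content):
            details = fetch_poem_details(poem_url)
            if writer is None:
                # Use the dict's own keys (name, author, dynasty, ...) as columns.
                writer = csv.DictWriter(f, fieldnames=list(details.keys()))
                writer.writeheader()
            writer.writerow(details)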
