From d02cb24c160311ce12de0900d98c18b39e00f25c Mon Sep 17 00:00:00 2001
From: raileo98 <164594063+raileo98@users.noreply.github.com>
Date: Tue, 22 Oct 2024 03:11:48 +0000
Subject: [PATCH] Update rthk.py
---
code/rthk.py | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/code/rthk.py b/code/rthk.py
index 5acbb14c7..71047a841 100644
--- a/code/rthk.py
+++ b/code/rthk.py
@@ -147,11 +147,11 @@ def parse_pub_date(date_str):
def get_item_pub_date(item):
pub_date = item.find('pubDate')
if pub_date:
- return pub_date.text
+ return pub_date.text.strip()
published = item.find('published')
if published:
- return published.text
+ return published.text.strip()
return None
@@ -214,7 +214,7 @@ async def process_category(category, url):
try:
response = await get_response(url)
if response.ok:
- web_content = response.text
+ web_content = response.text.strip()
else:
print(f'{category} 處理失敗,即將重試!')
logging.error(f'{category} 處理失敗,HTTP 狀態碼: {response.status_code}')
@@ -261,7 +261,7 @@ async def process_category(category, url):
item.description.string = CData(html.unescape(item.description.string.strip()))
if soup_rss.find('url') is not None:
- soup_rss.find('url').string = CData(html.unescape(soup_rss.find('url').string))
+ soup_rss.find('url').string = CData(html.unescape(soup_rss.find('url').string.strip()))
sorted_items = sorted(soup_rss.find_all('item'), key=lambda x: datetime.strptime(get_item_pub_date(x), '%a, %d %b %Y %H:%M:%S %z') if get_item_pub_date(x) else datetime.min, reverse=True)
@@ -279,7 +279,7 @@ async def process_category(category, url):
tag.decompose()
async with aiofiles.open(rss_filename, 'w', encoding='utf-8') as file:
- await file.write(soup_rss.prettify().rstrip())
+ await file.write(soup_rss.prettify().strip())
print(f'{category} 處理完成!')
@@ -288,17 +288,17 @@ async def process_article(fg, category, article):
try:
fe = fg.add_entry()
- articleTitle = article.select_one('.ns2-title').text
+ articleTitle = article.select_one('.ns2-title').text.strip()
articleLink = article.select_one('.ns2-title a')['href']
- articleLink = articleLink.replace('?spTabChangeable=0', '')
+ articleLink = articleLink.replace('?spTabChangeable=0', '').strip()
print( f'{articleTitle} started!' )
article_response = await get_response(articleLink)
- article_content = article_response.text
+ article_content = article_response.text.strip()
article_soup = BeautifulSoup(article_content, 'lxml')
- feedDescription = article_soup.select_one('.itemFullText').prettify()
+ feedDescription = article_soup.select_one('.itemFullText').prettify().strip()
# 處理圖片
images = article_soup.select('.items_content .imgPhotoAfterLoad')
@@ -374,12 +374,12 @@ async def process_article(fg, category, article):
# 緩存圖片
await asyncio.gather(*(cache_image(imageUrl) for imageUrl in imgList))
- pub_date = article.select_one('.ns2-created').text
+ pub_date = article.select_one('.ns2-created').text.strip()
formatted_pub_date = parse_pub_date(pub_date)
feedDescription = f'{imgHtml}
{feedDescription}
原始網址 Original URL:{articleLink}
© rthk.hk
電子郵件 Email: cnews@rthk.hk
'
- feedDescription = BeautifulSoup(feedDescription, 'lxml').prettify()
+ feedDescription = BeautifulSoup(feedDescription, 'lxml').prettify().strip()
fe.title(articleTitle)
fe.link(href=articleLink)