Merge pull request #7 from ping/main
upstream merge
holyspiritomb authored Sep 25, 2023
2 parents d6b620a + 7c3c9fc commit 75e9ac3
Showing 6 changed files with 80 additions and 37 deletions.
8 changes: 0 additions & 8 deletions _recipes.py
@@ -39,14 +39,6 @@

# Keep this list in alphabetical order
recipes: List[Recipe] = [
Recipe(
recipe="aeon",
slug="aeon",
src_ext="mobi",
target_ext=["epub"],
category="Online Magazines",
cover_options=CoverOptions(logo_path_or_url="https://aeon.co/logo.png"),
),
Recipe(
recipe="asahi-shimbun",
slug="asahi-shimbun",
1 change: 1 addition & 0 deletions recipes/aeon.recipe.py
@@ -1,3 +1,4 @@
# No longer working because css classes are dynamically generated
import os
import sys

42 changes: 26 additions & 16 deletions recipes/bloomberg-businessweek.recipe.py
@@ -39,13 +39,11 @@ class BloombergBusinessweek(BasicNewsRecipe):
date_format = "%I:%M%p, %-d %b, %Y" if iswindows else "%-I:%M%p, %-d %b, %Y"
requires_version = (6, 24, 0) # cos we're using get_url_specific_delay()

-# NOTES: Bot detection kicks in really easily so either:
-# - limit the number of feeds
-# - or max_articles_per_feed
-# - or increase delay
-delay = 5
+# NOTES: Bot detection kicks in really easily so if blocked:
+# - increase delay
+delay = 14  # not in use since we're using get_url_specific_delay()
+delay_range = range(12, 16)  # actual delay is a random choice from this
simultaneous_downloads = 1
-delay_range = range(5, 10)
oldest_article = 7
max_articles_per_feed = 25
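
The delay attribute is noted as unused because the recipe overrides calibre's get_url_specific_delay() hook (hence requires_version = (6, 24, 0)) and draws a fresh pause from delay_range for every request. A minimal sketch of what such an override can look like, assuming calibre's BasicNewsRecipe hook; the body here is illustrative rather than the recipe's exact implementation:

    import random

    def get_url_specific_delay(self, url):
        # Vary the pause per request so the crawl cadence is not constant,
        # which is what tends to trip Bloomberg's bot detection.
        return random.choice(self.delay_range)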

@@ -87,6 +85,7 @@ class BloombergBusinessweek(BasicNewsRecipe):
display: block; font-size: 0.8rem; margin-top: 0.2rem;
}
.trashline { font-style: italic; }
blockquote p { font-size: 1.25rem; margin-left: 0; text-align: center; }
"""

# We send no cookies to avoid triggering bot detection
@@ -116,12 +115,13 @@ def open_novisit(self, *args, **kwargs):
),
("accept-language", "en,en-US;q=0.5"),
("connection", "keep-alive"),
("host", urlparse(target_url).hostname),
("upgrade-insecure-requests", "1"),
("user-agent", random_user_agent(0, allow_ie=False)),
]
br.set_handle_redirect(False)
try:
res = br.open_novisit(*args, **kwargs)
self.download_count += 1
return res
except Exception as e:
is_redirected_to_challenge = False
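
The br.set_handle_redirect(False) call above, paired with the try/except, is what makes block detection possible: with redirects disabled, a 3xx to Bloomberg's challenge page surfaces as an exception the recipe can inspect instead of being followed silently. A sketch of that pattern, assuming a urllib/mechanize-style HTTPError; the challenge check is illustrative, not the recipe's exact logic:

    from urllib.error import HTTPError

    def fetch_with_block_detection(self, br, target_url):
        # Sketch only: bot_blocked and abort_article mirror names used in this
        # recipe, but the challenge detection below is illustrative.
        br.set_handle_redirect(False)  # 3xx responses raise instead of being followed
        try:
            return br.open_novisit(target_url)
        except HTTPError as e:
            location = (e.headers.get("Location") or "") if e.headers else ""
            if e.code in (301, 302, 303, 307, 308) and "captcha" in location.lower():
                self.bot_blocked = True  # later requests short-circuit on this flag
                self.abort_article(f"Blocked: {target_url}")
            raise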
@@ -140,7 +140,7 @@ def open_novisit(self, *args, **kwargs):
open = open_novisit

def cleanup(self):
-if self.download_count <= 1 + (1 if self.masthead_url else 0):
+if self.download_count <= 0:
err_msg = "No articles downloaded."
self.log.warn(err_msg)
self.abort_recipe_processing(err_msg)
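
Taken together, the download_count changes in this file alter what is being counted: the counter now increments in preprocess_raw_html() (see the hunk further down), i.e. once per article that actually reaches parsing rather than once per raw request, so cleanup() can simply check for zero. In outline, a sketch with the rest of the recipe omitted:

    from calibre.web.feeds.news import BasicNewsRecipe

    class SketchRecipe(BasicNewsRecipe):
        download_count = 0  # parsed articles only; feed/masthead fetches no longer count

        def preprocess_raw_html(self, raw_html, url):
            self.download_count += 1  # one article page successfully fetched
            return raw_html

        def cleanup(self):
            if self.download_count <= 0:
                self.abort_recipe_processing("No articles downloaded.")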
@@ -229,7 +229,9 @@ def render_content(self, content, soup, parent):
div.append(img)
if photo.get("caption"):
caption = soup.new_tag("div", attrs={"class": "caption"})
-caption.append(photo["caption"])
+caption.append(
+    BeautifulSoup(photo["caption"], features="html.parser")
+)
div.append(caption)
if photo.get("credit"):
credit = soup.new_tag("div", attrs={"class": "credit"})
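
The recurring edit in this and the following hunks is the same: caption, byline, and section strings contain HTML fragments, so they are now parsed with BeautifulSoup(..., features="html.parser") before being appended rather than appended as plain strings (which bs4 would escape on output), and naming the parser also avoids bs4's parser-guessing warning. A small self-contained illustration; the caption string is made up:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<div></div>", features="html.parser")
    caption_html = 'Photographer: <em>Jane Doe</em>/Bloomberg'  # hypothetical caption

    as_text = soup.new_tag("div", attrs={"class": "caption"})
    as_text.append(caption_html)  # appended as text, so <em> is escaped on output

    as_markup = soup.new_tag("div", attrs={"class": "caption"})
    as_markup.append(BeautifulSoup(caption_html, features="html.parser"))  # keeps <em>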
@@ -285,6 +287,7 @@ def nested_render(self, content, soup, parent):
parent.append(content_ele)

def preprocess_raw_html(self, raw_html, url):
self.download_count += 1
article = None
soup = BeautifulSoup(raw_html)
for script in soup.find_all(
@@ -294,10 +297,10 @@ def preprocess_raw_html(self, raw_html, url):
"data-component-props": ["ArticleBody", "FeatureBody"],
},
):
-            j = json.loads(script.contents[0])
-            if not j.get("story"):
+            article = json.loads(script.contents[0])
+            if not article.get("story"):
+                article = None
                 continue
-            article = j
break
if not article:
script = soup.find(
@@ -361,7 +364,10 @@ def preprocess_raw_html(self, raw_html, url):
if article.get("byline"):
soup.find(class_="article-meta").insert(
0,
-BeautifulSoup(f'<span class="author">{article["byline"]}</span>'),
+BeautifulSoup(
+    f'<span class="author">{article["byline"]}</span>',
+    features="html.parser",
+),
)
else:
try:
@@ -370,7 +376,8 @@ def preprocess_raw_html(self, raw_html, url):
soup.find(class_="article-meta").insert(
0,
BeautifulSoup(
-f'<span class="author">{", ".join(post_authors)}</span>'
+f'<span class="author">{", ".join(post_authors)}</span>',
+features="html.parser",
),
)
except (KeyError, TypeError):
@@ -381,7 +388,8 @@ def preprocess_raw_html(self, raw_html, url):
soup.body.article.insert(
0,
BeautifulSoup(
-f'<span class="article-section">{" / ".join(categories)}</span>'
+f'<span class="article-section">{" / ".join(categories)}</span>',
+features="html.parser",
),
)
# inject lede image
@@ -395,7 +403,9 @@ def preprocess_raw_html(self, raw_html, url):
caption_ele = soup.new_tag(
"div", attrs={"class": "news-figure-caption-text"}
)
-caption_ele.append(BeautifulSoup(lede_img_caption_html))
+caption_ele.append(
+    BeautifulSoup(lede_img_caption_html, features="html.parser")
+)
img_container.append(caption_ele)
soup.body.article.append(img_container)

49 changes: 37 additions & 12 deletions recipes/bloomberg-news.recipe.py
@@ -9,7 +9,9 @@
import json
import random
import re
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from urllib.parse import urlparse

from calibre import browser, iswindows, random_user_agent
@@ -42,11 +44,11 @@ class BloombergNews(BasicNewsRecipe):
# - limit the number of feeds
# - or max_articles_per_feed
# - or increase delay
-delay = 5
+delay = 18  # not in use since we're using get_url_specific_delay()
+delay_range = range(16, 20)  # actual delay is a random choice from this
simultaneous_downloads = 1
-delay_range = range(5, 10)
oldest_article = 1
-max_articles_per_feed = 25
+max_articles_per_feed = 15

compress_news_images_auto_size = 8
bot_blocked = False
@@ -109,6 +111,20 @@ def get_url_specific_delay(self, url):

def open_novisit(self, *args, **kwargs):
target_url = args[0]
for u in ("/videos/", "/audio/"):
if u in target_url:
self.log.info(f"Skipping multimedia article: {target_url}")
self.abort_article(f"Multimedia article. Skipped {target_url}")

parsed_url = urlparse(target_url)
if (
parsed_url.hostname == "www.bloomberg.com"
and Path(parsed_url.path).suffix.lower() == ".xml"
):
# get_url_specific_delay() does not get called for feed urls,
# so we'll have to implement the delay here
time.sleep(random.choice(self.delay_range))

if self.bot_blocked:
self.log.warn(f"Block detected. Skipping {target_url}")
# Abort article without making actual request
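
The feed-URL check added above relies on urlparse() for the hostname and Path().suffix for the .xml extension (hence the new time and pathlib imports); since get_url_specific_delay() is only invoked for article URLs, the feed fetches get their randomised pause here instead. A quick self-contained check of that URL test, using an example sitemap URL that is not taken from the recipe:

    import random
    import time
    from pathlib import Path
    from urllib.parse import urlparse

    feed_url = "https://www.bloomberg.com/feeds/sitemap_news.xml"  # example URL only
    parsed_url = urlparse(feed_url)
    if (
        parsed_url.hostname == "www.bloomberg.com"
        and Path(parsed_url.path).suffix.lower() == ".xml"
    ):
        time.sleep(random.choice(range(16, 20)))  # mirrors the recipe's delay_range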
@@ -122,12 +138,13 @@ def open_novisit(self, *args, **kwargs):
),
("accept-language", "en,en-US;q=0.5"),
("connection", "keep-alive"),
("host", urlparse(target_url).hostname),
("upgrade-insecure-requests", "1"),
("user-agent", random_user_agent(0, allow_ie=False)),
]
br.set_handle_redirect(False)
try:
res = br.open_novisit(*args, **kwargs)
self.download_count += 1
return res
except Exception as e:
is_redirected_to_challenge = False
@@ -146,7 +163,7 @@ def open_novisit(self, *args, **kwargs):
open = open_novisit

def cleanup(self):
-if self.download_count <= len(self.feeds) + (1 if self.masthead_url else 0):
+if self.download_count <= 0:
err_msg = "No articles downloaded."
self.log.warn(err_msg)
self.abort_recipe_processing(err_msg)
@@ -291,6 +308,7 @@ def nested_render(self, content, soup, parent):
parent.append(content_ele)

def preprocess_raw_html(self, raw_html, url):
self.download_count += 1
article = None
soup = BeautifulSoup(raw_html)
for script in soup.find_all(
@@ -300,10 +318,10 @@ def preprocess_raw_html(self, raw_html, url):
"data-component-props": ["ArticleBody", "FeatureBody"],
},
):
-            j = json.loads(script.contents[0])
-            if not j.get("story"):
+            article = json.loads(script.contents[0])
+            if not article.get("story"):
+                article = None
                 continue
-            article = j
break
if not article:
script = soup.find(
@@ -368,7 +386,10 @@ def preprocess_raw_html(self, raw_html, url):
if article.get("byline"):
soup.find(class_="article-meta").insert(
0,
-BeautifulSoup(f'<span class="author">{article["byline"]}</span>'),
+BeautifulSoup(
+    f'<span class="author">{article["byline"]}</span>',
+    features="html.parser",
+),
)
else:
try:
@@ -377,7 +398,8 @@ def preprocess_raw_html(self, raw_html, url):
soup.find(class_="article-meta").insert(
0,
BeautifulSoup(
-f'<span class="author">{", ".join(post_authors)}</span>'
+f'<span class="author">{", ".join(post_authors)}</span>',
+features="html.parser",
),
)
except (KeyError, TypeError):
@@ -388,7 +410,8 @@ def preprocess_raw_html(self, raw_html, url):
soup.body.article.insert(
0,
BeautifulSoup(
-f'<span class="article-section">{" / ".join(categories)}</span>'
+f'<span class="article-section">{" / ".join(categories)}</span>',
+features="html.parser",
),
)
# inject lede image
@@ -402,7 +425,9 @@ def preprocess_raw_html(self, raw_html, url):
caption_ele = soup.new_tag(
"div", attrs={"class": "news-figure-caption-text"}
)
-caption_ele.append(BeautifulSoup(lede_img_caption_html))
+caption_ele.append(
+    BeautifulSoup(lede_img_caption_html, features="html.parser")
+)
img_container.append(caption_ele)
soup.body.article.append(img_container)

15 changes: 15 additions & 0 deletions static/site.js
@@ -178,7 +178,11 @@ https://opensource.org/licenses/GPL-3.0
}
}
const contents = this.parentElement.parentElement.querySelector(".contents");
const tags = this.parentElement.querySelector(".tags");
this.classList.toggle("is-open");
if (tags) {
tags.classList.toggle("hide");
}
contents.classList.toggle("hide"); // content
const publication_id = this.parentElement.dataset["pubId"];
if (contents.childElementCount <= 0 && RECIPE_DESCRIPTIONS[publication_id] !== undefined) {
@@ -276,9 +280,13 @@ https://opensource.org/licenses/GPL-3.0
const periodical = periodicalsEles[i];
periodical.classList.remove("hide");
const pubDate = periodical.querySelector(".pub-date");
const tags = periodical.querySelector(".tags");
if (pubDate) {
pubDate.classList.remove("is-open");
}
if (tags) {
tags.classList.remove("hide");
}
const contents = periodical.querySelector(".contents");
if (contents) {
contents.classList.add("hide");
@@ -397,9 +405,13 @@ https://opensource.org/licenses/GPL-3.0
}
}
const pubDateEle = periodical.querySelector(".pub-date");
const tags = periodical.querySelector(".tags");

if (resultsSumm[id]["articles"]) {
pubDateEle.classList.add("is-open");
if (tags) {
tags.classList.add("hide");
}
if (contentsEle) {
contentsEle.classList.remove("hide");
const positions = resultsSumm[id]["articles"];
@@ -409,6 +421,9 @@ https://opensource.org/licenses/GPL-3.0
if (resultsSumm[id]["title"]) {
if (!resultsSumm[id]["articles"]) {
pubDateEle.classList.remove("is-open");
if (tags) {
tags.classList.remove("hide");
}
if (contentsEle) {
contentsEle.classList.add("hide");
}
2 changes: 1 addition & 1 deletion static/site.scss
@@ -373,7 +373,7 @@ footer {
}

.hide {
-display: none;
+display: none !important;
}

[data-theme="dark"] {
