Merge pull request #7 from ping/main
upstream merge
holyspiritomb authored Sep 25, 2023
2 parents d6b620a + 7c3c9fc commit 75e9ac3
Showing 6 changed files with 80 additions and 37 deletions.
8 changes: 0 additions & 8 deletions _recipes.py
@@ -39,14 +39,6 @@

# Keep this list in alphabetical order
recipes: List[Recipe] = [
Recipe(
recipe="aeon",
slug="aeon",
src_ext="mobi",
target_ext=["epub"],
category="Online Magazines",
cover_options=CoverOptions(logo_path_or_url="https://aeon.co/logo.png"),
),
Recipe(
recipe="asahi-shimbun",
slug="asahi-shimbun",
1 change: 1 addition & 0 deletions recipes/aeon.recipe.py
@@ -1,3 +1,4 @@
# No longer working because css classes are dynamically generated
import os
import sys

42 changes: 26 additions & 16 deletions recipes/bloomberg-businessweek.recipe.py
@@ -39,13 +39,11 @@ class BloombergBusinessweek(BasicNewsRecipe):
date_format = "%I:%M%p, %-d %b, %Y" if iswindows else "%-I:%M%p, %-d %b, %Y"
requires_version = (6, 24, 0) # cos we're using get_url_specific_delay()

-# NOTES: Bot detection kicks in really easily so either:
-# - limit the number of feeds
-# - or max_articles_per_feed
-# - or increase delay
-delay = 5
+# NOTES: Bot detection kicks in really easily so if blocked:
+# - increase delay
+delay = 14  # not in use since we're using get_url_specific_delay()
+delay_range = range(12, 16)  # actual delay is a random choice from this
simultaneous_downloads = 1
-delay_range = range(5, 10)
oldest_article = 7
max_articles_per_feed = 25
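
The delay attribute is noted as unused because the recipe overrides calibre's get_url_specific_delay() hook (hence requires_version = (6, 24, 0)) and draws a fresh pause from delay_range for every request. A minimal sketch of what such an override can look like, assuming calibre's BasicNewsRecipe hook; the body here is illustrative rather than the recipe's exact implementation:

    import random

    def get_url_specific_delay(self, url):
        # Vary the pause per request so the crawl cadence is not constant,
        # which is what tends to trip Bloomberg's bot detection.
        return random.choice(self.delay_range)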

@@ -87,6 +85,7 @@ class BloombergBusinessweek(BasicNewsRecipe):
display: block; font-size: 0.8rem; margin-top: 0.2rem;
}
.trashline { font-style: italic; }
blockquote p { font-size: 1.25rem; margin-left: 0; text-align: center; }
"""

# We send no cookies to avoid triggering bot detection
@@ -116,12 +115,13 @@ def open_novisit(self, *args, **kwargs):
),
("accept-language", "en,en-US;q=0.5"),
("connection", "keep-alive"),
("host", urlparse(target_url).hostname),
("upgrade-insecure-requests", "1"),
("user-agent", random_user_agent(0, allow_ie=False)),
]
br.set_handle_redirect(False)
try:
res = br.open_novisit(*args, **kwargs)
self.download_count += 1
return res
except Exception as e:
is_redirected_to_challenge = False
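
The br.set_handle_redirect(False) call above, paired with the try/except, is what makes block detection possible: with redirects disabled, a 3xx to Bloomberg's challenge page surfaces as an exception the recipe can inspect instead of being followed silently. A sketch of that pattern, assuming a urllib/mechanize-style HTTPError; the challenge check is illustrative, not the recipe's exact logic:

    from urllib.error import HTTPError

    def fetch_with_block_detection(self, br, target_url):
        # Sketch only: bot_blocked and abort_article mirror names used in this
        # recipe, but the challenge detection below is illustrative.
        br.set_handle_redirect(False)  # 3xx responses raise instead of being followed
        try:
            return br.open_novisit(target_url)
        except HTTPError as e:
            location = (e.headers.get("Location") or "") if e.headers else ""
            if e.code in (301, 302, 303, 307, 308) and "captcha" in location.lower():
                self.bot_blocked = True  # later requests short-circuit on this flag
                self.abort_article(f"Blocked: {target_url}")
            raise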
@@ -140,7 +140,7 @@ def open_novisit(self, *args, **kwargs):
open = open_novisit

def cleanup(self):
-if self.download_count <= 1 + (1 if self.masthead_url else 0):
+if self.download_count <= 0:
err_msg = "No articles downloaded."
self.log.warn(err_msg)
self.abort_recipe_processing(err_msg)
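
Taken together, the download_count changes in this file alter what is being counted: the counter now increments in preprocess_raw_html() (see the hunk further down), i.e. once per article that actually reaches parsing rather than once per raw request, so cleanup() can simply check for zero. In outline, a sketch with the rest of the recipe omitted:

    from calibre.web.feeds.news import BasicNewsRecipe

    class SketchRecipe(BasicNewsRecipe):
        download_count = 0  # parsed articles only; feed/masthead fetches no longer count

        def preprocess_raw_html(self, raw_html, url):
            self.download_count += 1  # one article page successfully fetched
            return raw_html

        def cleanup(self):
            if self.download_count <= 0:
                self.abort_recipe_processing("No articles downloaded.")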
@@ -229,7 +229,9 @@ def render_content(self, content, soup, parent):
div.append(img)
if photo.get("caption"):
caption = soup.new_tag("div", attrs={"class": "caption"})
-caption.append(photo["caption"])
+caption.append(
+    BeautifulSoup(photo["caption"], features="html.parser")
+)
div.append(caption)
if photo.get("credit"):
credit = soup.new_tag("div", attrs={"class": "credit"})
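
The recurring edit in this and the following hunks is the same: caption, byline, and section strings contain HTML fragments, so they are now parsed with BeautifulSoup(..., features="html.parser") before being appended rather than appended as plain strings (which bs4 would escape on output), and naming the parser also avoids bs4's parser-guessing warning. A small self-contained illustration; the caption string is made up:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<div></div>", features="html.parser")
    caption_html = 'Photographer: <em>Jane Doe</em>/Bloomberg'  # hypothetical caption

    as_text = soup.new_tag("div", attrs={"class": "caption"})
    as_text.append(caption_html)  # appended as text, so <em> is escaped on output

    as_markup = soup.new_tag("div", attrs={"class": "caption"})
    as_markup.append(BeautifulSoup(caption_html, features="html.parser"))  # keeps <em>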
@@ -285,6 +287,7 @@ def nested_render(self, content, soup, parent):
parent.append(content_ele)

def preprocess_raw_html(self, raw_html, url):
self.download_count += 1
article = None
soup = BeautifulSoup(raw_html)
for script in soup.find_all(
@@ -294,10 +297,10 @@ def preprocess_raw_html(self, raw_html, url):
"data-component-props": ["ArticleBody", "FeatureBody"],
},
):
-            j = json.loads(script.contents[0])
-            if not j.get("story"):
+            article = json.loads(script.contents[0])
+            if not article.get("story"):
+                article = None
                 continue
-            article = j
break
if not article:
script = soup.find(
@@ -361,7 +364,10 @@ def preprocess_raw_html(self, raw_html, url):
if article.get("byline"):
soup.find(class_="article-meta").insert(
0,
-BeautifulSoup(f'<span class="author">{article["byline"]}</span>'),
+BeautifulSoup(
+    f'<span class="author">{article["byline"]}</span>',
+    features="html.parser",
+),
)
else:
try:
@@ -370,7 +376,8 @@ def preprocess_raw_html(self, raw_html, url):
soup.find(class_="article-meta").insert(
0,
BeautifulSoup(
-f'<span class="author">{", ".join(post_authors)}</span>'
+f'<span class="author">{", ".join(post_authors)}</span>',
+features="html.parser",
),
)
except (KeyError, TypeError):
@@ -381,7 +388,8 @@ def preprocess_raw_html(self, raw_html, url):
soup.body.article.insert(
0,
BeautifulSoup(
-f'<span class="article-section">{" / ".join(categories)}</span>'
+f'<span class="article-section">{" / ".join(categories)}</span>',
+features="html.parser",
),
)
# inject lede image
@@ -395,7 +403,9 @@ def preprocess_raw_html(self, raw_html, url):
caption_ele = soup.new_tag(
"div", attrs={"class": "news-figure-caption-text"}
)
-caption_ele.append(BeautifulSoup(lede_img_caption_html))
+caption_ele.append(
+    BeautifulSoup(lede_img_caption_html, features="html.parser")
+)
img_container.append(caption_ele)
soup.body.article.append(img_container)

49 changes: 37 additions & 12 deletions recipes/bloomberg-news.recipe.py
@@ -9,7 +9,9 @@
import json
import random
import re
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from urllib.parse import urlparse

from calibre import browser, iswindows, random_user_agent
@@ -42,11 +44,11 @@ class BloombergNews(BasicNewsRecipe):
# - limit the number of feeds
# - or max_articles_per_feed
# - or increase delay
-delay = 5
+delay = 18  # not in use since we're using get_url_specific_delay()
+delay_range = range(16, 20)  # actual delay is a random choice from this
simultaneous_downloads = 1
-delay_range = range(5, 10)
oldest_article = 1
-max_articles_per_feed = 25
+max_articles_per_feed = 15

compress_news_images_auto_size = 8
bot_blocked = False
@@ -109,6 +111,20 @@ def get_url_specific_delay(self, url):

def open_novisit(self, *args, **kwargs):
target_url = args[0]
for u in ("/videos/", "/audio/"):
if u in target_url:
self.log.info(f"Skipping multimedia article: {target_url}")
self.abort_article(f"Multimedia article. Skipped {target_url}")

parsed_url = urlparse(target_url)
if (
parsed_url.hostname == "www.bloomberg.com"
and Path(parsed_url.path).suffix.lower() == ".xml"
):
# get_url_specific_delay() does not get called for feed urls,
# so we'll have to implement the delay here
time.sleep(random.choice(self.delay_range))

if self.bot_blocked:
self.log.warn(f"Block detected. Skipping {target_url}")
# Abort article without making actual request
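
The feed-URL check added above relies on urlparse() for the hostname and Path().suffix for the .xml extension (hence the new time and pathlib imports); since get_url_specific_delay() is only invoked for article URLs, the feed fetches get their randomised pause here instead. A quick self-contained check of that URL test, using an example sitemap URL that is not taken from the recipe:

    import random
    import time
    from pathlib import Path
    from urllib.parse import urlparse

    feed_url = "https://www.bloomberg.com/feeds/sitemap_news.xml"  # example URL only
    parsed_url = urlparse(feed_url)
    if (
        parsed_url.hostname == "www.bloomberg.com"
        and Path(parsed_url.path).suffix.lower() == ".xml"
    ):
        time.sleep(random.choice(range(16, 20)))  # mirrors the recipe's delay_range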
@@ -122,12 +138,13 @@ def open_novisit(self, *args, **kwargs):
),
("accept-language", "en,en-US;q=0.5"),
("connection", "keep-alive"),
("host", urlparse(target_url).hostname),
("upgrade-insecure-requests", "1"),
("user-agent", random_user_agent(0, allow_ie=False)),
]
br.set_handle_redirect(False)
try:
res = br.open_novisit(*args, **kwargs)
self.download_count += 1
return res
except Exception as e:
is_redirected_to_challenge = False
@@ -146,7 +163,7 @@ def open_novisit(self, *args, **kwargs):
open = open_novisit

def cleanup(self):
-if self.download_count <= len(self.feeds) + (1 if self.masthead_url else 0):
+if self.download_count <= 0:
err_msg = "No articles downloaded."
self.log.warn(err_msg)
self.abort_recipe_processing(err_msg)
@@ -291,6 +308,7 @@ def nested_render(self, content, soup, parent):
parent.append(content_ele)

def preprocess_raw_html(self, raw_html, url):
self.download_count += 1
article = None
soup = BeautifulSoup(raw_html)
for script in soup.find_all(
@@ -300,10 +318,10 @@ def preprocess_raw_html(self, raw_html, url):
"data-component-props": ["ArticleBody", "FeatureBody"],
},
):
-            j = json.loads(script.contents[0])
-            if not j.get("story"):
+            article = json.loads(script.contents[0])
+            if not article.get("story"):
+                article = None
                 continue
-            article = j
break
if not article:
script = soup.find(
@@ -368,7 +386,10 @@ def preprocess_raw_html(self, raw_html, url):
if article.get("byline"):
soup.find(class_="article-meta").insert(
0,
-BeautifulSoup(f'<span class="author">{article["byline"]}</span>'),
+BeautifulSoup(
+    f'<span class="author">{article["byline"]}</span>',
+    features="html.parser",
+),
)
else:
try:
@@ -377,7 +398,8 @@ def preprocess_raw_html(self, raw_html, url):
soup.find(class_="article-meta").insert(
0,
BeautifulSoup(
-f'<span class="author">{", ".join(post_authors)}</span>'
+f'<span class="author">{", ".join(post_authors)}</span>',
+features="html.parser",
),
)
except (KeyError, TypeError):
@@ -388,7 +410,8 @@ def preprocess_raw_html(self, raw_html, url):
soup.body.article.insert(
0,
BeautifulSoup(
-f'<span class="article-section">{" / ".join(categories)}</span>'
+f'<span class="article-section">{" / ".join(categories)}</span>',
+features="html.parser",
),
)
# inject lede image
@@ -402,7 +425,9 @@ def preprocess_raw_html(self, raw_html, url):
caption_ele = soup.new_tag(
"div", attrs={"class": "news-figure-caption-text"}
)
-caption_ele.append(BeautifulSoup(lede_img_caption_html))
+caption_ele.append(
+    BeautifulSoup(lede_img_caption_html, features="html.parser")
+)
img_container.append(caption_ele)
soup.body.article.append(img_container)

15 changes: 15 additions & 0 deletions static/site.js
@@ -178,7 +178,11 @@ https://opensource.org/licenses/GPL-3.0
}
}
const contents = this.parentElement.parentElement.querySelector(".contents");
const tags = this.parentElement.querySelector(".tags");
this.classList.toggle("is-open");
if (tags) {
tags.classList.toggle("hide");
}
contents.classList.toggle("hide"); // content
const publication_id = this.parentElement.dataset["pubId"];
if (contents.childElementCount <= 0 && RECIPE_DESCRIPTIONS[publication_id] !== undefined) {
@@ -276,9 +280,13 @@ https://opensource.org/licenses/GPL-3.0
const periodical = periodicalsEles[i];
periodical.classList.remove("hide");
const pubDate = periodical.querySelector(".pub-date");
const tags = periodical.querySelector(".tags");
if (pubDate) {
pubDate.classList.remove("is-open");
}
if (tags) {
tags.classList.remove("hide");
}
const contents = periodical.querySelector(".contents");
if (contents) {
contents.classList.add("hide");
@@ -397,9 +405,13 @@ https://opensource.org/licenses/GPL-3.0
}
}
const pubDateEle = periodical.querySelector(".pub-date");
const tags = periodical.querySelector(".tags");

if (resultsSumm[id]["articles"]) {
pubDateEle.classList.add("is-open");
if (tags) {
tags.classList.add("hide");
}
if (contentsEle) {
contentsEle.classList.remove("hide");
const positions = resultsSumm[id]["articles"];
@@ -409,6 +421,9 @@ https://opensource.org/licenses/GPL-3.0
if (resultsSumm[id]["title"]) {
if (!resultsSumm[id]["articles"]) {
pubDateEle.classList.remove("is-open");
if (tags) {
tags.classList.remove("hide");
}
if (contentsEle) {
contentsEle.classList.add("hide");
}
2 changes: 1 addition & 1 deletion static/site.scss
@@ -373,7 +373,7 @@ footer {
}

.hide {
-display: none;
+display: none !important;
}

[data-theme="dark"] {
