pittcsc · tianyizheng02 · Dec 24, 2024 · Dec 25, 2024 · Dec 29, 2024 · RitwikGupta
diff --git a/pittapi/news.py b/pittapi/news.py
@@ -19,50 +19,20 @@
 
 from __future__ import annotations
 
+from functools import cache
 import math
 from requests_html import Element, HTMLResponse, HTMLSession
-from typing import Literal, NamedTuple
+from typing import NamedTuple
 
 NUM_ARTICLES_PER_PAGE = 20
 
-NEWS_BY_CATEGORY_URL = (
-    "https://www.pitt.edu/pittwire/news/{category}?field_topics_target_id={topic_id}&field_article_date_value={year}"
+PITT_BASE_URL = "https://www.pitt.edu"
+PITTWIRE_URL = PITT_BASE_URL + "/pittwire"
+FEATURES_ARTICLES_URL = PITTWIRE_URL + "/news/features-articles"
+NEWS_BY_CATEGORY_URL = PITTWIRE_URL + (
+    "/news/{category}?field_topics_target_id={topic_id}&field_article_date_value={year}"
     "&title={query}&field_category_target_id=All&page={page_num}"
 )
-PITT_BASE_URL = "https://www.pitt.edu"
-
-Category = Literal["features-articles", "accolades-honors", "ones-to-watch", "announcements-and-updates"]
-Topic = Literal[
-    "university-news",
-    "health-and-wellness",
-    "technology-and-science",
-    "arts-and-humanities",
-    "community-impact",
-    "innovation-and-research",
-    "global",
-    "diversity-equity-and-inclusion",
-    "our-city-our-campus",
-    "teaching-and-learning",
-    "space",
-    "ukraine",
-    "sustainability",
-]
-
-TOPIC_ID_MAP: dict[Topic, int] = {
-    "university-news": 432,
-    "health-and-wellness": 2,
-    "technology-and-science": 391,
-    "arts-and-humanities": 4,
-    "community-impact": 6,
-    "innovation-and-research": 1,
-    "global": 9,
-    "diversity-equity-and-inclusion": 8,
-    "our-city-our-campus": 12,
-    "teaching-and-learning": 7,
-    "space": 440,
-    "ukraine": 441,
-    "sustainability": 470,
-}
 
 sess = HTMLSession()
 
@@ -87,18 +57,51 @@ def from_html(cls, article_html: Element) -> Article:
         return cls(title=article_title, description=article_description, url=article_url, tags=article_tags)
 
 
-def _get_page_articles(
-    topic: Topic,
-    category: Category,
-    query: str,
-    year: int | None,
-    page_num: int,
-) -> list[Article]:
+@cache
+def _scrape_categories() -> dict[str, str]:
+    """Scrape the Pittwire landing page for the available news categories.
+
+    The result is memoized with ``functools.cache``, so the page is requested at most once until
+    ``_scrape_categories.cache_clear()`` is called.
+
+    Returns:
+        A mapping from each category's menu text to the last path segment of its link, which is the
+        value substituted for ``{category}`` in ``NEWS_BY_CATEGORY_URL``.
+
+    Raises:
+        RuntimeError: If the category menu is missing from the page or yields no usable categories.
+    """
+    response: HTMLResponse = sess.get(PITTWIRE_URL)
+    category_menu: Element | None = response.html.find(
+        "div#block-views-block-category-menu-category-menu", first=True
+    )
+    # find(..., first=True) yields None when nothing matches. Treat a missing menu like an empty one so
+    # the caller gets the actionable RuntimeError below instead of an AttributeError on None.
+    category_list: list[Element] = (
+        category_menu.find("ul.hamburger-menu-list li") if category_menu is not None else []
+    )
+    category_map: dict[str, str] = {}
+    for category in category_list:
+        category_link: Element | None = category.find("a", first=True)
+        category_href = category_link.attrs.get("href") if category_link is not None else None
+        if not category_href:
+            continue  # A menu entry without a link cannot be mapped to a URL name
+        # Strip a trailing slash first so ".../features-articles/" does not map to an empty name
+        category_url_name = category_href.rstrip("/").split("/")[-1]
+        category_map[category.text.strip()] = category_url_name
+    if not category_map:
+        raise RuntimeError("No categories found, please open a GitHub issue")
+    return category_map
+
+
+@cache
+def _scrape_topics() -> dict[str, int]:
+    """Scrape the "Features & Articles" page for the available news topics.
+
+    The result is memoized with ``functools.cache``, so the page is requested at most once until
+    ``_scrape_topics.cache_clear()`` is called.
+
+    Returns:
+        A mapping from each topic option's text to its integer ``value`` attribute, which is the value
+        substituted for ``{topic_id}`` in ``NEWS_BY_CATEGORY_URL``.
+
+    Raises:
+        RuntimeError: If the topic selector is missing from the page or yields no topics.
+        ValueError: If an option other than the "All" placeholder has a non-integer value.
+    """
+    response: HTMLResponse = sess.get(FEATURES_ARTICLES_URL)
+    main_content: Element | None = response.html.xpath("/html/body/div/main/div/section", first=True)
+    # Both lookups yield None when nothing matches. Fall through to an empty option list in that case so
+    # the caller gets the actionable RuntimeError below instead of an AttributeError on None.
+    topic_fieldset: Element | None = (
+        main_content.find("fieldset.form-item-field-topics-target-id", first=True)
+        if main_content is not None
+        else None
+    )
+    topic_options: list[Element] = topic_fieldset.find("option") if topic_fieldset is not None else []
+    topic_map: dict[str, int] = {}
+    for topic_option in topic_options:
+        if (topic_id := topic_option.attrs["value"].strip()) == "All":  # Skip placeholder "Topics" option
+            continue
+        topic_name = topic_option.text.strip()
+        topic_map[topic_name] = int(topic_id)
+    if not topic_map:
+        raise RuntimeError("No topics found, please open a GitHub issue")
+    return topic_map
+
+
+def _get_page_articles(topic: str, category: str, query: str, year: int | None, page_num: int) -> list[Article]:
+    topic_id_map = _scrape_topics()
+    category_url_name_map = _scrape_categories()
     year_str = str(year) if year else ""
     page_num_str = str(page_num) if page_num else ""
+
     response: HTMLResponse = sess.get(
         NEWS_BY_CATEGORY_URL.format(
-            category=category, topic_id=TOPIC_ID_MAP[topic], year=year_str, query=query, page_num=page_num_str
+            category=category_url_name_map[category],
+            topic_id=topic_id_map[topic],
+            year=year_str,
+            query=query,
+            page_num=page_num_str,
         )
     )
     main_content: Element = response.html.xpath("/html/body/div/main/div/section", first=True)
@@ -107,13 +110,33 @@ def _get_page_articles(
     return page_articles
 
 
+@cache
+def get_categories() -> list[str]:
+    """Return the names of all Pittwire news categories, as accepted by ``get_articles_by_topic``.
+
+    The names are the keys of the scraped category map. Because of ``@cache``, the same list object is
+    handed back on every call until ``get_categories.cache_clear()`` is invoked, so copy it before mutating.
+    """
+    return list(_scrape_categories())
+
+
+@cache
+def get_topics() -> list[str]:
+    """Return the names of all Pittwire news topics, as accepted by ``get_articles_by_topic``.
+
+    The names are the keys of the scraped topic map. Because of ``@cache``, the same list object is
+    handed back on every call until ``get_topics.cache_clear()`` is invoked, so copy it before mutating.
+    """
+    return list(_scrape_topics())
+
+
 def get_articles_by_topic(
-    topic: Topic,
-    category: Category = "features-articles",
+    topic: str,
+    category: str = "Features & Articles",
     query: str = "",
     year: int | None = None,
     max_num_results: int = NUM_ARTICLES_PER_PAGE,
 ) -> list[Article]:
+    topic_id_map = _scrape_topics()
+    if topic not in topic_id_map:
+        raise ValueError(f"'{topic}' is not a valid topic, must be one of the following: {get_topics()}")
+
+    category_url_name_map = _scrape_categories()
+    if category not in category_url_name_map:
+        raise ValueError(f"'{category}' is not a valid category, must be one of the following: {get_categories()}")
+
     num_pages = math.ceil(max_num_results / NUM_ARTICLES_PER_PAGE)
 
     # Get articles sequentially and synchronously (i.e., not using grequests) because the news pages must stay in order

diff --git a/tests/news_test.py b/tests/news_test.py
@@ -30,6 +30,14 @@
 class NewsTest(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         unittest.TestCase.__init__(self, *args, **kwargs)
+        with (SAMPLE_PATH / "news_pittwire.html").open() as f:
+            self.pittwire = f.read()
+        with (SAMPLE_PATH / "news_pittwire_no_categories.html").open() as f:
+            self.pittwire_no_categories = f.read()
+        with (SAMPLE_PATH / "news_features_articles.html").open() as f:
+            self.features_articles = f.read()
+        with (SAMPLE_PATH / "news_features_articles_no_topics.html").open() as f:
+            self.features_articles_no_topics = f.read()
         with (SAMPLE_PATH / "news_university_news_features_articles_page_0.html").open() as f:
             self.university_news_features_articles_page_0 = f.read()
         with (SAMPLE_PATH / "news_university_news_features_articles_page_1.html").open() as f:
@@ -39,16 +47,77 @@ def __init__(self, *args, **kwargs):
         with (SAMPLE_PATH / "news_university_news_features_articles_2020.html").open() as f:
             self.university_news_features_articles_2020 = f.read()
 
+    @responses.activate
+    def test_get_categories(self):
+        # Drop memoized results so the scrape runs against the mocked response registered below.
+        for cached_func in (news.get_categories, news._scrape_categories):
+            cached_func.cache_clear()
+        responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+
+        expected_categories = [
+            "Features & Articles",
+            "Accolades & Honors",
+            "Ones to Watch",
+            "Announcements and Updates",
+        ]
+
+        # Order is not part of the contract, only the set of names.
+        self.assertCountEqual(news.get_categories(), expected_categories)
+
+    @responses.activate
+    def test_get_categories_missing(self):
+        # Drop memoized results so the scrape runs against the category-less sample page.
+        for cached_func in (news.get_categories, news._scrape_categories):
+            cached_func.cache_clear()
+        responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire_no_categories)
+
+        with self.assertRaises(RuntimeError):
+            news.get_categories()
+
+    @responses.activate
+    def test_get_topics(self):
+        # Drop memoized results so the scrape runs against the mocked response registered below.
+        for cached_func in (news.get_topics, news._scrape_topics):
+            cached_func.cache_clear()
+        responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
+
+        expected_topics = [
+            "University News",
+            "Health and Wellness",
+            "Technology & Science",
+            "Arts and Humanities",
+            "Community Impact",
+            "Innovation and Research",
+            "Global",
+            "Diversity, Equity, and Inclusion",
+            "Our City/Our Campus",
+            "Teaching & Learning",
+            "Space",
+            "Ukraine",
+            "Sustainability",
+        ]
+
+        # Order is not part of the contract, only the set of names.
+        self.assertCountEqual(news.get_topics(), expected_topics)
+
+    @responses.activate
+    def test_get_topics_missing(self):
+        # Drop memoized results so the scrape runs against the topic-less sample page.
+        for cached_func in (news.get_topics, news._scrape_topics):
+            cached_func.cache_clear()
+        responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles_no_topics)
+
+        with self.assertRaises(RuntimeError):
+            news.get_topics()
+
     @responses.activate
     def test_get_articles_by_topic(self):
+        news.get_categories.cache_clear()
+        news.get_topics.cache_clear()
+        news._scrape_categories.cache_clear()
+        news._scrape_topics.cache_clear()
+        responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+        responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
         responses.add(
             responses.GET,
             "https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
             "&field_category_target_id=All",
             body=self.university_news_features_articles_page_0,
         )
 
-        university_news_articles = news.get_articles_by_topic("university-news")
+        university_news_articles = news.get_articles_by_topic("University News")
 
         self.assertEqual(len(university_news_articles), news.NUM_ARTICLES_PER_PAGE)
         self.assertEqual(
@@ -75,14 +144,20 @@ def test_get_articles_by_topic(self):
     @responses.activate
     def test_get_articles_by_topic_query(self):
         query = "fulbright"
+        news.get_categories.cache_clear()
+        news.get_topics.cache_clear()
+        news._scrape_categories.cache_clear()
+        news._scrape_topics.cache_clear()
+        responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+        responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
         responses.add(
             responses.GET,
             "https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value="
             f"&title={query}&field_category_target_id=All",
             body=self.university_news_features_articles_fulbright,
         )
 
-        university_news_articles = news.get_articles_by_topic("university-news", query=query)
+        university_news_articles = news.get_articles_by_topic("University News", query=query)
 
         self.assertEqual(len(university_news_articles), 3)
         self.assertEqual(
@@ -115,14 +190,20 @@ def test_get_articles_by_topic_query(self):
     @responses.activate
     def test_get_articles_by_topic_year(self):
         year = 2020
+        news.get_categories.cache_clear()
+        news.get_topics.cache_clear()
+        news._scrape_categories.cache_clear()
+        news._scrape_topics.cache_clear()
+        responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+        responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
         responses.add(
             responses.GET,
             f"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value={year}"
             "&title=&field_category_target_id=All",
             body=self.university_news_features_articles_2020,
         )
 
-        university_news_articles = news.get_articles_by_topic("university-news", year=year)
+        university_news_articles = news.get_articles_by_topic("University News", year=year)
 
         self.assertEqual(len(university_news_articles), 5)
         self.assertEqual(
@@ -152,14 +233,20 @@ def test_get_articles_by_topic_year(self):
     @responses.activate
     def test_get_articles_by_topic_less_than_one_page(self):
         num_results = 5
+        news.get_categories.cache_clear()
+        news.get_topics.cache_clear()
+        news._scrape_categories.cache_clear()
+        news._scrape_topics.cache_clear()
+        responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+        responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
         responses.add(
             responses.GET,
             "https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
             "&field_category_target_id=All",
             body=self.university_news_features_articles_page_0,
         )
 
-        university_news_articles = news.get_articles_by_topic("university-news", max_num_results=num_results)
+        university_news_articles = news.get_articles_by_topic("University News", max_num_results=num_results)
 
         self.assertEqual(len(university_news_articles), num_results)
         self.assertEqual(
@@ -186,6 +273,12 @@ def test_get_articles_by_topic_less_than_one_page(self):
     @responses.activate
     def test_get_articles_by_topic_multiple_pages(self):
         num_results = news.NUM_ARTICLES_PER_PAGE + 5
+        news.get_categories.cache_clear()
+        news.get_topics.cache_clear()
+        news._scrape_categories.cache_clear()
+        news._scrape_topics.cache_clear()
+        responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+        responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
         responses.add(
             responses.GET,
             "https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
@@ -199,7 +292,7 @@ def test_get_articles_by_topic_multiple_pages(self):
             body=self.university_news_features_articles_page_1,
         )
 
-        university_news_articles = news.get_articles_by_topic("university-news", max_num_results=num_results)
+        university_news_articles = news.get_articles_by_topic("University News", max_num_results=num_results)
 
         self.assertEqual(len(university_news_articles), num_results)
         self.assertEqual(
@@ -227,3 +320,25 @@ def test_get_articles_by_topic_multiple_pages(self):
                 ],
             ),
         )
+
+    @responses.activate
+    def test_get_articles_by_topic_invalid_category(self):
+        # Drop every memoized result so validation runs against the mocked sample pages.
+        for cached_func in (
+            news.get_categories,
+            news.get_topics,
+            news._scrape_categories,
+            news._scrape_topics,
+        ):
+            cached_func.cache_clear()
+        responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+        responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
+
+        # The topic is valid, so the ValueError can only come from the unknown category.
+        with self.assertRaises(ValueError):
+            news.get_articles_by_topic("University News", "Invalid Category")
+
+    @responses.activate
+    def test_get_articles_by_topic_invalid_topic(self):
+        # Drop every memoized result so validation runs against the mocked sample pages.
+        for cached_func in (
+            news.get_categories,
+            news.get_topics,
+            news._scrape_categories,
+            news._scrape_topics,
+        ):
+            cached_func.cache_clear()
+        responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
+        responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
+
+        # The category is left at its default, so the ValueError can only come from the unknown topic.
+        with self.assertRaises(ValueError):
+            news.get_articles_by_topic("Invalid Topic")
diff --git a/tests/samples/news_features_articles.html b/tests/samples/news_features_articles.html
diff --git a/tests/samples/news_features_articles_no_topics.html b/tests/samples/news_features_articles_no_topics.html
diff --git a/tests/samples/news_pittwire.html b/tests/samples/news_pittwire.html
diff --git a/tests/samples/news_pittwire_no_categories.html b/tests/samples/news_pittwire_no_categories.html