Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scrape categories and topics for news.py #215

Open
wants to merge 3 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 70 additions & 47 deletions pittapi/news.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,50 +19,20 @@

from __future__ import annotations

from functools import cache
import math
from requests_html import Element, HTMLResponse, HTMLSession
from typing import Literal, NamedTuple
from typing import NamedTuple

NUM_ARTICLES_PER_PAGE = 20

NEWS_BY_CATEGORY_URL = (
"https://www.pitt.edu/pittwire/news/{category}?field_topics_target_id={topic_id}&field_article_date_value={year}"
PITT_BASE_URL = "https://www.pitt.edu"
PITTWIRE_URL = PITT_BASE_URL + "/pittwire"
FEATURES_ARTICLES_URL = PITTWIRE_URL + "/news/features-articles"
NEWS_BY_CATEGORY_URL = PITTWIRE_URL + (
"/news/{category}?field_topics_target_id={topic_id}&field_article_date_value={year}"
"&title={query}&field_category_target_id=All&page={page_num}"
)
PITT_BASE_URL = "https://www.pitt.edu"

Category = Literal["features-articles", "accolades-honors", "ones-to-watch", "announcements-and-updates"]
Topic = Literal[
"university-news",
"health-and-wellness",
"technology-and-science",
"arts-and-humanities",
"community-impact",
"innovation-and-research",
"global",
"diversity-equity-and-inclusion",
"our-city-our-campus",
"teaching-and-learning",
"space",
"ukraine",
"sustainability",
]

TOPIC_ID_MAP: dict[Topic, int] = {
"university-news": 432,
"health-and-wellness": 2,
"technology-and-science": 391,
"arts-and-humanities": 4,
"community-impact": 6,
"innovation-and-research": 1,
"global": 9,
"diversity-equity-and-inclusion": 8,
"our-city-our-campus": 12,
"teaching-and-learning": 7,
"space": 440,
"ukraine": 441,
"sustainability": 470,
}

sess = HTMLSession()

Expand All @@ -87,18 +57,51 @@ def from_html(cls, article_html: Element) -> Article:
return cls(title=article_title, description=article_description, url=article_url, tags=article_tags)


def _get_page_articles(
topic: Topic,
category: Category,
query: str,
year: int | None,
page_num: int,
) -> list[Article]:
@cache
def _scrape_categories() -> dict[str, str]:
    """Scrape the Pittwire landing page for the available news categories.

    The result is memoized with ``functools.cache``; call ``_scrape_categories.cache_clear()`` to force a re-scrape.

    Returns:
        A mapping from each category's display name (the stripped text of its menu item) to the last path
        segment of that menu item's link.

    Raises:
        RuntimeError: If the category menu cannot be found or contains no categories.
    """
    response: HTMLResponse = sess.get(PITTWIRE_URL)
    category_menu: Element | None = response.html.find(
        "div#block-views-block-category-menu-category-menu", first=True
    )
    # find(..., first=True) yields None when nothing matches; treat a missing menu like an empty one so the
    # caller gets the descriptive RuntimeError below instead of an AttributeError on None
    category_list: list[Element] = (
        category_menu.find("ul.hamburger-menu-list li") if category_menu is not None else []
    )
    category_map: dict[str, str] = {}
    for category in category_list:
        category_link: Element = category.find("a", first=True)
        # Only the final path segment of the href is kept as the category's URL name
        category_url_name = category_link.attrs["href"].split("/")[-1]
        category_map[category.text.strip()] = category_url_name
    if not category_map:
        raise RuntimeError("No categories found, please open a GitHub issue")
    return category_map


@cache
def _scrape_topics() -> dict[str, int]:
    """Scrape the features-articles page for the available news topics.

    The result is memoized with ``functools.cache``; call ``_scrape_topics.cache_clear()`` to force a re-scrape.

    Returns:
        A mapping from each topic's display name (the stripped text of its ``<option>``) to the integer
        parsed from that option's ``value`` attribute.

    Raises:
        RuntimeError: If the topic selector cannot be found or contains no topics.
    """
    response: HTMLResponse = sess.get(FEATURES_ARTICLES_URL)
    main_content: Element | None = response.html.xpath("/html/body/div/main/div/section", first=True)
    # xpath/find with first=True yield None when nothing matches; fall through to the descriptive
    # RuntimeError below rather than failing with an AttributeError on None
    topic_fieldset: Element | None = (
        main_content.find("fieldset.form-item-field-topics-target-id", first=True)
        if main_content is not None
        else None
    )
    topic_options: list[Element] = topic_fieldset.find("option") if topic_fieldset is not None else []
    topic_map: dict[str, int] = {}
    for topic_option in topic_options:
        if (topic_id := topic_option.attrs["value"].strip()) == "All":  # Skip placeholder "Topics" option
            continue
        topic_name = topic_option.text.strip()
        topic_map[topic_name] = int(topic_id)
    if not topic_map:
        raise RuntimeError("No topics found, please open a GitHub issue")
    return topic_map


def _get_page_articles(topic: str, category: str, query: str, year: int | None, page_num: int) -> list[Article]:
topic_id_map = _scrape_topics()
category_url_name_map = _scrape_categories()
year_str = str(year) if year else ""
page_num_str = str(page_num) if page_num else ""

response: HTMLResponse = sess.get(
NEWS_BY_CATEGORY_URL.format(
category=category, topic_id=TOPIC_ID_MAP[topic], year=year_str, query=query, page_num=page_num_str
category=category_url_name_map[category],
topic_id=topic_id_map[topic],
year=year_str,
query=query,
page_num=page_num_str,
)
)
main_content: Element = response.html.xpath("/html/body/div/main/div/section", first=True)
Expand All @@ -107,13 +110,33 @@ def _get_page_articles(
return page_articles


@cache
def get_categories() -> list[str]:
    """Return the display names of all scraped news categories.

    The result is memoized with ``functools.cache``, so repeated calls return the same list object until
    ``get_categories.cache_clear()`` is called.
    """
    # Iterating a dict yields its keys, i.e. the category display names
    return list(_scrape_categories())


@cache
def get_topics() -> list[str]:
    """Return the display names of all scraped news topics.

    The result is memoized with ``functools.cache``, so repeated calls return the same list object until
    ``get_topics.cache_clear()`` is called.
    """
    # Iterating a dict yields its keys, i.e. the topic display names
    return list(_scrape_topics())


def get_articles_by_topic(
topic: Topic,
category: Category = "features-articles",
topic: str,
category: str = "Features & Articles",
query: str = "",
year: int | None = None,
max_num_results: int = NUM_ARTICLES_PER_PAGE,
) -> list[Article]:
topic_id_map = _scrape_topics()
if topic not in topic_id_map:
raise ValueError(f"'{topic}' is not a valid topic, must be one of the following: {get_topics()}")

category_url_name_map = _scrape_categories()
if category not in category_url_name_map:
raise ValueError(f"'{category}' is not a valid category, must be one of the following: {get_categories()}")

num_pages = math.ceil(max_num_results / NUM_ARTICLES_PER_PAGE)

# Get articles sequentially and synchronously (i.e., not using grequests) because the news pages must stay in order
Expand Down
125 changes: 120 additions & 5 deletions tests/news_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@
class NewsTest(unittest.TestCase):
def __init__(self, *args, **kwargs):
unittest.TestCase.__init__(self, *args, **kwargs)
with (SAMPLE_PATH / "news_pittwire.html").open() as f:
self.pittwire = f.read()
with (SAMPLE_PATH / "news_pittwire_no_categories.html").open() as f:
self.pittwire_no_categories = f.read()
with (SAMPLE_PATH / "news_features_articles.html").open() as f:
self.features_articles = f.read()
with (SAMPLE_PATH / "news_features_articles_no_topics.html").open() as f:
self.features_articles_no_topics = f.read()
with (SAMPLE_PATH / "news_university_news_features_articles_page_0.html").open() as f:
self.university_news_features_articles_page_0 = f.read()
with (SAMPLE_PATH / "news_university_news_features_articles_page_1.html").open() as f:
Expand All @@ -39,16 +47,77 @@ def __init__(self, *args, **kwargs):
with (SAMPLE_PATH / "news_university_news_features_articles_2020.html").open() as f:
self.university_news_features_articles_2020 = f.read()

@responses.activate
def test_get_categories(self):
    """get_categories() lists the category names found in the mocked Pittwire page."""
    # Reset the memoized results so the mocked response registered below is actually fetched
    for cached_func in (news.get_categories, news._scrape_categories):
        cached_func.cache_clear()
    responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)

    expected_categories = [
        "Features & Articles",
        "Accolades & Honors",
        "Ones to Watch",
        "Announcements and Updates",
    ]

    # Order is irrelevant; only the set of names (with multiplicity) is compared
    self.assertCountEqual(news.get_categories(), expected_categories)

@responses.activate
def test_get_categories_missing(self):
    """get_categories() raises RuntimeError when the mocked page yields no categories."""
    # Reset the memoized results so the mocked response registered below is actually fetched
    for cached_func in (news.get_categories, news._scrape_categories):
        cached_func.cache_clear()
    responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire_no_categories)

    with self.assertRaises(RuntimeError):
        news.get_categories()

@responses.activate
def test_get_topics(self):
news.get_topics.cache_clear()
news._scrape_topics.cache_clear()
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)

topics = news.get_topics()

self.assertCountEqual(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this test going to hold true even into the future?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, all the tests for the news module are mocked. The mock files are in the tests/samples directory.

topics,
[
"University News",
"Health and Wellness",
"Technology & Science",
"Arts and Humanities",
"Community Impact",
"Innovation and Research",
"Global",
"Diversity, Equity, and Inclusion",
"Our City/Our Campus",
"Teaching & Learning",
"Space",
"Ukraine",
"Sustainability",
],
)

@responses.activate
def test_get_topics_missing(self):
    """get_topics() raises RuntimeError when the mocked page yields no topics."""
    # Reset the memoized results so the mocked response registered below is actually fetched
    for cached_func in (news.get_topics, news._scrape_topics):
        cached_func.cache_clear()
    responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles_no_topics)

    with self.assertRaises(RuntimeError):
        news.get_topics()

@responses.activate
def test_get_articles_by_topic(self):
news.get_categories.cache_clear()
news.get_topics.cache_clear()
news._scrape_categories.cache_clear()
news._scrape_topics.cache_clear()
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
"&field_category_target_id=All",
body=self.university_news_features_articles_page_0,
)

university_news_articles = news.get_articles_by_topic("university-news")
university_news_articles = news.get_articles_by_topic("University News")

self.assertEqual(len(university_news_articles), news.NUM_ARTICLES_PER_PAGE)
self.assertEqual(
Expand All @@ -75,14 +144,20 @@ def test_get_articles_by_topic(self):
@responses.activate
def test_get_articles_by_topic_query(self):
query = "fulbright"
news.get_categories.cache_clear()
news.get_topics.cache_clear()
news._scrape_categories.cache_clear()
news._scrape_topics.cache_clear()
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value="
f"&title={query}&field_category_target_id=All",
body=self.university_news_features_articles_fulbright,
)

university_news_articles = news.get_articles_by_topic("university-news", query=query)
university_news_articles = news.get_articles_by_topic("University News", query=query)

self.assertEqual(len(university_news_articles), 3)
self.assertEqual(
Expand Down Expand Up @@ -115,14 +190,20 @@ def test_get_articles_by_topic_query(self):
@responses.activate
def test_get_articles_by_topic_year(self):
year = 2020
news.get_categories.cache_clear()
news.get_topics.cache_clear()
news._scrape_categories.cache_clear()
news._scrape_topics.cache_clear()
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
f"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value={year}"
"&title=&field_category_target_id=All",
body=self.university_news_features_articles_2020,
)

university_news_articles = news.get_articles_by_topic("university-news", year=year)
university_news_articles = news.get_articles_by_topic("University News", year=year)

self.assertEqual(len(university_news_articles), 5)
self.assertEqual(
Expand Down Expand Up @@ -152,14 +233,20 @@ def test_get_articles_by_topic_year(self):
@responses.activate
def test_get_articles_by_topic_less_than_one_page(self):
num_results = 5
news.get_categories.cache_clear()
news.get_topics.cache_clear()
news._scrape_categories.cache_clear()
news._scrape_topics.cache_clear()
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
"&field_category_target_id=All",
body=self.university_news_features_articles_page_0,
)

university_news_articles = news.get_articles_by_topic("university-news", max_num_results=num_results)
university_news_articles = news.get_articles_by_topic("University News", max_num_results=num_results)

self.assertEqual(len(university_news_articles), num_results)
self.assertEqual(
Expand All @@ -186,6 +273,12 @@ def test_get_articles_by_topic_less_than_one_page(self):
@responses.activate
def test_get_articles_by_topic_multiple_pages(self):
num_results = news.NUM_ARTICLES_PER_PAGE + 5
news.get_categories.cache_clear()
news.get_topics.cache_clear()
news._scrape_categories.cache_clear()
news._scrape_topics.cache_clear()
responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)
responses.add(
responses.GET,
"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title="
Expand All @@ -199,7 +292,7 @@ def test_get_articles_by_topic_multiple_pages(self):
body=self.university_news_features_articles_page_1,
)

university_news_articles = news.get_articles_by_topic("university-news", max_num_results=num_results)
university_news_articles = news.get_articles_by_topic("University News", max_num_results=num_results)

self.assertEqual(len(university_news_articles), num_results)
self.assertEqual(
Expand Down Expand Up @@ -227,3 +320,25 @@ def test_get_articles_by_topic_multiple_pages(self):
],
),
)

@responses.activate
def test_get_articles_by_topic_invalid_category(self):
    """get_articles_by_topic() raises ValueError for a category name that was not scraped."""
    # Reset every memoized lookup so the mocked pages registered below are the ones consulted
    for cached_func in (
        news.get_categories,
        news.get_topics,
        news._scrape_categories,
        news._scrape_topics,
    ):
        cached_func.cache_clear()
    responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
    responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)

    with self.assertRaises(ValueError):
        news.get_articles_by_topic("University News", "Invalid Category")

@responses.activate
def test_get_articles_by_topic_invalid_topic(self):
    """get_articles_by_topic() raises ValueError for a topic name that was not scraped."""
    # Reset every memoized lookup so the mocked pages registered below are the ones consulted
    for cached_func in (
        news.get_categories,
        news.get_topics,
        news._scrape_categories,
        news._scrape_topics,
    ):
        cached_func.cache_clear()
    responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire)
    responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles)

    with self.assertRaises(ValueError):
        news.get_articles_by_topic("Invalid Topic")
1,839 changes: 1,839 additions & 0 deletions tests/samples/news_features_articles.html

Large diffs are not rendered by default.

1,838 changes: 1,838 additions & 0 deletions tests/samples/news_features_articles_no_topics.html

Large diffs are not rendered by default.

1,942 changes: 1,942 additions & 0 deletions tests/samples/news_pittwire.html

Large diffs are not rendered by default.

1,935 changes: 1,935 additions & 0 deletions tests/samples/news_pittwire_no_categories.html

Large diffs are not rendered by default.

Loading