Commit f02495a
making ssl parameter and setting default true
nicolassaw committed Sep 22, 2024
1 parent f813355 commit f02495a
Showing 2 changed files with 27 additions and 23 deletions.
31 changes: 17 additions & 14 deletions src/scraper/__init__.py
@@ -11,6 +11,7 @@
 import importlib
 from typing import Optional, Tuple, Callable, Type, List
 import importlib.util
+import re

 class Scraper:
     """Scrape Odyssey html files into an output folder"""
@@ -19,12 +20,13 @@ def __init__(self):

     def set_defaults(
         self,
-        ms_wait: Optional[int] = None,
-        start_date: Optional[str] = None,
-        end_date: Optional[str] = None,
-        court_calendar_link_text: Optional[str] = None,
-        case_number: Optional[str] = None
-    ) -> Tuple[int, str, str, str, Optional[str]]:
+        ms_wait: int | None = None,
+        start_date: str | None = None,
+        end_date: str | None = None,
+        court_calendar_link_text: str | None = None,
+        case_number: str | None = None,
+        ssl: bool | None = None
+    ) -> Tuple[int, str, str, str, Optional[str], bool]:
         """
         Sets default values for the provided optional parameters.
@@ -50,8 +52,9 @@ def set_defaults(
         court_calendar_link_text = court_calendar_link_text if court_calendar_link_text is not None else "Court Calendar"
         # case_number defaults to None if not provided
         case_number = case_number
+        ssl = ssl if ssl is not None else True

-        return ms_wait, start_date, end_date, court_calendar_link_text, case_number
+        return ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl

     def configure_logger(self) -> logging.Logger:
         """
@@ -84,12 +87,10 @@ def format_county(self, county: str) -> str:
         Raises:
             TypeError: If the provided county name is not a string.
         """
-        if not isinstance(county, str):
-            raise TypeError("The county name must be a string.")
-
-        return county.lower()
+        return re.sub(r'[^\w]+', '', county.lower())

-    def create_session(self, logger: logging.Logger) -> requests.sessions.Session:
+    def create_session(self, logger: logging.Logger, ssl) -> requests.sessions.Session:
         """
         Creates and configures a requests session for interacting with web pages.
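To illustrate the new normalization: after lowercasing, the regex strips every run of non-word characters (a sketch with made-up county names):

    import re

    # Spaces, periods, and hyphens all collapse away.
    for county in ("Fort Bend", "St. Johns", "Miami-Dade"):
        print(re.sub(r'[^\w]+', '', county.lower()))
    # fortbend
    # stjohns
    # miamidade

Note that with the isinstance guard removed, a non-string argument now raises AttributeError from .lower() rather than the TypeError the docstring still advertises.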
@@ -104,7 +105,9 @@ def create_session(self, logger: logging.Logger) -> requests.sessions.Session:
         """
         # Create and configure the session
         session = requests.Session()
-        session.verify = False # Disable SSL certificate verification
+
+        # Optionally disable SSL certificate verification. Defaults to True unless False is passed.
+        session.verify = ssl
         requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

         return session
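A usage sketch for the reworked session setup (the ssl=False call is a hypothetical opt-out, e.g. for a court portal with a broken certificate chain):

    scraper = Scraper()
    logger = scraper.configure_logger()

    # Default behavior: certificate verification stays on.
    session = scraper.create_session(logger, True)

    # Explicit opt-out for hosts with known-bad certificates;
    # InsecureRequestWarning is already silenced inside create_session.
    insecure_session = scraper.create_session(logger, False)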
@@ -631,8 +634,8 @@ def scrape(
         case_number: Optional[str],
         case_html_path: Optional[str]
     ) -> None:
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = self.set_defaults(
-            ms_wait, start_date, end_date, court_calendar_link_text, case_number
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = self.set_defaults(
+            ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl
         )

         logger = self.configure_logger()
19 changes: 10 additions & 9 deletions src/tester/test_unittest.py
@@ -44,7 +44,7 @@ def test_scrape_main_page(self,
     ):
         scraper_instance = Scraper()
         logger = scraper_instance.configure_logger()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
         session = scraper_instance.create_session(logger)
         main_page_html, main_soup = scraper_instance.scrape_main_page(base_url, odyssey_version, session, notes, logger, ms_wait)
         self.assertIsNotNone(main_page_html, "No main page HTML came through. main_page_html = None.")
@@ -73,7 +73,7 @@ def test_scrape_search_page(self,
         # Look for the court calendar link
         scraper_instance = Scraper()
         logger = scraper_instance.configure_logger()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
         session = scraper_instance.create_session(logger)
         search_url, search_page_html, search_soup = scraper_instance.scrape_search_page(base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text)
         # Verify the court calendar link
@@ -113,7 +113,7 @@ def test_get_hidden_values(self,
         # Run the function
         scraper_instance = Scraper()
         logger = scraper_instance.configure_logger()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
         hidden_values = scraper_instance.get_hidden_values(odyssey_version, main_soup, search_soup, logger)
         self.assertIsNotNone(hidden_values, "No hidden values came through. hidden_values = None.")
         self.assertTrue(type(hidden_values) == dict, "The hidden values field is not a dictionary but it needs to be.")
Expand All @@ -130,7 +130,8 @@ def test_scrape_individual_case(self,
start_date = None,
end_date = None,
court_calendar_link_text = None,
case_html_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources", 'test_files', 'test_data', 'hays', "case_html")
case_html_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources", 'test_files', 'test_data', 'hays', "case_html"),
ssl = True
):
# This starts a timer to compare the run start time to the last updated time of the resulting HTML to ensure the HTML was created after run start time
now = datetime.now()
@@ -140,10 +141,10 @@

         # Call the functions being tested. The calls below are all of the required subfunctions, effectively replicating the shape of scrape.
         scraper_instance = Scraper()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
         logger = scraper_instance.configure_logger()
         county = scraper_instance.format_county(county)
-        session = scraper_instance.create_session(logger)
+        session = scraper_instance.create_session(logger, ssl)
         case_html_path = scraper_instance.make_directories(county) if not case_html_path else case_html_path
         base_url, odyssey_version, notes = scraper_instance.get_ody_link(county, logger)
         main_page_html, main_soup = scraper_instance.scrape_main_page(base_url, odyssey_version, session, notes, logger, ms_wait)
@@ -193,7 +194,7 @@ def test_scrape_jo_list(self,
     ):
         # This test requires that certain dependency functions run first.
         scraper_instance = Scraper()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
         logger = scraper_instance.configure_logger()
         county = scraper_instance.format_county(county)
         session = scraper_instance.create_session(logger)
@@ -230,7 +231,7 @@ def test_scrape_results_page(self,
         hidden_values = hidden_values.replace("'", "\"")
         hidden_values = json.loads(hidden_values)
         scraper_instance = Scraper()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
         logger = scraper_instance.configure_logger()
         county = scraper_instance.format_county(county)
         session = scraper_instance.create_session(logger)
@@ -296,7 +297,7 @@ def test_scrape_multiple_cases(self,

         # There are some live dependency functions that have to be run before the primary code can be run.
         scraper_instance = Scraper()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
         logger = scraper_instance.configure_logger()
         session = scraper_instance.create_session(logger)
         case_html_path = scraper_instance.make_directories(county) if not case_html_path else case_html_path
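As a quick smoke check after a change like this, the updated module can be run with the standard unittest loader (a sketch; it assumes the repository root is on sys.path so the test module imports cleanly):

    import unittest

    # Load every test in the updated module and run it verbosely.
    suite = unittest.defaultTestLoader.loadTestsFromName("src.tester.test_unittest")
    unittest.TextTestRunner(verbosity=2).run(suite)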
