From 72faea29b0d737f0d5eaf0faa4ea33c9981df913 Mon Sep 17 00:00:00 2001
From: Samyak Shah
Date: Tue, 17 Sep 2024 11:07:53 -0400
Subject: [PATCH] Feature/updating indeed scraper (#166) (#170)

* Feature/updating indeed scraper (#166)

* - Updated to mobile endpoints and user agents to prevent CAPTCHA
  - Updated parsing of the Indeed scraper
  - Fixed tags not being parsed correctly
  - Fixed remoteness not being parsed correctly
  - Changed to scrape only the first page of each search by default, for speed

* - Updated method of loading user agent files
  - Updated the user agent file of the Indeed scraper

* - Updated versions in requirements.txt
  - Added a black configuration file for formatting
  - Added a pre-commit hook so all contributors will have consistent formatting on upload
  - Updated all Python files to conform to the black formatter

* Updated Python version

* More black formatting updates

* - Added prettierrc and prettierignore
  - Formatted all files other than Python

* Updated prettierignore so prettier can search through subdirectories

* Reset formatting to a longer line width

* Reverted to previous commit

* Updated again to a longer line width after accounting for missing files

* Updated prettierrc and prettierignore files and reran formatting

* Updated version

* - Reverted Markdown changes
  - Reverted settings_USA changes
  - Updated readme
  - Removed an extra user agent from the phone user agents list
  - Removed extra comments

* Changed readme to refer to Python 3.11 instead of 3.8, and added the mobile user agent list to MANIFEST.in
---
 .pre-commit-config.yaml                   |   9 +
 MANIFEST.in                               |   1 +
 demo/settings_USA.yaml                    |   1 -
 jobfunnel/__init__.py                     |   3 +-
 jobfunnel/backend/job.py                  |  40 +++-
 jobfunnel/backend/jobfunnel.py            |   6 +-
 jobfunnel/backend/scrapers/base.py        |   1 +
 jobfunnel/backend/scrapers/glassdoor.py   |   1 +
 jobfunnel/backend/scrapers/indeed.py      | 222 ++++++++++++++----
 jobfunnel/backend/scrapers/monster.py     |   1 +
 jobfunnel/backend/scrapers/registry.py    |   1 +
 jobfunnel/backend/tools/delay.py          |   1 +
 jobfunnel/backend/tools/filters.py        |   1 +
 jobfunnel/backend/tools/tools.py          |   1 +
 jobfunnel/config/base.py                  |   1 +
 jobfunnel/config/cli.py                   |   1 +
 jobfunnel/config/delay.py                 |   1 +
 jobfunnel/config/manager.py               |   1 +
 jobfunnel/config/proxy.py                 |   1 +
 jobfunnel/config/search.py                |   1 +
 jobfunnel/config/settings.py              |   1 +
 jobfunnel/resources/defaults.py           |   1 +
 jobfunnel/resources/resources.py          |  34 ++-
 .../resources/user_agent_list_mobile.txt  |  14 ++
 readme.md                                 |   6 +-
 requirements.txt                          |   2 +
 setup.py                                  |   4 +-
 tests/config/test_cli.py                  |   1 +
 tests/config/test_delay.py                |   1 +
 tests/config/test_search.py               |   1 +
 30 files changed, 286 insertions(+), 74 deletions(-)
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 jobfunnel/resources/user_agent_list_mobile.txt

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..4ea10949
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,9 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 24.8.0 # Replace this with the version of Black you want to use
+    hooks:
+      - id: black
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: "v3.1.0" # Specify Prettier version
+    hooks:
+      - id: prettier
diff --git a/MANIFEST.in b/MANIFEST.in
index ba64c426..b77d8c93 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,6 @@
 include jobfunnel/demo/settings.yaml
 include jobfunnel/demo/demo.png
 include jobfunnel/resources/user_agent_list.txt
+include jobfunnel/resources/user_agent_list_mobile.txt
 include readme.md
 include LICENSE
diff --git a/demo/settings_USA.yaml b/demo/settings_USA.yaml
index 8646874c..af9e7a48 100644
--- a/demo/settings_USA.yaml
+++ b/demo/settings_USA.yaml
@@ -22,7 +22,6 @@ search:
   # FIXME: we need to add back GLASSDOOR when that's working again.
   providers:
     - INDEED
-    - MONSTER
 
   # Region that we are searching for jobs within:
   province_or_state: "Texas" # NOTE: this is generally 2 characters long.
diff --git a/jobfunnel/__init__.py b/jobfunnel/__init__.py
index 8feda0ab..0cb91dcc 100644
--- a/jobfunnel/__init__.py
+++ b/jobfunnel/__init__.py
@@ -1,3 +1,4 @@
 """JobFunnel base package init, we keep module version here.
 """
-__version__ = "3.0.2"
+
+__version__ = "4.0.0"
diff --git a/jobfunnel/backend/job.py b/jobfunnel/backend/job.py
index 1a74ba4b..2a95c02c 100644
--- a/jobfunnel/backend/job.py
+++ b/jobfunnel/backend/job.py
@@ -1,6 +1,7 @@
 """Base Job class to be populated by Scrapers, manipulated by Filters and saved
 to csv / etc by Exporter
 """
+
 from copy import deepcopy
 from datetime import date, datetime
 from typing import Dict, List, Optional
@@ -132,7 +133,7 @@ def update_if_newer(self, job: "Job") -> bool:
         Returns:
             True if we updated self with job, False if we didn't
         """
-        if job.post_date > self.post_date:
+        if job.post_date >= self.post_date:
             # Update all attrs other than status (which user can set).
             self.company = deepcopy(job.company)
             self.location = deepcopy(job.location)
@@ -152,6 +153,7 @@ def update_if_newer(self, job: "Job") -> bool:
             # pylint: disable=protected-access
             self._raw_scrape_data = deepcopy(job._raw_scrape_data)
             # pylint: enable=protected-access
+            return True
         else:
             return False
@@ -187,7 +189,7 @@ def as_row(self) -> Dict[str, str]:
                 self.location,
                 self.post_date.strftime("%Y-%m-%d"),
                 self.description,
-                ", ".join(self.tags),
+                "\n".join(self.tags),
                 self.url,
                 self.key_id,
                 self.provider,
@@ -210,9 +212,11 @@ def as_json_entry(self) -> Dict[str, str]:
             "title": self.title,
             "company": self.company,
             "post_date": self.post_date.strftime("%Y-%m-%d"),
-            "description": (self.description[:MAX_BLOCK_LIST_DESC_CHARS] + "..")
-            if len(self.description) > MAX_BLOCK_LIST_DESC_CHARS
-            else (self.description),
+            "description": (
+                (self.description[:MAX_BLOCK_LIST_DESC_CHARS] + "..")
+                if len(self.description) > MAX_BLOCK_LIST_DESC_CHARS
+                else (self.description)
+            ),
             "status": self.status.name,
         }
@@ -243,3 +247,29 @@ def validate(self) -> None:
         assert self.url, "URL is unset!"
         if len(self.description) < MIN_DESCRIPTION_CHARS:
             raise ValueError("Description too short!")
+
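+    # Convenience representations for debugging: __repr__ is a compact
+    # one-liner, __str__ a multi-line summary with a truncated description.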
+    def __repr__(self) -> str:
+        """Developer-friendly representation of the Job object."""
+        return (
+            f"Job("
+            f"title='{self.title}', "
+            f"company='{self.company}', "
+            f"location='{self.location}', "
+            f"status={self.status.name}, "
+            f"post_date={self.post_date}, "
+            f"url='{self.url}')"
+        )
+
+    def __str__(self) -> str:
+        """Human-readable string representation of the Job object."""
+        return (
+            f"Job Title: {self.title}\n"
+            f"Company: {self.company}\n"
+            f"Location: {self.location}\n"
+            f"Post Date: {self.post_date.strftime('%Y-%m-%d') if self.post_date else 'N/A'}\n"
+            f"Status: {self.status.name}\n"
+            f"Wage: {self.wage if self.wage else 'N/A'}\n"
+            f"Remoteness: {self.remoteness if self.remoteness else 'N/A'}\n"
+            f"Description (truncated): {self.description[:100]}{'...' if len(self.description) > 100 else ''}\n"
+            f"URL: {self.url}\n"
+        )
diff --git a/jobfunnel/backend/jobfunnel.py b/jobfunnel/backend/jobfunnel.py
index ff1b58b5..113cc8c7 100755
--- a/jobfunnel/backend/jobfunnel.py
+++ b/jobfunnel/backend/jobfunnel.py
@@ -1,6 +1,7 @@
 """Scrapes jobs, applies search filters and writes pickles to master list
 Paul McInnis 2020
 """
+
 import csv
 import json
 import os
@@ -230,7 +231,9 @@ def scrape(self) -> Dict[str, Job]:
             try:
                 incoming_jobs_dict = scraper.scrape()
             except Exception as e:
-                self.logger.error(f"Failed to scrape jobs for {scraper_cls.__name__}")
+                self.logger.error(
+                    f"Failed to scrape jobs for {scraper_cls.__name__}: {e}"
+                )
 
             # Ensure we have no duplicates between our scrapers by key-id
             # (since we are updating the jobs dict with results)
@@ -425,6 +428,7 @@ def read_master_csv(self) -> Dict[str, Job]:
                 short_description=short_description,
                 post_date=post_date,
                 scrape_date=scrape_date,
+                wage=wage,
                 raw=raw,
                 tags=row["tags"].split(","),
                 remoteness=remoteness,
diff --git a/jobfunnel/backend/scrapers/base.py b/jobfunnel/backend/scrapers/base.py
index 77084d1f..28ba5d3f 100644
--- a/jobfunnel/backend/scrapers/base.py
+++ b/jobfunnel/backend/scrapers/base.py
@@ -1,6 +1,7 @@
 """The base scraper class to be used for all web-scraping emitting Job objects
 Paul McInnis 2020
 """
+
 import random
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor, as_completed
diff --git a/jobfunnel/backend/scrapers/glassdoor.py b/jobfunnel/backend/scrapers/glassdoor.py
index a8e3d6f7..fa220ba4 100644
--- a/jobfunnel/backend/scrapers/glassdoor.py
+++ b/jobfunnel/backend/scrapers/glassdoor.py
@@ -1,6 +1,7 @@
 """Scraper for www.glassdoor.X
 FIXME: this is currently unable to get past page 1 of job results.
 """
+
 import re
 from abc import abstractmethod
 from concurrent.futures import ThreadPoolExecutor, wait
diff --git a/jobfunnel/backend/scrapers/indeed.py b/jobfunnel/backend/scrapers/indeed.py
index 83444adb..b201c8f0 100644
--- a/jobfunnel/backend/scrapers/indeed.py
+++ b/jobfunnel/backend/scrapers/indeed.py
@@ -1,10 +1,13 @@
 """Scraper designed to get jobs from www.indeed.X
 """
+
 import re
 from concurrent.futures import ThreadPoolExecutor, wait
 from math import ceil
 from typing import Any, Dict, List, Optional
 from unicodedata import normalize
+import json
+import random
 
 from bs4 import BeautifulSoup
 from requests import Session
@@ -20,7 +23,12 @@
 )
 from jobfunnel.backend.tools.filters import JobFilter
 from jobfunnel.backend.tools.tools import calc_post_date_from_relative_str
-from jobfunnel.resources import MAX_CPU_WORKERS, JobField, Remoteness
+from jobfunnel.resources import (
+    MAX_CPU_WORKERS,
+    JobField,
+    Remoteness,
+    USER_AGENT_LIST_MOBILE,
+)
 
 # pylint: disable=using-constant-test,unused-import
 if False:  # or typing.TYPE_CHECKING if python3.5.3+
@@ -28,7 +36,7 @@
 # pylint: enable=using-constant-test,unused-import
 
 ID_REGEX = re.compile(r"id=\"sj_([a-zA-Z0-9]*)\"")
-MAX_RESULTS_PER_INDEED_PAGE = 50
+MAX_RESULTS_PER_INDEED_PAGE = 20  # 20 results for mobile, 50 for desktop
 
 # NOTE: these magic strings stick for both the US and CAN indeed websites...
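+    # The mobile site labels temporarily-remote roles as "Hybrid work".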
 FULLY_REMOTE_MAGIC_STRING = "&remotejob=032b3046-06a3-4876-8dfd-474eb5e7ed11"
 COVID_REMOTE_MAGIC_STRING = "&remotejob=7e3167e4-ccb4-49cb-b761-9bae564a0a63"
@@ -41,10 +49,34 @@
 }
 REMOTENESS_STR_MAP = {
     "remote": Remoteness.FULLY_REMOTE,
-    "temporarily remote": Remoteness.TEMPORARILY_REMOTE,
+    "hybrid work": Remoteness.TEMPORARILY_REMOTE,
 }
 
 
+def format_taxonomy_attributes(taxonomy_attributes):
+    result = []
+
+    # Loop through the taxonomyAttributes list
+    for category in taxonomy_attributes:
+        label = category[
+            "label"
+        ]  # Get the category label (e.g., "job-types", "benefits")
+        attributes = category["attributes"]
+
+        # Only process if the attributes list is not empty
+        if attributes:
+            # Get all attribute labels within the category
+            attribute_labels = [attr["label"] for attr in attributes]
+            # Create a readable string combining the category label and its attributes
+            formatted_str = (
+                f"{label.replace('-', ' ').capitalize()}: {', '.join(attribute_labels)}"
+            )
+            result.append(formatted_str)
+
+    # Return the list of formatted category strings
+    return result
+
+
 class BaseIndeedScraper(BaseScraper):
     """Scrapes jobs from www.indeed.X"""
 
@@ -60,6 +92,11 @@ def __init__(
         if self.config.search_config.remoteness == Remoteness.PARTIALLY_REMOTE:
             self.logger.warning("Indeed does not support PARTIALLY_REMOTE jobs")
 
+    @property
+    def user_agent(self) -> str:
+        """Get a randomized user agent for this scraper"""
+        return random.choice(USER_AGENT_LIST_MOBILE)
+
     @property
     def job_get_fields(self) -> str:
         """Call self.get(...) for the JobFields in this list when scraping a Job
@@ -69,11 +106,12 @@ def job_get_fields(self) -> str:
         return [
             JobField.TITLE,
             JobField.COMPANY,
+            JobField.DESCRIPTION,
             JobField.LOCATION,
             JobField.KEY_ID,
             JobField.TAGS,
             JobField.POST_DATE,
-            JobField.REMOTENESS,
+            # JobField.REMOTENESS,
             JobField.WAGE,
         ]
@@ -86,7 +124,7 @@ def job_set_fields(self) -> str:
         ...
         Override this as needed.
         """
-        return [JobField.RAW, JobField.URL, JobField.DESCRIPTION]
+        return [JobField.URL, JobField.REMOTENESS]
 
     @property
     def delayed_get_set_fields(self) -> str:
@@ -159,47 +197,59 @@ def get_job_soups_from_search_result_listings(self) -> List[BeautifulSoup]:
         return job_soup_list
 
     def get(self, parameter: JobField, soup: BeautifulSoup) -> Any:
-        """Get a single job attribute from a soup object by JobField"""
+        """Get a single job attribute from a soup object that was derived from a JSON string."""
+
+        # Convert BeautifulSoup object back to a dictionary
+        job_data = json.loads(soup.text)
+
         if parameter == JobField.TITLE:
-            return soup.find("a", attrs={"data-tn-element": "jobTitle"}).text.strip()
+            return job_data.get("displayTitle", None)
+
+        elif parameter == JobField.DESCRIPTION:
+            return job_data.get("snippet", None)
+
         elif parameter == JobField.COMPANY:
-            return soup.find("span", attrs={"class": "company"}).text.strip()
+            return job_data.get("company", None)
+
         elif parameter == JobField.LOCATION:
-            return soup.find("span", attrs={"class": "location"}).text.strip()
+            return job_data.get("formattedLocation", None)
+
         elif parameter == JobField.TAGS:
-            # tags may not be on page and that's ok.
-            table_soup = soup.find("table", attrs={"class": "jobCardShelfContainer"})
-            if table_soup:
-                return [
-                    td.text.strip()
-                    for td in table_soup.find_all(
-                        "td", attrs={"class": "jobCardShelfItem"}
-                    )
-                ]
-            else:
-                return []
+
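+            # Tags are built from the "taxonomyAttributes" JSON field
+            # (job types, benefits, etc.) instead of the old HTML job shelf.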
+            formatted_attributes = format_taxonomy_attributes(
+                job_data.get("taxonomyAttributes", [])
+            )
+
+            return formatted_attributes
+
         elif parameter == JobField.REMOTENESS:
-            remote_field = soup.find("span", attrs={"class": "remote"})
-            if remote_field:
-                remoteness_str = remote_field.text.strip().lower()
-                if remoteness_str in REMOTENESS_STR_MAP:
-                    return REMOTENESS_STR_MAP[remoteness_str]
-            return Remoteness.UNKNOWN
+            return (
+                Remoteness.FULLY_REMOTE
+                if job_data.get("remoteLocation", False)
+                else Remoteness.UNKNOWN
+            )
+
         elif parameter == JobField.WAGE:
-            # We may not be able to obtain a wage
-            potential = soup.find("span", attrs={"class": "salaryText"})
-            if potential:
-                return potential.text.strip()
-            else:
-                return ""
+            salary_info = job_data.get("extractedSalary", None)
+            if salary_info:
+                min_salary = salary_info.get("min")
+                max_salary = salary_info.get("max")
+                if min_salary and max_salary:
+                    return (
+                        f"${min_salary} - ${max_salary} {salary_info.get('type', '')}"
+                    )
+                else:
+                    return ""
+            return ""
+
         elif parameter == JobField.POST_DATE:
             return calc_post_date_from_relative_str(
-                soup.find("span", attrs={"class": "date"}).text.strip()
+                job_data.get("formattedRelativeTime", None)
             )
+
         elif parameter == JobField.KEY_ID:
-            return ID_REGEX.findall(
-                str(soup.find("a", attrs={"class": "sl resultLink save-job-link"}))
-            )[0]
+            return job_data.get("jobkey", None)
+
         else:
             raise NotImplementedError(f"Cannot get {parameter.name}")
@@ -211,6 +261,19 @@ def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
             job._raw_scrape_data = BeautifulSoup(
                 self.session.get(job.url).text, self.config.bs4_parser
             )
+
+        elif parameter == JobField.REMOTENESS:
+            remoteness = [
+                tag.split(":")[-1].strip().lower()
+                for tag in job.tags
+                if "remote" in tag.lower()
+            ]
+
+            if len(remoteness):
+                job.remoteness = REMOTENESS_STR_MAP.get(
+                    remoteness[0], Remoteness.UNKNOWN
+                )
+
         elif parameter == JobField.DESCRIPTION:
             assert job._raw_scrape_data
             job.description = job._raw_scrape_data.find(
@@ -219,7 +282,7 @@ def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
         elif parameter == JobField.URL:
             assert job.key_id
             job.url = (
-                f"http://www.indeed.{self.config.search_config.domain}/"
+                f"https://www.indeed.{self.config.search_config.domain}/m/"
                 f"viewjob?jk={job.key_id}"
             )
         else:
@@ -231,7 +294,7 @@ def _get_search_url(self, method: Optional[str] = "get") -> str:
         """
         if method == "get":
             return (
-                "https://www.indeed.{}/jobs?q={}&l={}%2C+{}&radius={}&"
+                "https://www.indeed.{}/m/jobs?q={}&l={}%2C+{}&radius={}&"
                 "limit={}&filter={}{}".format(
                     self.config.search_config.domain,
                     self.query,
@@ -280,17 +343,61 @@ def _get_job_soups_from_search_page(
         NOTE: Indeed's remoteness filter sucks, and we will always see a mix.
         ... need to add some kind of filtering for this!
         """
-        url = f"{search}&start={int(page * self.max_results_per_page)}"
-        job_soup_list.extend(
-            BeautifulSoup(self.session.get(url).text, self.config.bs4_parser).find_all(
-                "div", attrs={"data-tn-component": "organicJob"}
-            )
-        )
+        url = f"{search}&start={page * self.max_results_per_page}"
+
+        try:
+            response = self.session.get(url).text
+            soup = BeautifulSoup(response, self.config.bs4_parser)
+
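+            # Indeed's mobile results page embeds its job cards as JSON inside
+            # a <script id="mosaic-data"> tag, so we parse that JSON rather
+            # than scraping HTML job cards.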
""" - url = f"{search}&start={int(page * self.max_results_per_page)}" - job_soup_list.extend( - BeautifulSoup(self.session.get(url).text, self.config.bs4_parser).find_all( - "div", attrs={"data-tn-component": "organicJob"} + url = f"{search}&start={page * self.max_results_per_page}" + + try: + response = self.session.get(url).text + soup = BeautifulSoup(response, self.config.bs4_parser) + + script_tag = soup.find("script", id="mosaic-data") + if not script_tag: + self.logger.warn("No 'mosaic-data' script tag found on the page.") + return + + script_content = script_tag.string + json_regex = re.search( + r'\["mosaic-provider-jobcards"\]\s*=\s*(\{.*?\});', + script_content, + re.DOTALL, + ) + + if json_regex: + json_data_str = json_regex.group(1) + + try: + json_data = json.loads(json_data_str) + job_data = ( + json_data.get("metaData", {}) + .get("mosaicProviderJobCardsModel", {}) + .get("results", []) + ) + + if job_data: + job_data_json = [json.dumps(job) for job in job_data] + job_soup_list.extend( + [ + BeautifulSoup(job_json, "lxml") + for job_json in job_data_json + ] + ) + else: + self.logger.error("No job data found in the JSON structure.") + except json.JSONDecodeError as e: + self.logger.error(f"Error decoding JSON: {e}") + else: + self.logger.error( + "No matching job data found in the script tag content." + ) + + except Exception as e: + self.logger.error( + f"An error occurred while fetching or parsing the page: {e}" ) - ) def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int: """Calculates the number of pages of job listings to be scraped. - i.e. your search yields 230 results at 50 res/page -> 5 pages of jobs + i.e. your search yields 230 results at 20 res/page -> 12 pages of jobs Args: max_pages: the maximum number of pages to be scraped. @@ -300,8 +407,13 @@ def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int: # Get the html data, initialize bs4 with lxml request_html = self.session.get(search_url) self.logger.debug("Got Base search results page: %s", search_url) + query_resp = BeautifulSoup(request_html.text, self.config.bs4_parser) - num_res = query_resp.find(id="searchCountPages") + + num_res = query_resp.find( + "div", class_="jobsearch-JobCountAndSortPane-jobCount" + ) + # TODO: we should consider expanding the error cases (scrape error page) if not num_res: raise ValueError( @@ -311,8 +423,15 @@ def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int: " province or state.".format(search_url) ) - num_res = num_res.contents[0].strip() - num_res = int(re.findall(r"f (\d+) ", num_res.replace(",", ""))[0]) + num_res_text = num_res.get_text().replace(",", "") + + num_res_match = re.search(r"(\d+)\+?\s+jobs", num_res_text) + + if num_res_match: + num_res = int(num_res_match.group(1)) + else: + num_res = 0 + number_of_pages = int(ceil(num_res / self.max_results_per_page)) if max_pages == 0: return number_of_pages @@ -391,7 +510,7 @@ def _get_search_url(self, method: Optional[str] = "get") -> str: def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int: """Calculates the number of pages of job listings to be scraped. - i.e. your search yields 230 results at 50 res/page -> 5 pages of jobs + i.e. your search yields 230 results at 20 res/page -> 12 pages of jobs Args: max_pages: the maximum number of pages to be scraped. 
+        num_res = query_resp.find(
+            "div", class_="jobsearch-JobCountAndSortPane-jobCount"
+        )
+
         if not num_res:
             raise ValueError(
                 "Unable to identify number of pages of results for query: {}"
diff --git a/jobfunnel/backend/scrapers/monster.py b/jobfunnel/backend/scrapers/monster.py
index 2a60fae4..a92c9250 100644
--- a/jobfunnel/backend/scrapers/monster.py
+++ b/jobfunnel/backend/scrapers/monster.py
@@ -1,5 +1,6 @@
 """Scrapers for www.monster.X
 """
+
 import re
 from abc import abstractmethod
 from math import ceil
diff --git a/jobfunnel/backend/scrapers/registry.py b/jobfunnel/backend/scrapers/registry.py
index 5933ba72..2e52d4ea 100644
--- a/jobfunnel/backend/scrapers/registry.py
+++ b/jobfunnel/backend/scrapers/registry.py
@@ -3,6 +3,7 @@
 NOTE: if you implement a scraper you must add it here
 TODO: there must be a better way to do this by using class attrib of Provider
 """
+
 from jobfunnel.resources import Locale, Provider
 
 from jobfunnel.backend.scrapers.indeed import (
diff --git a/jobfunnel/backend/tools/delay.py b/jobfunnel/backend/tools/delay.py
index afa545a9..a4346117 100644
--- a/jobfunnel/backend/tools/delay.py
+++ b/jobfunnel/backend/tools/delay.py
@@ -1,5 +1,6 @@
 """Module for calculating random or non-random delay
 """
+
 from math import ceil, log, sqrt
 from random import uniform
 from typing import List, Union
diff --git a/jobfunnel/backend/tools/filters.py b/jobfunnel/backend/tools/filters.py
index 9269c62b..e107b17d 100644
--- a/jobfunnel/backend/tools/filters.py
+++ b/jobfunnel/backend/tools/filters.py
@@ -2,6 +2,7 @@
 filters to reduce un-necessesary scraping
 Paul McInnis 2020
 """
+
 import logging
 from collections import namedtuple
 from copy import deepcopy
diff --git a/jobfunnel/backend/tools/tools.py b/jobfunnel/backend/tools/tools.py
index efd068ab..a198d160 100644
--- a/jobfunnel/backend/tools/tools.py
+++ b/jobfunnel/backend/tools/tools.py
@@ -1,5 +1,6 @@
 """Assorted tools for all aspects of funnelin' that don't fit elsewhere
 """
+
 import logging
 import re
 import sys
diff --git a/jobfunnel/config/base.py b/jobfunnel/config/base.py
index 2c63bf34..3369dc84 100644
--- a/jobfunnel/config/base.py
+++ b/jobfunnel/config/base.py
@@ -1,5 +1,6 @@
 """Base config object with a validator
 """
+
 from abc import ABC, abstractmethod
diff --git a/jobfunnel/config/cli.py b/jobfunnel/config/cli.py
index 09892a93..1683a93b 100644
--- a/jobfunnel/config/cli.py
+++ b/jobfunnel/config/cli.py
@@ -1,5 +1,6 @@
 """Configuration parsing module for CLI --> JobFunnelConfigManager
 """
+
 import argparse
 from typing import Dict, Any, List
 import yaml
diff --git a/jobfunnel/config/delay.py b/jobfunnel/config/delay.py
index d90b468e..c3673a9d 100644
--- a/jobfunnel/config/delay.py
+++ b/jobfunnel/config/delay.py
@@ -1,5 +1,6 @@
 """Simple config object to contain the delay configuration
 """
+
 from jobfunnel.config.base import BaseConfig
 from jobfunnel.resources import DelayAlgorithm
 from jobfunnel.resources.defaults import (
diff --git a/jobfunnel/config/manager.py b/jobfunnel/config/manager.py
index 68d84b95..7d9a8c42 100644
--- a/jobfunnel/config/manager.py
+++ b/jobfunnel/config/manager.py
@@ -1,5 +1,6 @@
 """Config object to run
 JobFunnel
 """
+
 import logging
 import os
 from typing import List, Optional
diff --git a/jobfunnel/config/proxy.py b/jobfunnel/config/proxy.py
index 9045e514..a75ad665 100644
--- a/jobfunnel/config/proxy.py
+++ b/jobfunnel/config/proxy.py
@@ -1,5 +1,6 @@
 """Proxy configuration for Session()
 """
+
 import ipaddress
 
 from jobfunnel.config import BaseConfig
diff --git a/jobfunnel/config/search.py b/jobfunnel/config/search.py
index a6da1f0e..c207eabe 100644
--- a/jobfunnel/config/search.py
+++ b/jobfunnel/config/search.py
@@ -1,5 +1,6 @@
 """Object to contain job query metadata
 """
+
 from typing import List, Optional
 from jobfunnel.config import BaseConfig
 from jobfunnel.resources import Locale, Provider, Remoteness
diff --git a/jobfunnel/config/settings.py b/jobfunnel/config/settings.py
index 67ecc581..52fedcf5 100644
--- a/jobfunnel/config/settings.py
+++ b/jobfunnel/config/settings.py
@@ -1,5 +1,6 @@
 """Settings YAML Schema w/ validator
 """
+
 import ipaddress
 
 from cerberus import Validator
diff --git a/jobfunnel/resources/defaults.py b/jobfunnel/resources/defaults.py
index 5efc5abc..ad39e682 100644
--- a/jobfunnel/resources/defaults.py
+++ b/jobfunnel/resources/defaults.py
@@ -1,6 +1,7 @@
 """Default arguments for both JobFunnelConfigManager and CLI arguments.
 NOTE: Not all defaults here are used, as we rely on YAML for demo and not kwargs
 """
+
 import os
 from pathlib import Path
 from jobfunnel.resources.enums import Locale, DelayAlgorithm, Provider, Remoteness
diff --git a/jobfunnel/resources/resources.py b/jobfunnel/resources/resources.py
index 95b10e75..03ca6e7a 100644
--- a/jobfunnel/resources/resources.py
+++ b/jobfunnel/resources/resources.py
@@ -1,8 +1,10 @@
 """String-like resouces and other constants are initialized here.
 """
+
 import datetime
 import os
 import string
+from pathlib import Path
 
 # CSV header for output CSV. do not remove anything or you'll break usr's CSV's
 # TODO: need to add short and long descriptions (breaking change)
@@ -36,13 +38,25 @@
 PRINTABLE_STRINGS = set(string.printable)
 
-# Load the user agent list once only.
-USER_AGENT_LIST_FILE = os.path.normpath(
-    os.path.join(os.path.dirname(__file__), "user_agent_list.txt")
-)
-USER_AGENT_LIST = []
-with open(USER_AGENT_LIST_FILE) as file:
-    for line in file:
-        li = line.strip()
-        if li and not li.startswith("#"):
-            USER_AGENT_LIST.append(line.rstrip("\n"))
+
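+# The desktop and mobile user-agent lists share one loader so both are
+# parsed the same way (comments and blank lines skipped).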
+def load_user_agents(file_path):
+    """Loads user agent strings from a file, skipping comments and blank lines."""
+    try:
+        with open(file_path, "r") as file:
+            return [
+                line.strip()
+                for line in file
+                if line.strip() and not line.startswith("#")
+            ]
+    except FileNotFoundError:
+        print(f"File {file_path} not found.")
+        return []
+
+
+# Define the paths
+USER_AGENT_LIST_FILE = Path(__file__).parent / "user_agent_list.txt"
+USER_AGENT_LIST_MOBILE_FILE = Path(__file__).parent / "user_agent_list_mobile.txt"
+
+# Load the lists
+USER_AGENT_LIST = load_user_agents(USER_AGENT_LIST_FILE)
+USER_AGENT_LIST_MOBILE = load_user_agents(USER_AGENT_LIST_MOBILE_FILE)
diff --git a/jobfunnel/resources/user_agent_list_mobile.txt b/jobfunnel/resources/user_agent_list_mobile.txt
new file mode 100644
index 00000000..524b188d
--- /dev/null
+++ b/jobfunnel/resources/user_agent_list_mobile.txt
@@ -0,0 +1,14 @@
+# iPhone
+Mozilla/5.0 (Apple-iPhone7C2/1202.466; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1A543 Safari/419.3 Indeed App 225.0
+Mozilla/5.0 (iPhone9,4; U; CPU iPhone OS 10_0_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14A403 Safari/602.1 Indeed App 225.0
+Mozilla/5.0 (iPhone9,3; U; CPU iPhone OS 10_0_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14A403 Safari/602.1 Indeed App 225.0
+Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A5370a Safari/604.1 Indeed App 225.0
+Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1 Indeed App 225.0
+Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1 Indeed App 225.0
+Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/13.2b11866 Mobile/16A366 Safari/605.1.15 Indeed App 225.0
+Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/69.0.3497.105 Mobile/15E148 Safari/605.1 Indeed App 225.0
+Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1 Indeed App 225.0
+Mozilla/5.0 (iPhone12,1; U; CPU iPhone OS 13_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/15E148 Safari/602.1 Indeed App 225.0
+Mozilla/5.0 (iPhone13,2; U; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/15E148 Safari/602.1 Indeed App 225.0
+Mozilla/5.0 (iPhone14,3; U; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/19A346 Safari/602.1 Indeed App 225.0
+Mozilla/5.0 (iPhone14,6; U; CPU iPhone OS 15_4 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/19E241 Safari/602.1 Indeed App 225.0
\ No newline at end of file
diff --git a/readme.md b/readme.md
index 362b9625..e66e4098 100644
--- a/readme.md
+++ b/readme.md
@@ -3,8 +3,6 @@
 
 Automated tool for scraping job postings into a `.csv` file.
 
-_[Since this project was developed, CAPTCHA has clamped down hard, help us re-build the backend and make this tool useful again!](https://github.com/PaulMcInnis/JobFunnel/discussions/148)_
-
 ### Benefits over job search sites:
 
 * Never see the same job twice!
@@ -16,7 +14,7 @@
 
 # Installation
 
-_JobFunnel requires [Python][python] 3.8 or later._
+_JobFunnel requires [Python][python] 3.11 or later._
 
 ```
 pip install git+https://github.com/PaulMcInnis/JobFunnel.git
 ```
@@ -113,5 +111,5 @@ Open the master CSV file and update the per-job `status`:
 [cron]:https://en.wikipedia.org/wiki/Cron
 [cron_doc]:docs/crontab/readme.md
 [conc_fut]:https://docs.python.org/dev/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor
-[thread]: https://docs.python.org/3.8/library/threading.html
+[thread]: https://docs.python.org/3.11/library/threading.html
 [delay_jp]:https://github.com/bunsenmurder/Notebooks/blob/master/jobFunnel/delay_algorithm.ipynb
diff --git a/requirements.txt b/requirements.txt
index f93c4ee9..1ecd9c95 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,5 @@ selenium>=3.141.0
 webdriver-manager>=2.4.0
 Cerberus>=1.3.2
 tqdm>=4.47.0
+black>=24.8.0
+pre-commit>=3.8.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 8eef2c20..a9ef2c90 100644
--- a/setup.py
+++ b/setup.py
@@ -39,7 +39,7 @@
     author_email="paulmcinnis99@gmail.com",
     url=url,
     license="MIT License",
-    python_requires=">=3.8.0",
+    python_requires=">=3.11",
     install_requires=requires,
     packages=find_packages(exclude=("tests", "docs", "images")),
     include_package_data=True,
@@ -47,6 +47,6 @@
     classifiers=[
         "License :: OSI Approved :: MIT License",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.11",
     ],
 )
diff --git a/tests/config/test_cli.py b/tests/config/test_cli.py
index a1e1a23f..893b85b3 100644
--- a/tests/config/test_cli.py
+++ b/tests/config/test_cli.py
@@ -1,5 +1,6 @@
 """Test CLI parsing --> config dict
 """
+
 import os
 import pytest
 from jobfunnel.config import parse_cli, build_config_dict
diff --git a/tests/config/test_delay.py b/tests/config/test_delay.py
index b05dd9e7..e5befac1 100644
--- a/tests/config/test_delay.py
+++ b/tests/config/test_delay.py
@@ -1,5 +1,6 @@
 """Test the DelayConfig
 """
+
 import pytest
 
 from jobfunnel.config import DelayConfig
diff --git a/tests/config/test_search.py b/tests/config/test_search.py
index c323fb94..2e929af1 100644
--- a/tests/config/test_search.py
+++ b/tests/config/test_search.py
@@ -1,5 +1,6 @@
 """Test the search config
 """
+
 import pytest
 from jobfunnel.config import SearchConfig