From 72faea29b0d737f0d5eaf0faa4ea33c9981df913 Mon Sep 17 00:00:00 2001
From: Samyak Shah
Date: Tue, 17 Sep 2024 11:07:53 -0400
Subject: [PATCH] Feature/updating indeed scraper (#166) (#170)

* Feature/updating indeed scraper (#166)

* - Updated to mobile endpoints and user agents to prevent CAPTCHA
  - Updated parsing of the Indeed scraper
  - Fixed tags not being parsed correctly
  - Fixed remoteness not being parsed correctly
  - Changed to scrape only the first page of each search by default, for speed

* - Updated method of loading user agent files
  - Updated the user agent file of the Indeed scraper

* - Updated versions in requirements.txt
  - Added a black configuration file for formatting
  - Added a pre-commit hook so all contributors will have consistent formatting on upload
  - Updated all Python files to conform to the black formatter

* Updated Python version

* More black formatting updates

* - Added prettierrc and prettierignore
  - Formatted all files other than Python

* Updated prettierignore so prettier can search through subdirectories

* Reset formatting to a longer line width

* Reverted to previous commit

* Updated again to a longer line width after accounting for missing files

* Updated prettierrc and prettierignore files and reran formatting

* Updated version

* - Reverted Markdown changes
  - Reverted settings_USA changes
  - Updated readme
  - Removed an extra user agent from the phone user agents list
  - Removed extra comments

* Changed readme to refer to Python 3.11 instead of 3.8, and added the mobile user agent list to MANIFEST.in
---
 .pre-commit-config.yaml                   |   9 +
 MANIFEST.in                               |   1 +
 demo/settings_USA.yaml                    |   1 -
 jobfunnel/__init__.py                     |   3 +-
 jobfunnel/backend/job.py                  |  40 +++-
 jobfunnel/backend/jobfunnel.py            |   6 +-
 jobfunnel/backend/scrapers/base.py        |   1 +
 jobfunnel/backend/scrapers/glassdoor.py   |   1 +
 jobfunnel/backend/scrapers/indeed.py      | 222 ++++++++++++++----
 jobfunnel/backend/scrapers/monster.py     |   1 +
 jobfunnel/backend/scrapers/registry.py    |   1 +
 jobfunnel/backend/tools/delay.py          |   1 +
 jobfunnel/backend/tools/filters.py        |   1 +
 jobfunnel/backend/tools/tools.py          |   1 +
 jobfunnel/config/base.py                  |   1 +
 jobfunnel/config/cli.py                   |   1 +
 jobfunnel/config/delay.py                 |   1 +
 jobfunnel/config/manager.py               |   1 +
 jobfunnel/config/proxy.py                 |   1 +
 jobfunnel/config/search.py                |   1 +
 jobfunnel/config/settings.py              |   1 +
 jobfunnel/resources/defaults.py           |   1 +
 jobfunnel/resources/resources.py          |  34 ++-
 .../resources/user_agent_list_mobile.txt  |  14 ++
 readme.md                                 |   6 +-
 requirements.txt                          |   2 +
 setup.py                                  |   4 +-
 tests/config/test_cli.py                  |   1 +
 tests/config/test_delay.py                |   1 +
 tests/config/test_search.py               |   1 +
 30 files changed, 286 insertions(+), 74 deletions(-)
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 jobfunnel/resources/user_agent_list_mobile.txt

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..4ea10949
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,9 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 24.8.0 # Replace this with the version of Black you want to use
+    hooks:
+      - id: black
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: "v3.1.0" # Specify Prettier version
+    hooks:
+      - id: prettier
diff --git a/MANIFEST.in b/MANIFEST.in
index ba64c426..b77d8c93 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,6 @@
 include jobfunnel/demo/settings.yaml
 include jobfunnel/demo/demo.png
 include jobfunnel/resources/user_agent_list.txt
+include jobfunnel/resources/user_agent_list_mobile.txt
 include readme.md
 include LICENSE
diff --git a/demo/settings_USA.yaml b/demo/settings_USA.yaml
index 8646874c..af9e7a48 100644
--- a/demo/settings_USA.yaml
+++ b/demo/settings_USA.yaml
@@ -22,7 +22,6 @@ search:
   # FIXME: we need to add back GLASSDOOR when that's working again.
   providers:
     - INDEED
-    - MONSTER
 
   # Region that we are searching for jobs within:
   province_or_state: "Texas" # NOTE: this is generally 2 characters long.
diff --git a/jobfunnel/__init__.py b/jobfunnel/__init__.py
index 8feda0ab..0cb91dcc 100644
--- a/jobfunnel/__init__.py
+++ b/jobfunnel/__init__.py
@@ -1,3 +1,4 @@
 """JobFunnel base package init, we keep module version here.
 """
-__version__ = "3.0.2"
+
+__version__ = "4.0.0"
diff --git a/jobfunnel/backend/job.py b/jobfunnel/backend/job.py
index 1a74ba4b..2a95c02c 100644
--- a/jobfunnel/backend/job.py
+++ b/jobfunnel/backend/job.py
@@ -1,6 +1,7 @@
 """Base Job class to be populated by Scrapers, manipulated by Filters and saved
 to csv / etc by Exporter
 """
+
 from copy import deepcopy
 from datetime import date, datetime
 from typing import Dict, List, Optional
@@ -132,7 +133,7 @@ def update_if_newer(self, job: "Job") -> bool:
         Returns:
             True if we updated self with job, False if we didn't
         """
-        if job.post_date > self.post_date:
+        if job.post_date >= self.post_date:
             # Update all attrs other than status (which user can set).
             self.company = deepcopy(job.company)
             self.location = deepcopy(job.location)
@@ -152,6 +153,7 @@ def update_if_newer(self, job: "Job") -> bool:
             # pylint: disable=protected-access
             self._raw_scrape_data = deepcopy(job._raw_scrape_data)
             # pylint: enable=protected-access
+            return True
         else:
             return False
@@ -187,7 +189,7 @@ def as_row(self) -> Dict[str, str]:
                 self.location,
                 self.post_date.strftime("%Y-%m-%d"),
                 self.description,
-                ", ".join(self.tags),
+                "\n".join(self.tags),
                 self.url,
                 self.key_id,
                 self.provider,
@@ -210,9 +212,11 @@ def as_json_entry(self) -> Dict[str, str]:
             "title": self.title,
             "company": self.company,
             "post_date": self.post_date.strftime("%Y-%m-%d"),
-            "description": (self.description[:MAX_BLOCK_LIST_DESC_CHARS] + "..")
-            if len(self.description) > MAX_BLOCK_LIST_DESC_CHARS
-            else (self.description),
+            "description": (
+                (self.description[:MAX_BLOCK_LIST_DESC_CHARS] + "..")
+                if len(self.description) > MAX_BLOCK_LIST_DESC_CHARS
+                else (self.description)
+            ),
             "status": self.status.name,
         }
@@ -243,3 +247,29 @@ def validate(self) -> None:
         assert self.url, "URL is unset!"
         if len(self.description) < MIN_DESCRIPTION_CHARS:
             raise ValueError("Description too short!")
+
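+    # Convenience representations for debugging: __repr__ is a compact
+    # one-liner, __str__ a multi-line summary with a truncated description.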
+    def __repr__(self) -> str:
+        """Developer-friendly representation of the Job object."""
+        return (
+            f"Job("
+            f"title='{self.title}', "
+            f"company='{self.company}', "
+            f"location='{self.location}', "
+            f"status={self.status.name}, "
+            f"post_date={self.post_date}, "
+            f"url='{self.url}')"
+        )
+
+    def __str__(self) -> str:
+        """Human-readable string representation of the Job object."""
+        return (
+            f"Job Title: {self.title}\n"
+            f"Company: {self.company}\n"
+            f"Location: {self.location}\n"
+            f"Post Date: {self.post_date.strftime('%Y-%m-%d') if self.post_date else 'N/A'}\n"
+            f"Status: {self.status.name}\n"
+            f"Wage: {self.wage if self.wage else 'N/A'}\n"
+            f"Remoteness: {self.remoteness if self.remoteness else 'N/A'}\n"
+            f"Description (truncated): {self.description[:100]}{'...' if len(self.description) > 100 else ''}\n"
+            f"URL: {self.url}\n"
+        )
diff --git a/jobfunnel/backend/jobfunnel.py b/jobfunnel/backend/jobfunnel.py
index ff1b58b5..113cc8c7 100755
--- a/jobfunnel/backend/jobfunnel.py
+++ b/jobfunnel/backend/jobfunnel.py
@@ -1,6 +1,7 @@
 """Scrapes jobs, applies search filters and writes pickles to master list
 Paul McInnis 2020
 """
+
 import csv
 import json
 import os
@@ -230,7 +231,9 @@ def scrape(self) -> Dict[str, Job]:
             try:
                 incoming_jobs_dict = scraper.scrape()
             except Exception as e:
-                self.logger.error(f"Failed to scrape jobs for {scraper_cls.__name__}")
+                self.logger.error(
+                    f"Failed to scrape jobs for {scraper_cls.__name__}: {e}"
+                )
 
             # Ensure we have no duplicates between our scrapers by key-id
             # (since we are updating the jobs dict with results)
@@ -425,6 +428,7 @@ def read_master_csv(self) -> Dict[str, Job]:
                 short_description=short_description,
                 post_date=post_date,
                 scrape_date=scrape_date,
+                wage=wage,
                 raw=raw,
                 tags=row["tags"].split(","),
                 remoteness=remoteness,
diff --git a/jobfunnel/backend/scrapers/base.py b/jobfunnel/backend/scrapers/base.py
index 77084d1f..28ba5d3f 100644
--- a/jobfunnel/backend/scrapers/base.py
+++ b/jobfunnel/backend/scrapers/base.py
@@ -1,6 +1,7 @@
 """The base scraper class to be used for all web-scraping emitting Job objects
 Paul McInnis 2020
 """
+
 import random
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor, as_completed
diff --git a/jobfunnel/backend/scrapers/glassdoor.py b/jobfunnel/backend/scrapers/glassdoor.py
index a8e3d6f7..fa220ba4 100644
--- a/jobfunnel/backend/scrapers/glassdoor.py
+++ b/jobfunnel/backend/scrapers/glassdoor.py
@@ -1,6 +1,7 @@
 """Scraper for www.glassdoor.X
 FIXME: this is currently unable to get past page 1 of job results.
 """
+
 import re
 from abc import abstractmethod
 from concurrent.futures import ThreadPoolExecutor, wait
diff --git a/jobfunnel/backend/scrapers/indeed.py b/jobfunnel/backend/scrapers/indeed.py
index 83444adb..b201c8f0 100644
--- a/jobfunnel/backend/scrapers/indeed.py
+++ b/jobfunnel/backend/scrapers/indeed.py
@@ -1,10 +1,13 @@
 """Scraper designed to get jobs from www.indeed.X
 """
+
 import re
 from concurrent.futures import ThreadPoolExecutor, wait
 from math import ceil
 from typing import Any, Dict, List, Optional
 from unicodedata import normalize
+import json
+import random
 
 from bs4 import BeautifulSoup
 from requests import Session
@@ -20,7 +23,12 @@
 )
 from jobfunnel.backend.tools.filters import JobFilter
 from jobfunnel.backend.tools.tools import calc_post_date_from_relative_str
-from jobfunnel.resources import MAX_CPU_WORKERS, JobField, Remoteness
+from jobfunnel.resources import (
+    MAX_CPU_WORKERS,
+    JobField,
+    Remoteness,
+    USER_AGENT_LIST_MOBILE,
+)
 
 # pylint: disable=using-constant-test,unused-import
 if False:  # or typing.TYPE_CHECKING if python3.5.3+
@@ -28,7 +36,7 @@
 # pylint: enable=using-constant-test,unused-import
 
 ID_REGEX = re.compile(r"id=\"sj_([a-zA-Z0-9]*)\"")
-MAX_RESULTS_PER_INDEED_PAGE = 50
+MAX_RESULTS_PER_INDEED_PAGE = 20  # 20 results for mobile, 50 for desktop
 
 # NOTE: these magic strings stick for both the US and CAN indeed websites...
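+    # The mobile site labels temporarily-remote roles as "Hybrid work".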
 FULLY_REMOTE_MAGIC_STRING = "&remotejob=032b3046-06a3-4876-8dfd-474eb5e7ed11"
 COVID_REMOTE_MAGIC_STRING = "&remotejob=7e3167e4-ccb4-49cb-b761-9bae564a0a63"
@@ -41,10 +49,34 @@
 }
 REMOTENESS_STR_MAP = {
     "remote": Remoteness.FULLY_REMOTE,
-    "temporarily remote": Remoteness.TEMPORARILY_REMOTE,
+    "hybrid work": Remoteness.TEMPORARILY_REMOTE,
 }
 
 
+def format_taxonomy_attributes(taxonomy_attributes):
+    result = []
+
+    # Loop through the taxonomyAttributes list
+    for category in taxonomy_attributes:
+        label = category[
+            "label"
+        ]  # Get the category label (e.g., "job-types", "benefits")
+        attributes = category["attributes"]
+
+        # Only process if the attributes list is not empty
+        if attributes:
+            # Get all attribute labels within the category
+            attribute_labels = [attr["label"] for attr in attributes]
+            # Create a readable string combining the category label and its attributes
+            formatted_str = (
+                f"{label.replace('-', ' ').capitalize()}: {', '.join(attribute_labels)}"
+            )
+            result.append(formatted_str)
+
+    # Return the list of formatted category strings
+    return result
+
+
 class BaseIndeedScraper(BaseScraper):
     """Scrapes jobs from www.indeed.X"""
 
@@ -60,6 +92,11 @@ def __init__(
         if self.config.search_config.remoteness == Remoteness.PARTIALLY_REMOTE:
             self.logger.warning("Indeed does not support PARTIALLY_REMOTE jobs")
 
+    @property
+    def user_agent(self) -> str:
+        """Get a randomized user agent for this scraper"""
+        return random.choice(USER_AGENT_LIST_MOBILE)
+
     @property
     def job_get_fields(self) -> str:
         """Call self.get(...) for the JobFields in this list when scraping a Job
@@ -69,11 +106,12 @@ def job_get_fields(self) -> str:
         return [
             JobField.TITLE,
             JobField.COMPANY,
+            JobField.DESCRIPTION,
             JobField.LOCATION,
             JobField.KEY_ID,
             JobField.TAGS,
             JobField.POST_DATE,
-            JobField.REMOTENESS,
+            # JobField.REMOTENESS,
             JobField.WAGE,
         ]
@@ -86,7 +124,7 @@ def job_set_fields(self) -> str:
         ...
         Override this as needed.
         """
-        return [JobField.RAW, JobField.URL, JobField.DESCRIPTION]
+        return [JobField.URL, JobField.REMOTENESS]
 
     @property
     def delayed_get_set_fields(self) -> str:
@@ -159,47 +197,59 @@ def get_job_soups_from_search_result_listings(self) -> List[BeautifulSoup]:
         return job_soup_list
 
     def get(self, parameter: JobField, soup: BeautifulSoup) -> Any:
-        """Get a single job attribute from a soup object by JobField"""
+        """Get a single job attribute from a soup object that was derived from a JSON string."""
+
+        # Convert BeautifulSoup object back to a dictionary
+        job_data = json.loads(soup.text)
+
         if parameter == JobField.TITLE:
-            return soup.find("a", attrs={"data-tn-element": "jobTitle"}).text.strip()
+            return job_data.get("displayTitle", None)
+
+        elif parameter == JobField.DESCRIPTION:
+            return job_data.get("snippet", None)
+
         elif parameter == JobField.COMPANY:
-            return soup.find("span", attrs={"class": "company"}).text.strip()
+            return job_data.get("company", None)
+
         elif parameter == JobField.LOCATION:
-            return soup.find("span", attrs={"class": "location"}).text.strip()
+            return job_data.get("formattedLocation", None)
+
         elif parameter == JobField.TAGS:
-            # tags may not be on page and that's ok.
-            table_soup = soup.find("table", attrs={"class": "jobCardShelfContainer"})
-            if table_soup:
-                return [
-                    td.text.strip()
-                    for td in table_soup.find_all(
-                        "td", attrs={"class": "jobCardShelfItem"}
-                    )
-                ]
-            else:
-                return []
+
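+            # Tags are built from the "taxonomyAttributes" JSON field
+            # (job types, benefits, etc.) instead of the old HTML job shelf.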
+            formatted_attributes = format_taxonomy_attributes(
+                job_data.get("taxonomyAttributes", [])
+            )
+
+            return formatted_attributes
+
         elif parameter == JobField.REMOTENESS:
-            remote_field = soup.find("span", attrs={"class": "remote"})
-            if remote_field:
-                remoteness_str = remote_field.text.strip().lower()
-                if remoteness_str in REMOTENESS_STR_MAP:
-                    return REMOTENESS_STR_MAP[remoteness_str]
-            return Remoteness.UNKNOWN
+            return (
+                Remoteness.FULLY_REMOTE
+                if job_data.get("remoteLocation", False)
+                else Remoteness.UNKNOWN
+            )
+
         elif parameter == JobField.WAGE:
-            # We may not be able to obtain a wage
-            potential = soup.find("span", attrs={"class": "salaryText"})
-            if potential:
-                return potential.text.strip()
-            else:
-                return ""
+            salary_info = job_data.get("extractedSalary", None)
+            if salary_info:
+                min_salary = salary_info.get("min")
+                max_salary = salary_info.get("max")
+                if min_salary and max_salary:
+                    return (
+                        f"${min_salary} - ${max_salary} {salary_info.get('type', '')}"
+                    )
+                else:
+                    return ""
+            return ""
+
         elif parameter == JobField.POST_DATE:
             return calc_post_date_from_relative_str(
-                soup.find("span", attrs={"class": "date"}).text.strip()
+                job_data.get("formattedRelativeTime", None)
             )
+
         elif parameter == JobField.KEY_ID:
-            return ID_REGEX.findall(
-                str(soup.find("a", attrs={"class": "sl resultLink save-job-link"}))
-            )[0]
+            return job_data.get("jobkey", None)
+
         else:
             raise NotImplementedError(f"Cannot get {parameter.name}")
@@ -211,6 +261,19 @@ def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
             job._raw_scrape_data = BeautifulSoup(
                 self.session.get(job.url).text, self.config.bs4_parser
             )
+
+        elif parameter == JobField.REMOTENESS:
+            remoteness = [
+                tag.split(":")[-1].strip().lower()
+                for tag in job.tags
+                if "remote" in tag.lower()
+            ]
+
+            if len(remoteness):
+                job.remoteness = REMOTENESS_STR_MAP.get(
+                    remoteness[0], Remoteness.UNKNOWN
+                )
+
         elif parameter == JobField.DESCRIPTION:
             assert job._raw_scrape_data
             job.description = job._raw_scrape_data.find(
@@ -219,7 +282,7 @@ def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
         elif parameter == JobField.URL:
             assert job.key_id
             job.url = (
-                f"http://www.indeed.{self.config.search_config.domain}/"
+                f"https://www.indeed.{self.config.search_config.domain}/m/"
                 f"viewjob?jk={job.key_id}"
             )
         else:
@@ -231,7 +294,7 @@ def _get_search_url(self, method: Optional[str] = "get") -> str:
         """
         if method == "get":
             return (
-                "https://www.indeed.{}/jobs?q={}&l={}%2C+{}&radius={}&"
+                "https://www.indeed.{}/m/jobs?q={}&l={}%2C+{}&radius={}&"
                 "limit={}&filter={}{}".format(
                     self.config.search_config.domain,
                     self.query,
@@ -280,17 +343,61 @@ def _get_job_soups_from_search_page(
         NOTE: Indeed's remoteness filter sucks, and we will always see a mix.
         ... need to add some kind of filtering for this!
         """
-        url = f"{search}&start={int(page * self.max_results_per_page)}"
-        job_soup_list.extend(
-            BeautifulSoup(self.session.get(url).text, self.config.bs4_parser).find_all(
-                "div", attrs={"data-tn-component": "organicJob"}
-            )
-        )
+        url = f"{search}&start={page * self.max_results_per_page}"
+
+        try:
+            response = self.session.get(url).text
+            soup = BeautifulSoup(response, self.config.bs4_parser)
+
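+            # Indeed's mobile results page embeds its job cards as JSON inside
+            # a <script id="mosaic-data"> tag, so we parse that JSON rather
+            # than scraping HTML job cards.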
""" - url = f"{search}&start={int(page * self.max_results_per_page)}" - job_soup_list.extend( - BeautifulSoup(self.session.get(url).text, self.config.bs4_parser).find_all( - "div", attrs={"data-tn-component": "organicJob"} + url = f"{search}&start={page * self.max_results_per_page}" + + try: + response = self.session.get(url).text + soup = BeautifulSoup(response, self.config.bs4_parser) + + script_tag = soup.find("script", id="mosaic-data") + if not script_tag: + self.logger.warn("No 'mosaic-data' script tag found on the page.") + return + + script_content = script_tag.string + json_regex = re.search( + r'\["mosaic-provider-jobcards"\]\s*=\s*(\{.*?\});', + script_content, + re.DOTALL, + ) + + if json_regex: + json_data_str = json_regex.group(1) + + try: + json_data = json.loads(json_data_str) + job_data = ( + json_data.get("metaData", {}) + .get("mosaicProviderJobCardsModel", {}) + .get("results", []) + ) + + if job_data: + job_data_json = [json.dumps(job) for job in job_data] + job_soup_list.extend( + [ + BeautifulSoup(job_json, "lxml") + for job_json in job_data_json + ] + ) + else: + self.logger.error("No job data found in the JSON structure.") + except json.JSONDecodeError as e: + self.logger.error(f"Error decoding JSON: {e}") + else: + self.logger.error( + "No matching job data found in the script tag content." + ) + + except Exception as e: + self.logger.error( + f"An error occurred while fetching or parsing the page: {e}" ) - ) def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int: """Calculates the number of pages of job listings to be scraped. - i.e. your search yields 230 results at 50 res/page -> 5 pages of jobs + i.e. your search yields 230 results at 20 res/page -> 12 pages of jobs Args: max_pages: the maximum number of pages to be scraped. @@ -300,8 +407,13 @@ def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int: # Get the html data, initialize bs4 with lxml request_html = self.session.get(search_url) self.logger.debug("Got Base search results page: %s", search_url) + query_resp = BeautifulSoup(request_html.text, self.config.bs4_parser) - num_res = query_resp.find(id="searchCountPages") + + num_res = query_resp.find( + "div", class_="jobsearch-JobCountAndSortPane-jobCount" + ) + # TODO: we should consider expanding the error cases (scrape error page) if not num_res: raise ValueError( @@ -311,8 +423,15 @@ def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int: " province or state.".format(search_url) ) - num_res = num_res.contents[0].strip() - num_res = int(re.findall(r"f (\d+) ", num_res.replace(",", ""))[0]) + num_res_text = num_res.get_text().replace(",", "") + + num_res_match = re.search(r"(\d+)\+?\s+jobs", num_res_text) + + if num_res_match: + num_res = int(num_res_match.group(1)) + else: + num_res = 0 + number_of_pages = int(ceil(num_res / self.max_results_per_page)) if max_pages == 0: return number_of_pages @@ -391,7 +510,7 @@ def _get_search_url(self, method: Optional[str] = "get") -> str: def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int: """Calculates the number of pages of job listings to be scraped. - i.e. your search yields 230 results at 50 res/page -> 5 pages of jobs + i.e. your search yields 230 results at 20 res/page -> 12 pages of jobs Args: max_pages: the maximum number of pages to be scraped. 
+        num_res = query_resp.find(
+            "div", class_="jobsearch-JobCountAndSortPane-jobCount"
+        )
+
         if not num_res:
             raise ValueError(
                 "Unable to identify number of pages of results for query: {}"
diff --git a/jobfunnel/backend/scrapers/monster.py b/jobfunnel/backend/scrapers/monster.py
index 2a60fae4..a92c9250 100644
--- a/jobfunnel/backend/scrapers/monster.py
+++ b/jobfunnel/backend/scrapers/monster.py
@@ -1,5 +1,6 @@
 """Scrapers for www.monster.X
 """
+
 import re
 from abc import abstractmethod
 from math import ceil
diff --git a/jobfunnel/backend/scrapers/registry.py b/jobfunnel/backend/scrapers/registry.py
index 5933ba72..2e52d4ea 100644
--- a/jobfunnel/backend/scrapers/registry.py
+++ b/jobfunnel/backend/scrapers/registry.py
@@ -3,6 +3,7 @@
 NOTE: if you implement a scraper you must add it here
 TODO: there must be a better way to do this by using class attrib of Provider
 """
+
 from jobfunnel.resources import Locale, Provider
 
 from jobfunnel.backend.scrapers.indeed import (
diff --git a/jobfunnel/backend/tools/delay.py b/jobfunnel/backend/tools/delay.py
index afa545a9..a4346117 100644
--- a/jobfunnel/backend/tools/delay.py
+++ b/jobfunnel/backend/tools/delay.py
@@ -1,5 +1,6 @@
 """Module for calculating random or non-random delay
 """
+
 from math import ceil, log, sqrt
 from random import uniform
 from typing import List, Union
diff --git a/jobfunnel/backend/tools/filters.py b/jobfunnel/backend/tools/filters.py
index 9269c62b..e107b17d 100644
--- a/jobfunnel/backend/tools/filters.py
+++ b/jobfunnel/backend/tools/filters.py
@@ -2,6 +2,7 @@
 filters to reduce un-necessesary scraping
 Paul McInnis 2020
 """
+
 import logging
 from collections import namedtuple
 from copy import deepcopy
diff --git a/jobfunnel/backend/tools/tools.py b/jobfunnel/backend/tools/tools.py
index efd068ab..a198d160 100644
--- a/jobfunnel/backend/tools/tools.py
+++ b/jobfunnel/backend/tools/tools.py
@@ -1,5 +1,6 @@
 """Assorted tools for all aspects of funnelin' that don't fit elsewhere
 """
+
 import logging
 import re
 import sys
diff --git a/jobfunnel/config/base.py b/jobfunnel/config/base.py
index 2c63bf34..3369dc84 100644
--- a/jobfunnel/config/base.py
+++ b/jobfunnel/config/base.py
@@ -1,5 +1,6 @@
 """Base config object with a validator
 """
+
 from abc import ABC, abstractmethod
diff --git a/jobfunnel/config/cli.py b/jobfunnel/config/cli.py
index 09892a93..1683a93b 100644
--- a/jobfunnel/config/cli.py
+++ b/jobfunnel/config/cli.py
@@ -1,5 +1,6 @@
 """Configuration parsing module for CLI --> JobFunnelConfigManager
 """
+
 import argparse
 from typing import Dict, Any, List
 import yaml
diff --git a/jobfunnel/config/delay.py b/jobfunnel/config/delay.py
index d90b468e..c3673a9d 100644
--- a/jobfunnel/config/delay.py
+++ b/jobfunnel/config/delay.py
@@ -1,5 +1,6 @@
 """Simple config object to contain the delay configuration
 """
+
 from jobfunnel.config.base import BaseConfig
 from jobfunnel.resources import DelayAlgorithm
 from jobfunnel.resources.defaults import (
diff --git a/jobfunnel/config/manager.py b/jobfunnel/config/manager.py
index 68d84b95..7d9a8c42 100644
--- a/jobfunnel/config/manager.py
+++ b/jobfunnel/config/manager.py
@@ -1,5 +1,6 @@
 """Config object to run
 JobFunnel
 """
+
 import logging
 import os
 from typing import List, Optional
diff --git a/jobfunnel/config/proxy.py b/jobfunnel/config/proxy.py
index 9045e514..a75ad665 100644
--- a/jobfunnel/config/proxy.py
+++ b/jobfunnel/config/proxy.py
@@ -1,5 +1,6 @@
 """Proxy configuration for Session()
 """
+
 import ipaddress
 
 from jobfunnel.config import BaseConfig
diff --git a/jobfunnel/config/search.py b/jobfunnel/config/search.py
index a6da1f0e..c207eabe 100644
--- a/jobfunnel/config/search.py
+++ b/jobfunnel/config/search.py
@@ -1,5 +1,6 @@
 """Object to contain job query metadata
 """
+
 from typing import List, Optional
 from jobfunnel.config import BaseConfig
 from jobfunnel.resources import Locale, Provider, Remoteness
diff --git a/jobfunnel/config/settings.py b/jobfunnel/config/settings.py
index 67ecc581..52fedcf5 100644
--- a/jobfunnel/config/settings.py
+++ b/jobfunnel/config/settings.py
@@ -1,5 +1,6 @@
 """Settings YAML Schema w/ validator
 """
+
 import ipaddress
 
 from cerberus import Validator
diff --git a/jobfunnel/resources/defaults.py b/jobfunnel/resources/defaults.py
index 5efc5abc..ad39e682 100644
--- a/jobfunnel/resources/defaults.py
+++ b/jobfunnel/resources/defaults.py
@@ -1,6 +1,7 @@
 """Default arguments for both JobFunnelConfigManager and CLI arguments.
 NOTE: Not all defaults here are used, as we rely on YAML for demo and not kwargs
 """
+
 import os
 from pathlib import Path
 from jobfunnel.resources.enums import Locale, DelayAlgorithm, Provider, Remoteness
diff --git a/jobfunnel/resources/resources.py b/jobfunnel/resources/resources.py
index 95b10e75..03ca6e7a 100644
--- a/jobfunnel/resources/resources.py
+++ b/jobfunnel/resources/resources.py
@@ -1,8 +1,10 @@
 """String-like resouces and other constants are initialized here.
 """
+
 import datetime
 import os
 import string
+from pathlib import Path
 
 # CSV header for output CSV. do not remove anything or you'll break usr's CSV's
 # TODO: need to add short and long descriptions (breaking change)
@@ -36,13 +38,25 @@
 PRINTABLE_STRINGS = set(string.printable)
 
-# Load the user agent list once only.
-USER_AGENT_LIST_FILE = os.path.normpath(
-    os.path.join(os.path.dirname(__file__), "user_agent_list.txt")
-)
-USER_AGENT_LIST = []
-with open(USER_AGENT_LIST_FILE) as file:
-    for line in file:
-        li = line.strip()
-        if li and not li.startswith("#"):
-            USER_AGENT_LIST.append(line.rstrip("\n"))
+
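+# The desktop and mobile user-agent lists share one loader so both are
+# parsed the same way (comments and blank lines skipped).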
+def load_user_agents(file_path):
+    """Loads user agent strings from a file, skipping comments and blank lines."""
+    try:
+        with open(file_path, "r") as file:
+            return [
+                line.strip()
+                for line in file
+                if line.strip() and not line.startswith("#")
+            ]
+    except FileNotFoundError:
+        print(f"File {file_path} not found.")
+        return []
+
+
+# Define the paths
+USER_AGENT_LIST_FILE = Path(__file__).parent / "user_agent_list.txt"
+USER_AGENT_LIST_MOBILE_FILE = Path(__file__).parent / "user_agent_list_mobile.txt"
+
+# Load the lists
+USER_AGENT_LIST = load_user_agents(USER_AGENT_LIST_FILE)
+USER_AGENT_LIST_MOBILE = load_user_agents(USER_AGENT_LIST_MOBILE_FILE)
diff --git a/jobfunnel/resources/user_agent_list_mobile.txt b/jobfunnel/resources/user_agent_list_mobile.txt
new file mode 100644
index 00000000..524b188d
--- /dev/null
+++ b/jobfunnel/resources/user_agent_list_mobile.txt
@@ -0,0 +1,14 @@
+# iPhone
+Mozilla/5.0 (Apple-iPhone7C2/1202.466; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1A543 Safari/419.3 Indeed App 225.0
+Mozilla/5.0 (iPhone9,4; U; CPU iPhone OS 10_0_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14A403 Safari/602.1 Indeed App 225.0
+Mozilla/5.0 (iPhone9,3; U; CPU iPhone OS 10_0_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14A403 Safari/602.1 Indeed App 225.0
+Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A5370a Safari/604.1 Indeed App 225.0
+Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1 Indeed App 225.0
+Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1 Indeed App 225.0
+Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/13.2b11866 Mobile/16A366 Safari/605.1.15 Indeed App 225.0
+Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/69.0.3497.105 Mobile/15E148 Safari/605.1 Indeed App 225.0
+Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1 Indeed App 225.0
+Mozilla/5.0 (iPhone12,1; U; CPU iPhone OS 13_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/15E148 Safari/602.1 Indeed App 225.0
+Mozilla/5.0 (iPhone13,2; U; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/15E148 Safari/602.1 Indeed App 225.0
+Mozilla/5.0 (iPhone14,3; U; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/19A346 Safari/602.1 Indeed App 225.0
+Mozilla/5.0 (iPhone14,6; U; CPU iPhone OS 15_4 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/19E241 Safari/602.1 Indeed App 225.0
\ No newline at end of file
diff --git a/readme.md b/readme.md
index 362b9625..e66e4098 100644
--- a/readme.md
+++ b/readme.md
@@ -3,8 +3,6 @@
 
 Automated tool for scraping job postings into a `.csv` file.
 
-_[Since this project was developed, CAPTCHA has clamped down hard, help us re-build the backend and make this tool useful again!](https://github.com/PaulMcInnis/JobFunnel/discussions/148)_
-
 ### Benefits over job search sites:
 
 * Never see the same job twice!
@@ -16,7 +14,7 @@
 
 # Installation
 
-_JobFunnel requires [Python][python] 3.8 or later._
+_JobFunnel requires [Python][python] 3.11 or later._
 
 ```
 pip install git+https://github.com/PaulMcInnis/JobFunnel.git
 ```
@@ -113,5 +111,5 @@ Open the master CSV file and update the per-job `status`:
 [cron]:https://en.wikipedia.org/wiki/Cron
 [cron_doc]:docs/crontab/readme.md
 [conc_fut]:https://docs.python.org/dev/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor
-[thread]: https://docs.python.org/3.8/library/threading.html
+[thread]: https://docs.python.org/3.11/library/threading.html
 [delay_jp]:https://github.com/bunsenmurder/Notebooks/blob/master/jobFunnel/delay_algorithm.ipynb
diff --git a/requirements.txt b/requirements.txt
index f93c4ee9..1ecd9c95 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,5 @@ selenium>=3.141.0
 webdriver-manager>=2.4.0
 Cerberus>=1.3.2
 tqdm>=4.47.0
+black>=24.8.0
+pre-commit>=3.8.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 8eef2c20..a9ef2c90 100644
--- a/setup.py
+++ b/setup.py
@@ -39,7 +39,7 @@
     author_email="paulmcinnis99@gmail.com",
     url=url,
     license="MIT License",
-    python_requires=">=3.8.0",
+    python_requires=">=3.11",
     install_requires=requires,
     packages=find_packages(exclude=("tests", "docs", "images")),
     include_package_data=True,
@@ -47,6 +47,6 @@
     classifiers=[
         "License :: OSI Approved :: MIT License",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.11",
     ],
 )
diff --git a/tests/config/test_cli.py b/tests/config/test_cli.py
index a1e1a23f..893b85b3 100644
--- a/tests/config/test_cli.py
+++ b/tests/config/test_cli.py
@@ -1,5 +1,6 @@
 """Test CLI parsing --> config dict
 """
+
 import os
 import pytest
 from jobfunnel.config import parse_cli, build_config_dict
diff --git a/tests/config/test_delay.py b/tests/config/test_delay.py
index b05dd9e7..e5befac1 100644
--- a/tests/config/test_delay.py
+++ b/tests/config/test_delay.py
@@ -1,5 +1,6 @@
 """Test the DelayConfig
 """
+
 import pytest
 
 from jobfunnel.config import DelayConfig
diff --git a/tests/config/test_search.py b/tests/config/test_search.py
index c323fb94..2e929af1 100644
--- a/tests/config/test_search.py
+++ b/tests/config/test_search.py
@@ -1,5 +1,6 @@
 """Test the search config
 """
+
 import pytest
 from jobfunnel.config import SearchConfig