
Commit

Feature/updating indeed scraper (#166) (#170)
* Feature/updating indeed scraper (#166)

* - Updated to mobile endpoints and user agents to prevent CAPTCHA
- Updated parsing of indeed scraper
- Fixed tags not being parsed correctly
- Fixed remoteness not being parsed correctly
- Changed to only scrape the first page of each search by default for speed

* - Updated method of loading user agent files
- Updated user agent file of indeed scraper

* - Updated versions in requirements.txt
- Added in black configuration file for formatting
- Added a pre-commit hook so all contributors will have consistent
  formatting on upload
- Updated all python files to conform to black formatter

* Updated Python version

* More black formatting updates

* - Added prettierrc and prettierignore
- Formatted all files other than python

* Updated prettierignore so prettier can search through subdirectories

* Reset formatting to longer line width

* Reverted to previous commit

* Updating again to longer line width after accounting for missing files

* Updated prettierrc and prettierignore files and reran formatting

* Updated version

* - Reverted Markdown changes
- Reverted settings_USA changes
- Updated readme
- Removed extra user-agent from phone user agents list
- Removed extra comments

* Changed readme to refer to Python 3.11 instead of 3.8, and added the mobile user agent list to MANIFEST.in
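
For context on the CAPTCHA bullet above, a minimal sketch of the mobile user-agent idea: pick a random agent from the bundled user_agent_list_mobile.txt and send it with each request. The helper name and the endpoint shown are illustrative assumptions, not the scraper's actual code.

```python
import random
from pathlib import Path

def pick_mobile_user_agent(path: str = "user_agent_list_mobile.txt") -> str:
    """Illustrative helper: choose a random mobile user agent from the list."""
    agents = [
        line.strip()
        for line in Path(path).read_text().splitlines()
        if line.strip()
    ]
    return random.choice(agents)

# Usage sketch (endpoint shown for illustration only):
# import requests
# headers = {"User-Agent": pick_mobile_user_agent()}
# requests.get("https://www.indeed.com/m/jobs?q=python", headers=headers)
```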
sammytheindi authored Sep 17, 2024
1 parent edf149b commit 72faea2
Showing 30 changed files with 286 additions and 74 deletions.
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,9 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 24.8.0 # Replace this with the version of Black you want to use
+    hooks:
+      - id: black
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: "v3.1.0" # Specify Prettier version
+    hooks:
+      - id: prettier
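
With this hook config in place, contributors would typically run `pip install pre-commit` once, then `pre-commit install` in the repository root; after that, Black and Prettier run automatically on every commit, and `pre-commit run --all-files` formats the whole tree on demand.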
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,5 +1,6 @@
 include jobfunnel/demo/settings.yaml
 include jobfunnel/demo/demo.png
 include jobfunnel/resources/user_agent_list.txt
+include jobfunnel/resources/user_agent_list_mobile.txt
 include readme.md
 include LICENSE
1 change: 0 additions & 1 deletion demo/settings_USA.yaml
@@ -22,7 +22,6 @@ search:
   # FIXME: we need to add back GLASSDOOR when that's working again.
   providers:
     - INDEED
-    - MONSTER
 
   # Region that we are searching for jobs within:
   province_or_state: "Texas" # NOTE: this is generally 2 characters long.
3 changes: 2 additions & 1 deletion jobfunnel/__init__.py
@@ -1,3 +1,4 @@
"""JobFunnel base package init, we keep module version here.
"""
__version__ = "3.0.2"

__version__ = "4.0.0"
40 changes: 35 additions & 5 deletions jobfunnel/backend/job.py
@@ -1,6 +1,7 @@
"""Base Job class to be populated by Scrapers, manipulated by Filters and saved
to csv / etc by Exporter
"""

from copy import deepcopy
from datetime import date, datetime
from typing import Dict, List, Optional
@@ -132,7 +133,7 @@ def update_if_newer(self, job: "Job") -> bool:
         Returns:
             True if we updated self with job, False if we didn't
         """
-        if job.post_date > self.post_date:
+        if job.post_date >= self.post_date:
             # Update all attrs other than status (which user can set).
             self.company = deepcopy(job.company)
             self.location = deepcopy(job.location)
@@ -152,6 +153,7 @@ def update_if_newer(self, job: "Job") -> bool:
             # pylint: disable=protected-access
             self._raw_scrape_data = deepcopy(job._raw_scrape_data)
             # pylint: enable=protected-access
+
             return True
         else:
             return False
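
The `>` to `>=` change above means a scraped job whose post date ties the stored one now overwrites it, so re-scraping the same posting refreshes its details. A minimal sketch of just the comparison, using a hypothetical stand-in class rather than the real `Job`:

```python
from datetime import date

class JobStub:
    """Hypothetical stand-in for Job; only the fields the check needs."""
    def __init__(self, title: str, post_date: date) -> None:
        self.title = title
        self.post_date = post_date

existing = JobStub("Data Engineer", date(2024, 9, 1))
incoming = JobStub("Data Engineer (re-scraped)", date(2024, 9, 1))

print(incoming.post_date > existing.post_date)   # False: old check ignored ties
print(incoming.post_date >= existing.post_date)  # True: new check updates on ties
```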
@@ -187,7 +189,7 @@ def as_row(self) -> Dict[str, str]:
                 self.location,
                 self.post_date.strftime("%Y-%m-%d"),
                 self.description,
-                ", ".join(self.tags),
+                "\n".join(self.tags),
                 self.url,
                 self.key_id,
                 self.provider,
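
Presumably the newline join keeps tags unambiguous inside an already comma-heavy CSV cell. Note that `read_master_csv` (below) still splits the tags column on commas, so the write and read sides of that round trip are worth checking together.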
@@ -210,9 +212,11 @@ def as_json_entry(self) -> Dict[str, str]:
"title": self.title,
"company": self.company,
"post_date": self.post_date.strftime("%Y-%m-%d"),
"description": (self.description[:MAX_BLOCK_LIST_DESC_CHARS] + "..")
if len(self.description) > MAX_BLOCK_LIST_DESC_CHARS
else (self.description),
"description": (
(self.description[:MAX_BLOCK_LIST_DESC_CHARS] + "..")
if len(self.description) > MAX_BLOCK_LIST_DESC_CHARS
else (self.description)
),
"status": self.status.name,
}

@@ -243,3 +247,29 @@ def validate(self) -> None:
         assert self.url, "URL is unset!"
         if len(self.description) < MIN_DESCRIPTION_CHARS:
             raise ValueError("Description too short!")
+
+    def __repr__(self) -> str:
+        """Developer-friendly representation of the Job object."""
+        return (
+            f"Job("
+            f"title='{self.title}', "
+            f"company='{self.company}', "
+            f"location='{self.location}', "
+            f"status={self.status.name}, "
+            f"post_date={self.post_date}, "
+            f"url='{self.url}')"
+        )
+
+    def __str__(self) -> str:
+        """Human-readable string representation of the Job object."""
+        return (
+            f"Job Title: {self.title}\n"
+            f"Company: {self.company}\n"
+            f"Location: {self.location}\n"
+            f"Post Date: {self.post_date.strftime('%Y-%m-%d') if self.post_date else 'N/A'}\n"
+            f"Status: {self.status.name}\n"
+            f"Wage: {self.wage if self.wage else 'N/A'}\n"
+            f"Remoteness: {self.remoteness if self.remoteness else 'N/A'}\n"
+            f"Description (truncated): {self.description[:100]}{'...' if len(self.description) > 100 else ''}\n"
+            f"URL: {self.url}\n"
+        )
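
A note on the two added methods: `__repr__` produces a compact single-line form for logs and debugging, while `__str__` builds a multi-line human-readable summary, falling back to 'N/A' for an unset wage, remoteness, or post date and truncating the description to 100 characters.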
6 changes: 5 additions & 1 deletion jobfunnel/backend/jobfunnel.py
@@ -1,6 +1,7 @@
"""Scrapes jobs, applies search filters and writes pickles to master list
Paul McInnis 2020
"""

import csv
import json
import os
@@ -230,7 +231,9 @@ def scrape(self) -> Dict[str, Job]:
             try:
                 incoming_jobs_dict = scraper.scrape()
             except Exception as e:
-                self.logger.error(f"Failed to scrape jobs for {scraper_cls.__name__}")
+                self.logger.error(
+                    f"Failed to scrape jobs for {scraper_cls.__name__}: {e}"
+                )
 
             # Ensure we have no duplicates between our scrapers by key-id
             # (since we are updating the jobs dict with results)
@@ -425,6 +428,7 @@ def read_master_csv(self) -> Dict[str, Job]:
                 short_description=short_description,
                 post_date=post_date,
                 scrape_date=scrape_date,
+                wage=wage,
                 raw=raw,
                 tags=row["tags"].split(","),
                 remoteness=remoteness,
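
Presumably the added `wage=wage` keyword means a wage read back from the master CSV is now actually passed into the reconstructed `Job` rather than being dropped on reload.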
1 change: 1 addition & 0 deletions jobfunnel/backend/scrapers/base.py
@@ -1,6 +1,7 @@
"""The base scraper class to be used for all web-scraping emitting Job objects
Paul McInnis 2020
"""

import random
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor, as_completed
1 change: 1 addition & 0 deletions jobfunnel/backend/scrapers/glassdoor.py
@@ -1,6 +1,7 @@
"""Scraper for www.glassdoor.X
FIXME: this is currently unable to get past page 1 of job results.
"""

import re
from abc import abstractmethod
from concurrent.futures import ThreadPoolExecutor, wait