Feature/updating indeed scraper (#166) #170

Merged · 3 commits · Sep 17, 2024
Changes from 2 commits
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,9 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 24.8.0 # Replace this with the version of Black you want to use
+    hooks:
+      - id: black
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: "v3.1.0" # Specify Prettier version
+    hooks:
+      - id: prettier
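
Note for reviewers: this config pins Black 24.8.0 and Prettier v3.1.0 as pre-commit hooks, which explains the purely cosmetic hunks later in this diff (e.g. the reparenthesized conditional in as_json_entry). A minimal sketch, assuming Black is installed locally, of previewing what the hook will do from Python via Black's documented format_str API:

import black

# Illustrative input only: Black normalizes spacing and quote style.
messy = "x = {'a':1,  'b' :2}"
print(black.format_str(messy, mode=black.Mode()))
# -> x = {"a": 1, "b": 2}
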
1 change: 0 additions & 1 deletion demo/settings_USA.yaml
@@ -22,7 +22,6 @@ search:
   # FIXME: we need to add back GLASSDOOR when that's working again.
   providers:
     - INDEED
-    - MONSTER
 
   # Region that we are searching for jobs within:
   province_or_state: "Texas" # NOTE: this is generally 2 characters long.
3 changes: 2 additions & 1 deletion jobfunnel/__init__.py
@@ -1,3 +1,4 @@
 """JobFunnel base package init, we keep module version here.
 """
-__version__ = "3.0.2"
+
+__version__ = "4.0.0"
40 changes: 35 additions & 5 deletions jobfunnel/backend/job.py
@@ -1,6 +1,7 @@
 """Base Job class to be populated by Scrapers, manipulated by Filters and saved
 to csv / etc by Exporter
 """
+
 from copy import deepcopy
 from datetime import date, datetime
 from typing import Dict, List, Optional
@@ -132,7 +133,7 @@ def update_if_newer(self, job: "Job") -> bool:
         Returns:
             True if we updated self with job, False if we didn't
         """
-        if job.post_date > self.post_date:
+        if job.post_date >= self.post_date:
             # Update all attrs other than status (which user can set).
             self.company = deepcopy(job.company)
             self.location = deepcopy(job.location)
@@ -152,6 +153,7 @@ def update_if_newer(self, job: "Job") -> bool:
             # pylint: disable=protected-access
             self._raw_scrape_data = deepcopy(job._raw_scrape_data)
             # pylint: enable=protected-access
+
             return True
         else:
             return False
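
The operator change from > to >= means a re-scraped job posted on the same date now overwrites the stored copy instead of being skipped. A toy illustration of the comparison semantics (plain dates, not the real Job objects):

from datetime import date

stored = date(2024, 9, 1)    # post_date already in the master list
incoming = date(2024, 9, 1)  # same posting scraped again later

print(incoming > stored)   # False -> old behavior never refreshed same-day jobs
print(incoming >= stored)  # True  -> new behavior updates with the latest scrape
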
@@ -187,7 +189,7 @@ def as_row(self) -> Dict[str, str]:
                 self.location,
                 self.post_date.strftime("%Y-%m-%d"),
                 self.description,
-                ", ".join(self.tags),
+                "\n".join(self.tags),
                 self.url,
                 self.key_id,
                 self.provider,
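
One reviewer-level caveat: as_row now joins tags with newlines, but read_master_csv (later in this diff) still splits the tags column on commas, so newline-joined tags appear not to round-trip unless the reader is updated to match. The csv module does keep embedded newlines inside a single quoted cell, as this standalone sketch shows:

import csv
import io

# Standalone sketch: a newline-joined tag list stays in one quoted CSV cell,
# which spreadsheet apps render as multiple lines within the cell.
buf = io.StringIO()
csv.writer(buf).writerow(["\n".join(["python", "remote", "senior"])])
print(repr(buf.getvalue()))  # '"python\nremote\nsenior"\r\n'
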
@@ -210,9 +212,11 @@ def as_json_entry(self) -> Dict[str, str]:
             "title": self.title,
             "company": self.company,
             "post_date": self.post_date.strftime("%Y-%m-%d"),
-            "description": (self.description[:MAX_BLOCK_LIST_DESC_CHARS] + "..")
-            if len(self.description) > MAX_BLOCK_LIST_DESC_CHARS
-            else (self.description),
+            "description": (
+                (self.description[:MAX_BLOCK_LIST_DESC_CHARS] + "..")
+                if len(self.description) > MAX_BLOCK_LIST_DESC_CHARS
+                else (self.description)
+            ),
             "status": self.status.name,
         }

@@ -243,3 +247,29 @@ def validate(self) -> None:
         assert self.url, "URL is unset!"
         if len(self.description) < MIN_DESCRIPTION_CHARS:
             raise ValueError("Description too short!")
+
+    def __repr__(self) -> str:
+        """Developer-friendly representation of the Job object."""
+        return (
+            f"Job("
+            f"title='{self.title}', "
+            f"company='{self.company}', "
+            f"location='{self.location}', "
+            f"status={self.status.name}, "
+            f"post_date={self.post_date}, "
+            f"url='{self.url}')"
+        )
+
+    def __str__(self) -> str:
+        """Human-readable string representation of the Job object."""
+        return (
+            f"Job Title: {self.title}\n"
+            f"Company: {self.company}\n"
+            f"Location: {self.location}\n"
+            f"Post Date: {self.post_date.strftime('%Y-%m-%d') if self.post_date else 'N/A'}\n"
+            f"Status: {self.status.name}\n"
+            f"Wage: {self.wage if self.wage else 'N/A'}\n"
+            f"Remoteness: {self.remoteness if self.remoteness else 'N/A'}\n"
+            f"Description (truncated): {self.description[:100]}{'...' if len(self.description) > 100 else ''}\n"
+            f"URL: {self.url}\n"
+        )
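
The two new methods serve different audiences: __repr__ is the compact one-line form shown in logs, debuggers, and inside lists, while __str__ is the multi-line form produced by print(job). A self-contained stand-in (not the real jobfunnel Job class, whose constructor signature is not shown in this diff) that mirrors the split so both call paths can be run side by side:

class DemoJob:
    """Illustrative stand-in mirroring the repr/str split added above."""

    def __init__(self, title: str, company: str) -> None:
        self.title = title
        self.company = company

    def __repr__(self) -> str:
        # One-line developer view, used in logs and when printing containers.
        return f"DemoJob(title='{self.title}', company='{self.company}')"

    def __str__(self) -> str:
        # Multi-line human-readable summary, used by print() and str().
        return f"Job Title: {self.title}\nCompany: {self.company}"

job = DemoJob("Backend Developer", "Initech")
print(repr(job))
print(job)

Note that printing a list of jobs still uses __repr__ for the elements, which is why defining both is worthwhile.
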
6 changes: 5 additions & 1 deletion jobfunnel/backend/jobfunnel.py
@@ -1,6 +1,7 @@
 """Scrapes jobs, applies search filters and writes pickles to master list
 Paul McInnis 2020
 """
+
 import csv
 import json
 import os
@@ -230,7 +231,9 @@ def scrape(self) -> Dict[str, Job]:
             try:
                 incoming_jobs_dict = scraper.scrape()
             except Exception as e:
-                self.logger.error(f"Failed to scrape jobs for {scraper_cls.__name__}")
+                self.logger.error(
+                    f"Failed to scrape jobs for {scraper_cls.__name__}: {e}"
+                )
 
             # Ensure we have no duplicates between our scrapers by key-id
             # (since we are updating the jobs dict with results)
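
Appending {e} finally surfaces why a scraper failed rather than just which one. A small self-contained sketch of the pattern (the scraper name is illustrative); note that logger.exception, not used in this PR, would additionally record the traceback:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")

try:
    raise ConnectionError("HTTP 403 from provider")
except Exception as e:
    # Pattern from this hunk: name the scraper and include the exception text.
    logger.error(f"Failed to scrape jobs for IndeedScraper: {e}")
    # logger.exception("...") here would also capture the full traceback.
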
@@ -425,6 +428,7 @@ def read_master_csv(self) -> Dict[str, Job]:
                 short_description=short_description,
                 post_date=post_date,
                 scrape_date=scrape_date,
+                wage=wage,
                 raw=raw,
                 tags=row["tags"].split(","),
                 remoteness=remoteness,
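
The added wage=wage forwards a value that read_master_csv already parsed but previously dropped, so wages now survive a save/reload cycle. A miniature of that round-trip with illustrative field names:

import csv
import io

# Parse a row, then forward every parsed field to the constructor kwargs.
# Before this hunk, 'wage' was parsed but never passed on to Job().
reader = csv.DictReader(io.StringIO('title,wage\nDeveloper,"$40 / hr"\n'))
row = next(reader)
job_kwargs = dict(title=row["title"], wage=row["wage"])  # wage now included
print(job_kwargs)  # {'title': 'Developer', 'wage': '$40 / hr'}
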
1 change: 1 addition & 0 deletions jobfunnel/backend/scrapers/base.py
@@ -1,6 +1,7 @@
 """The base scraper class to be used for all web-scraping emitting Job objects
 Paul McInnis 2020
 """
+
 import random
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor, as_completed
1 change: 1 addition & 0 deletions jobfunnel/backend/scrapers/glassdoor.py
@@ -1,6 +1,7 @@
 """Scraper for www.glassdoor.X
 FIXME: this is currently unable to get past page 1 of job results.
 """
+
 import re
 from abc import abstractmethod
 from concurrent.futures import ThreadPoolExecutor, wait