
Commit

Feature/updating indeed scraper (#166) (#170)
* Feature/updating indeed scraper (#166)

* - Updated to mobile endpoints and user agents to prevent CAPTCHA
- Updated parsing of indeed scraper
- Fixed tags not being parsed correctly
- Fixed remoteness not being parsed correctly
- Changed to only scrape the first page of each search by default for speed

* - Updated method of loading user agent files
- Updated user agent file of indeed scraper

* - Updated versions in requirements.txt
- Added in black configuration file for formatting
- Added a pre-commit hook so all contributors will have consistent
  formatting on upload
- Updated all python files to conform to black formatter

* Updated Python version

* More black formatting updates

* - Added prettierrc and prettierignore
- Formatted all files other than python

* Updated prettierignore so prettier can search through subdirectories

* Reset formatting to longer line width

* Reverted to previous commit

* Updating again to longer line width after accounting for missing files

* Updated prettierrc and prettierignore files and reran formatting

* Updated version

* - Reverted Markdown changes
- Reverted settings_USA changes
- Updated readme
- Removed extra user-agent from phone user agents list
- Removed extra comments

* Changed readme to refer to Python 3.11 instead of 3.8, and added the mobile user agent list to MANIFEST.in
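
For context on the CAPTCHA bullet above, a minimal sketch of the mobile user-agent idea: pick a random agent from the bundled user_agent_list_mobile.txt and send it with each request. The helper name and the endpoint shown are illustrative assumptions, not the scraper's actual code.

```python
import random
from pathlib import Path

def pick_mobile_user_agent(path: str = "user_agent_list_mobile.txt") -> str:
    """Illustrative helper: choose a random mobile user agent from the list."""
    agents = [
        line.strip()
        for line in Path(path).read_text().splitlines()
        if line.strip()
    ]
    return random.choice(agents)

# Usage sketch (endpoint shown for illustration only):
# import requests
# headers = {"User-Agent": pick_mobile_user_agent()}
# requests.get("https://www.indeed.com/m/jobs?q=python", headers=headers)
```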
sammytheindi authored Sep 17, 2024
1 parent edf149b commit 72faea2
Showing 30 changed files with 286 additions and 74 deletions.
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,9 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 24.8.0 # Replace this with the version of Black you want to use
+    hooks:
+      - id: black
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: "v3.1.0" # Specify Prettier version
+    hooks:
+      - id: prettier
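
With this hook config in place, contributors would typically run `pip install pre-commit` once, then `pre-commit install` in the repository root; after that, Black and Prettier run automatically on every commit, and `pre-commit run --all-files` formats the whole tree on demand.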
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,5 +1,6 @@
 include jobfunnel/demo/settings.yaml
 include jobfunnel/demo/demo.png
 include jobfunnel/resources/user_agent_list.txt
+include jobfunnel/resources/user_agent_list_mobile.txt
 include readme.md
 include LICENSE
1 change: 0 additions & 1 deletion demo/settings_USA.yaml
@@ -22,7 +22,6 @@ search:
   # FIXME: we need to add back GLASSDOOR when that's working again.
   providers:
     - INDEED
-    - MONSTER
 
   # Region that we are searching for jobs within:
   province_or_state: "Texas" # NOTE: this is generally 2 characters long.
3 changes: 2 additions & 1 deletion jobfunnel/__init__.py
@@ -1,3 +1,4 @@
"""JobFunnel base package init, we keep module version here.
"""
__version__ = "3.0.2"

__version__ = "4.0.0"
40 changes: 35 additions & 5 deletions jobfunnel/backend/job.py
@@ -1,6 +1,7 @@
"""Base Job class to be populated by Scrapers, manipulated by Filters and saved
to csv / etc by Exporter
"""

from copy import deepcopy
from datetime import date, datetime
from typing import Dict, List, Optional
@@ -132,7 +133,7 @@ def update_if_newer(self, job: "Job") -> bool:
         Returns:
             True if we updated self with job, False if we didn't
         """
-        if job.post_date > self.post_date:
+        if job.post_date >= self.post_date:
             # Update all attrs other than status (which user can set).
             self.company = deepcopy(job.company)
             self.location = deepcopy(job.location)
@@ -152,6 +153,7 @@ def update_if_newer(self, job: "Job") -> bool:
             # pylint: disable=protected-access
             self._raw_scrape_data = deepcopy(job._raw_scrape_data)
             # pylint: enable=protected-access
+
             return True
         else:
             return False
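
The `>` to `>=` change above means a scraped job whose post date ties the stored one now overwrites it, so re-scraping the same posting refreshes its details. A minimal sketch of just the comparison, using a hypothetical stand-in class rather than the real `Job`:

```python
from datetime import date

class JobStub:
    """Hypothetical stand-in for Job; only the fields the check needs."""
    def __init__(self, title: str, post_date: date) -> None:
        self.title = title
        self.post_date = post_date

existing = JobStub("Data Engineer", date(2024, 9, 1))
incoming = JobStub("Data Engineer (re-scraped)", date(2024, 9, 1))

print(incoming.post_date > existing.post_date)   # False: old check ignored ties
print(incoming.post_date >= existing.post_date)  # True: new check updates on ties
```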
@@ -187,7 +189,7 @@ def as_row(self) -> Dict[str, str]:
                 self.location,
                 self.post_date.strftime("%Y-%m-%d"),
                 self.description,
-                ", ".join(self.tags),
+                "\n".join(self.tags),
                 self.url,
                 self.key_id,
                 self.provider,
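
Presumably the newline join keeps tags unambiguous inside an already comma-heavy CSV cell. Note that `read_master_csv` (below) still splits the tags column on commas, so the write and read sides of that round trip are worth checking together.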
@@ -210,9 +212,11 @@ def as_json_entry(self) -> Dict[str, str]:
"title": self.title,
"company": self.company,
"post_date": self.post_date.strftime("%Y-%m-%d"),
"description": (self.description[:MAX_BLOCK_LIST_DESC_CHARS] + "..")
if len(self.description) > MAX_BLOCK_LIST_DESC_CHARS
else (self.description),
"description": (
(self.description[:MAX_BLOCK_LIST_DESC_CHARS] + "..")
if len(self.description) > MAX_BLOCK_LIST_DESC_CHARS
else (self.description)
),
"status": self.status.name,
}

@@ -243,3 +247,29 @@ def validate(self) -> None:
         assert self.url, "URL is unset!"
         if len(self.description) < MIN_DESCRIPTION_CHARS:
             raise ValueError("Description too short!")
+
+    def __repr__(self) -> str:
+        """Developer-friendly representation of the Job object."""
+        return (
+            f"Job("
+            f"title='{self.title}', "
+            f"company='{self.company}', "
+            f"location='{self.location}', "
+            f"status={self.status.name}, "
+            f"post_date={self.post_date}, "
+            f"url='{self.url}')"
+        )
+
+    def __str__(self) -> str:
+        """Human-readable string representation of the Job object."""
+        return (
+            f"Job Title: {self.title}\n"
+            f"Company: {self.company}\n"
+            f"Location: {self.location}\n"
+            f"Post Date: {self.post_date.strftime('%Y-%m-%d') if self.post_date else 'N/A'}\n"
+            f"Status: {self.status.name}\n"
+            f"Wage: {self.wage if self.wage else 'N/A'}\n"
+            f"Remoteness: {self.remoteness if self.remoteness else 'N/A'}\n"
+            f"Description (truncated): {self.description[:100]}{'...' if len(self.description) > 100 else ''}\n"
+            f"URL: {self.url}\n"
+        )
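
A note on the two added methods: `__repr__` produces a compact single-line form for logs and debugging, while `__str__` builds a multi-line human-readable summary, falling back to 'N/A' for an unset wage, remoteness, or post date and truncating the description to 100 characters.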
6 changes: 5 additions & 1 deletion jobfunnel/backend/jobfunnel.py
@@ -1,6 +1,7 @@
"""Scrapes jobs, applies search filters and writes pickles to master list
Paul McInnis 2020
"""

import csv
import json
import os
@@ -230,7 +231,9 @@ def scrape(self) -> Dict[str, Job]:
             try:
                 incoming_jobs_dict = scraper.scrape()
             except Exception as e:
-                self.logger.error(f"Failed to scrape jobs for {scraper_cls.__name__}")
+                self.logger.error(
+                    f"Failed to scrape jobs for {scraper_cls.__name__}: {e}"
+                )
 
             # Ensure we have no duplicates between our scrapers by key-id
             # (since we are updating the jobs dict with results)
@@ -425,6 +428,7 @@ def read_master_csv(self) -> Dict[str, Job]:
                 short_description=short_description,
                 post_date=post_date,
                 scrape_date=scrape_date,
+                wage=wage,
                 raw=raw,
                 tags=row["tags"].split(","),
                 remoteness=remoteness,
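
Presumably the added `wage=wage` keyword means a wage read back from the master CSV is now actually passed into the reconstructed `Job` rather than being dropped on reload.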
1 change: 1 addition & 0 deletions jobfunnel/backend/scrapers/base.py
@@ -1,6 +1,7 @@
"""The base scraper class to be used for all web-scraping emitting Job objects
Paul McInnis 2020
"""

import random
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor, as_completed
1 change: 1 addition & 0 deletions jobfunnel/backend/scrapers/glassdoor.py
@@ -1,6 +1,7 @@
"""Scraper for www.glassdoor.X
FIXME: this is currently unable to get past page 1 of job results.
"""

import re
from abc import abstractmethod
from concurrent.futures import ThreadPoolExecutor, wait