Skip to content

Commit

Permalink
Merge pull request #101 from open-austin/scraper-modulation
Browse files Browse the repository at this point in the history
Scraper modulation
  • Loading branch information
nicolassaw authored Sep 7, 2024
2 parents 56a06a7 + 89678c0 commit 91f21ae
Show file tree
Hide file tree
Showing 9 changed files with 549 additions and 497 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ Source: https://docs.python.org/3/library/venv.html#how-venvs-work

Note: Again, you'll need to activate venv _every time you want to work in the codebase_.

If the above doesn't work, try these instructions for creating and activating a virtual environment:
1. Navigate to your project directory: cd [insert file path]
2. Create a virtual environment: python -m venv venv
3. Activate the virtual environment (Windows Command Prompt): .\venv\Scripts\activate.bat

### Install python dependencies

Using `pip`, install the project dependencies.
Expand Down
2 changes: 1 addition & 1 deletion src/cleaner/__main__.py → src/cleaner/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from azure.cosmos import CosmosClient, exceptions
from dotenv import load_dotenv

class cleaner:
class Cleaner:

def __init__(self, county):
self.county = county.lower()
Expand Down
12 changes: 6 additions & 6 deletions src/orchestrator/__main__.py → src/orchestrator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,18 @@
sys.path.append(parent_dir)

# Import all of the programs modules within the parent_dir
from scraper.__main__ import scraper
from parser.__main__ import parser
from cleaner.__main__ import cleaner
from updater.__main__ import updater
from scraper import scraper
from parser import parser
from cleaner import cleaner
from updater import updater

class orchestrator:
def __init__(self):
#Sets our base parameters
self.counties = []
self.start_date = '2024-07-01' #Update start date here
self.end_date = '2024-07-01' #Update start date here
def orchestrate(self):
def orchestrate(self, test):

#This open the county data CSV to see which counties should be scraped, parsed, cleaned, and updated.
with open(
Expand All @@ -35,7 +35,7 @@ def orchestrate(self):
#This runs the different modules in order
for c in self.counties:
print(f"Starting to scrape, parse, clean, and update this county: {c}")
scraper(county = c).scrape() #src/scraper
scraper(test = test, county = c).scrape() #src/scraper
parser(c).parse() #src/parser
cleaner(c).clean() #src/cleaner
updater(c).update() #src/updater
Expand Down
5 changes: 3 additions & 2 deletions src/parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def get_class_and_method(self, county):
return None, None

def get_directories(self, county, test):
#TODO: Check for dependencies. Raise if county is missing.
if not test:
case_html_path = os.path.join(
os.path.dirname(__file__), "..", "..", "data", county, "case_html"
Expand Down Expand Up @@ -158,10 +159,10 @@ def parse(self, county, case_number, test): #remove the test value here and just
# Handle the case where parser_instance or parser_function is None
print("Error: Could not obtain parser instance or function.")

#Adds county field to data
# Adds county field to data
case_data['county'] = county

#Adds a hash to the JSON file of the underlying HTML
# Adds a hash to the JSON file of the underlying HTML
body = case_soup.find("body")
balance_table = body.find_all("table")[-1]
if "Balance Due" in balance_table.text:
Expand Down
Loading

0 comments on commit 91f21ae

Please sign in to comment.