Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Setup github actions #146

Merged
merged 18 commits into from
Sep 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# This workflow installs Python dependencies, lints with flake8, runs the unit
# tests, builds the Sphinx documentation and uploads it as a Pages artifact,
# all with a single version of Python.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Indigent Defense Stats

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
    - name: Set up Python 3.12
      # v5 is the current major release of setup-python; v3 targets a retired Node runtime.
      uses: actions/setup-python@v5
      with:
        python-version: "3.12"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with unittest
      # SKIP_SLOW=true makes the test module skip tests decorated as slow.
      run: |
        SKIP_SLOW=true python -m unittest discover -v -s ./src/tester
    - name: Build documentation
      run: |
        sphinx-build -b html docs build
    - uses: actions/upload-pages-artifact@v3.0.1
      with:
        path: build/
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,5 @@ data/
.~lock.*

.DS_Store

docs/generated
31 changes: 31 additions & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

import os
import sys

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = 'indigent-defense-stats'
copyright = '2024, Open Austin'
author = 'Open Austin'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

# Put the sibling ``src`` directory on the import path so autodoc/autosummary
# can import the documented packages. The path is anchored to this file's
# location instead of the current working directory, so it resolves to the
# same directory no matter where the build is launched from.
sys.path.insert(
    0,
    os.path.abspath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'src')
    ),
)

extensions = ['sphinx.ext.autodoc', 'sphinx.ext.autosummary']

templates_path = ['_templates']
# NOTE(review): Sphinx matches exclude_patterns against paths relative to the
# documentation source directory, so 'src/tester' probably matches nothing
# here -- confirm the intent.
exclude_patterns = ['src/tester']

# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'alabaster'
html_static_path = ['_static']
21 changes: 21 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
.. indigent-defense-stats documentation master file, created by
sphinx-quickstart on Sun Sep 15 15:44:02 2024.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
indigent-defense-stats documentation
====================================

Add your content using ``reStructuredText`` syntax. See the
`reStructuredText <https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html>`_
documentation for details.

.. autosummary::
:toctree: generated

cleaner
orchestrator
parser
scraper
tools
updater
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ python-dotenv == 1.0.1
requests == 2.32.3
retry == 0.9.2
statistics == 1.0.3.5
xxhash == 3.5.0
xxhash == 3.5.0
flake8 == 7.1.0
Sphinx == 8.0.2
510 changes: 255 additions & 255 deletions resources/texas_county_data.csv

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from . import cleaner
from . import orchestrator
from . import parser
from . import scraper
from . import tools
from . import updater
20 changes: 8 additions & 12 deletions src/orchestrator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,18 @@
import sys, os, csv

# Appends the parent directory of this handler script to the sys.path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

# Import all of the program's modules within the parent_dir
from scraper import scraper
from parser import parser
from cleaner import cleaner
from updater import updater
import scraper
import parser
import cleaner
import updater

class orchestrator:
class Orchestrator:
def __init__(self):
#Sets our base parameters
self.counties = []
self.start_date = '2024-07-01' #Update start date here
self.end_date = '2024-07-01' #Update start date here
def orchestrate(self, test):
def orchestrate(self, test: bool = False):

#This opens the county data CSV to see which counties should be scraped, parsed, cleaned, and updated.
with open(
Expand All @@ -41,4 +36,5 @@ def orchestrate(self, test):
updater(c).update() #src/updater
print(f"Completed with scraping, parsing, cleaning, and updating of this county: {c}")

orchestrator().orchestrate()
if __name__ == '__main__':
Orchestrator().orchestrate()
27 changes: 17 additions & 10 deletions src/parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def get_directories(self, county, test):

def get_list_of_html(self, case_html_path, case_number, county, test):
# This will loop through the html in the folder they were scraped to.
os.makedirs(case_html_path, exist_ok=True)
case_html_list = os.listdir(case_html_path)

# However, if an optional case number is passed to the function, then read in the case number html file from the data folder
Expand Down Expand Up @@ -104,15 +105,16 @@ def write_json_data(self, case_json_path, case_number, case_data, test):
file_handle.write(json.dumps(case_data))

def write_error_log(self, county, case_number):
    """Record a case number that failed to parse in the county's error log.

    The log file is ``cases_with_parsing_error.txt`` inside
    ``data/<county>``, located two directory levels above this module's
    directory. The directory is created if it does not exist yet.

    Args:
        county: Name of the county sub-directory under ``data``.
        case_number: Case number string to record; written on its own line.
    """
    basepath = os.path.join(
        os.path.dirname(__file__),
        "..",
        "..",
        "data",
        county,
    )
    os.makedirs(basepath, exist_ok=True)
    # Append instead of truncating: opening with "w" wiped the log on every
    # call, so only the most recent failing case number was ever kept.
    with open(
        os.path.join(basepath, "cases_with_parsing_error.txt"),
        "a",
    ) as file_handle:
        file_handle.write(case_number + "\n")
Expand Down Expand Up @@ -148,8 +150,13 @@ def parse(self, county, case_number, test): #remove the test value here and just
case_html_file_path = self.get_html_path(case_html_path, case_html_file_name, case_number, test)

print(f"{case_number} - parsing")
with open(case_html_file_path, "r") as file_handle:
case_soup = BeautifulSoup(file_handle, "html.parser", from_encoding="UTF-8")
# strip out invalid utf-8 characters
with open(case_html_file_path, "r", encoding='utf-8', errors='ignore') as file_handle:
try:
case_soup = BeautifulSoup(file_handle, "html.parser", from_encoding="UTF-8")
except Exception as e:
print(f'error building beautiful soup for file {case_html_file_path}, {e}')
raise e

# Get the county-specific parser class and method
parser_instance, parser_function = self.get_class_and_method(county=county)
Expand Down
1 change: 1 addition & 0 deletions src/scraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import importlib

class Scraper:
"""Scrape Odyssey html files into an output folder"""
def __init__(self):
pass

Expand Down
3 changes: 3 additions & 0 deletions src/tester/test_unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from cleaner import Cleaner
from updater import Updater

SKIP_SLOW = os.getenv('SKIP_SLOW', 'false').lower().strip() == 'true'

def log(message, level='INFO'): # Provide message and info level (optional, defaulting to info)
# configure the logger
log = logging.getLogger(name="pid: " + str(os.getpid()))
Expand Down Expand Up @@ -252,6 +254,7 @@ def test_scrape_results_page(self,
#def scrape_case_data_pre2017()
#def scrape_case_data_post2017()

@unittest.skipIf(SKIP_SLOW, "slow")
def test_scrape_multiple_cases(self,
county = 'hays',
odyssey_version = 2003,
Expand Down
Loading