Skip to content

Commit

Permalink
Merge branch 'main' into scraper-cleanup-updates
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolassaw authored Sep 21, 2024
2 parents d639fae + 45e3bf6 commit 71e35a3
Show file tree
Hide file tree
Showing 15 changed files with 408 additions and 26 deletions.
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* @ids-core
45 changes: 45 additions & 0 deletions .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Indigent Defense Stats

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

# Least-privilege token: this workflow only needs to read repository contents.
permissions:
  contents: read

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.12
        uses: actions/setup-python@v3
        with:
          python-version: "3.12"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # requirements.txt is optional; install from it only when present
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with unittest
        run: |
          # SKIP_SLOW is read by the test suite to gate long-running cases
          SKIP_SLOW=true python -m unittest discover -v -s ./src/tester
      - name: Build documentation
        run: |
          # Render the Sphinx docs (docs/conf.py) into build/ as HTML
          sphinx-build -b html docs build
      # Publish the rendered docs as a GitHub Pages artifact
      - uses: actions/upload-pages-artifact@v3.0.1
        with:
          path: build/
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,5 @@ data/
.~lock.*

.DS_Store

docs/generated
31 changes: 31 additions & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import sys
import os
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = 'indigent-defense-stats'
copyright = '2024, Open Austin'
author = 'Open Austin'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

# Make the project's src/ tree importable so autodoc can resolve the modules
# listed in index.rst (cleaner, orchestrator, parser, scraper, tools, updater).
sys.path.insert(0, os.path.abspath('../src'))

# autodoc extracts docstrings from code; autosummary generates the stub pages
# referenced by the ":toctree: generated" directive in index.rst.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.autosummary']

templates_path = ['_templates']
# NOTE(review): exclude_patterns is matched relative to this docs/ directory,
# so 'src/tester' likely matches nothing here — confirm the intended exclusion.
exclude_patterns = ['src/tester']



# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'alabaster'
html_static_path = ['_static']
21 changes: 21 additions & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
.. indigent-defense-stats documentation master file, created by
sphinx-quickstart on Sun Sep 15 15:44:02 2024.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
indigent-defense-stats documentation
====================================

Add your content using ``reStructuredText`` syntax. See the
`reStructuredText <https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html>`_
documentation for details.

.. autosummary::
:toctree: generated

cleaner
orchestrator
parser
scraper
tools
updater
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ python-dotenv == 1.0.1
requests == 2.32.3
retry == 0.9.2
statistics == 1.0.3.5
xxhash == 3.5.0
xxhash == 3.5.0
flake8 == 7.1.0
Sphinx == 8.0.2
6 changes: 6 additions & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from . import cleaner
from . import orchestrator
from . import parser
from . import scraper
from . import tools
from . import updater
100 changes: 100 additions & 0 deletions src/cleaner/Data Structure of Cleaned JSON.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
## Data Structure of the Cleaned Cases JSON

```mermaid
graph TB
subgraph CaseInformation[Case Information Summary]
style CaseInformation fill:#d3a8e2,stroke:#333,stroke-width:2px
A1[County: Hays]
A2[Cause Number Hash: dsqn91cn1odmo]
A3[Odyssey ID: Redacted]
A4[Date Filed: 01/01/2015]
A5[Location: 22nd District Court]
A6[Version: 1]
A7[Parsing Date: 2024-01-01]
end
subgraph PartyInformation[Party Information]
style PartyInformation fill:#d3a8e2,stroke:#333,stroke-width:2px
subgraph DefendantInfoBox[Defendant Info]
style DefendantInfoBox fill:#b0d4f1,stroke:#333,stroke-width:2px
D8[Defendant Info: Redacted]
end
subgraph RepresentationInfo[Defense Attorney Info]
style RepresentationInfo fill:#b0d4f1,stroke:#333,stroke-width:2px
B1[Defense Attorney Hash: 9083bb693e33919c]
B2[Appointed or Retained: Court Appointed]
end
end
subgraph Events[Event Information]
style Events fill:#d3a8e2,stroke:#333,stroke-width:2px
subgraph EvidenceofRep[Representation Evidence]
style EvidenceofRep fill:#b0d4f1,stroke:#333,stroke-width:2px
B3[Has Evidence of Representation: No]
end
end
subgraph ChargeInformation[Charge Information]
style ChargeInformation fill:#d3a8e2,stroke:#333,stroke-width:2px
subgraph Charge1[Aggravated Assault with a Deadly Weapon]
style Charge1 fill:#b0d4f1,stroke:#333,stroke-width:2px
C1[Statute: 22.02a2]
C2[Level: Second Degree Felony]
C3[Date: 10/25/2015]
C4[Charge Name: Aggravated Assault with a Deadly Weapon]
C5[Description: Aggravated Assault]
C6[Category: Violent]
C7[UCCS Code: 1200]
end
subgraph Charge2[Resisting Arrest]
style Charge2 fill:#b0d4f1,stroke:#333,stroke-width:2px
C8[Statute: 38.03]
C9[Level: Class A Misdemeanor]
C10[Date: 10/25/2015]
C11[Charge Name: Resisting Arrest]
C12[Description: Resisting Arrest]
end
E3[Charges Dismissed: 1]
end
subgraph TopCharge[Top Charge]
style TopCharge fill:#b0d4f1,stroke:#333,stroke-width:2px
E1[Charge Name: Aggravated Assault with a Deadly Weapon]
E2[Charge Level: Second Degree Felony]
end
subgraph Dispositions[Dispositions]
style Dispositions fill:#d3a8e2,stroke:#333,stroke-width:2px
subgraph Disposition1[Disposition Details]
style Disposition1 fill:#b0d4f1,stroke:#333,stroke-width:2px
D1[Date: 12/06/2016]
D2[Event: Disposition]
D3[Outcome: Deferred Adjudication]
D4[Sentence Length: 1 Year]
end
subgraph Disposition2[Resisting Arrest Disposition]
style Disposition2 fill:#b0d4f1,stroke:#333,stroke-width:2px
D5[Date: 12/06/2016]
D6[Event: Disposition]
D7[Outcome: Dismissed]
end
end
CaseInformation --> PartyInformation
CaseInformation --> ChargeInformation
CaseInformation --> Dispositions
CaseInformation --> Events
ChargeInformation --> TopCharge
```
16 changes: 13 additions & 3 deletions src/cleaner/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
import json, argparse, os, datetime as dt, xxhash
from azure.cosmos import CosmosClient, exceptions
from dotenv import load_dotenv
import json
import os
import datetime as dt
import xxhash

class Cleaner:

def __init__(self, county):
self.county = county.lower()

def add_parsing_date(self, input_dict: dict, out_file: dict) -> dict:
# This will add the date of parsing to the final cleaned json file
today_date = dt.datetime.today().strftime('%Y-%m-%d')
out_file['parsing_date'] = today_date
return out_file

def clean(self):

case_json_folder_path = os.path.join(
Expand Down Expand Up @@ -112,6 +119,9 @@ def contains_good_motion(motion, event):
def_atty_hash = xxhash.xxh64(str(def_atty_unique_str)).hexdigest()
out_file["defense attorney"] = def_atty_hash

# This adds the date of parsing to the final cleaned json
out_file = self.add_parsing_date(input_dict, out_file)

# Original Format
out_filepath = os.path.join(
os.path.dirname(__file__), "..", "..", "data", self.county, "case_json_cleaned",case_json
Expand Down
20 changes: 8 additions & 12 deletions src/orchestrator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,18 @@
import sys, os, csv

# Appends the parent directory of this handler script to the sys.path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

# Import all of the programs modules within the parent_dir
from scraper import scraper
from parser import parser
from cleaner import cleaner
from updater import updater
import scraper
import parser
import cleaner
import updater

class orchestrator:
class Orchestrator:
    def __init__(self):
        # Sets our base parameters for a pipeline run.
        self.counties = []  # county names to process
        self.start_date = '2024-07-01' #Update start date here
        self.end_date = '2024-07-01' #Update end date here (comment previously said "start date")
def orchestrate(self, test):
def orchestrate(self, test: bool = False):

#This open the county data CSV to see which counties should be scraped, parsed, cleaned, and updated.
with open(
Expand All @@ -41,4 +36,5 @@ def orchestrate(self, test):
updater(c).update() #src/updater
print(f"Completed with scraping, parsing, cleaning, and updating of this county: {c}")

orchestrator().orchestrate()
# Run the full pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    Orchestrator().orchestrate()
Loading

0 comments on commit 71e35a3

Please sign in to comment.