Merge branch 'main' into github_actions
Matt343 authored Sep 15, 2024
2 parents 5c7d42d + 66f1f46 commit 599210d
Showing 2 changed files with 34 additions and 3 deletions.
16 changes: 13 additions & 3 deletions src/cleaner/__init__.py
```diff
@@ -1,12 +1,19 @@
-import json, argparse, os, datetime as dt, xxhash
-from azure.cosmos import CosmosClient, exceptions
-from dotenv import load_dotenv
+import json
+import os
+import datetime as dt
+import xxhash
 
 class Cleaner:
 
     def __init__(self, county):
         self.county = county.lower()
 
+    def add_parsing_date(self, input_dict: dict, out_file: dict) -> dict:
+        # This will add the date of parsing to the final cleaned json file
+        today_date = dt.datetime.today().strftime('%Y-%m-%d')
+        out_file['parsing_date'] = today_date
+        return out_file
+
     def clean(self):
 
         case_json_folder_path = os.path.join(
@@ -112,6 +119,9 @@ def contains_good_motion(motion, event):
         def_atty_hash = xxhash.xxh64(str(def_atty_unique_str)).hexdigest()
         out_file["defense attorney"] = def_atty_hash
 
+        # This adds the date of parsing to the final cleaned json
+        out_file = self.add_parsing_date(input_dict, out_file)
+
         # Original Format
         out_filepath = os.path.join(
             os.path.dirname(__file__), "..", "..", "data", self.county, "case_json_cleaned",case_json
```
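
For context, a minimal sketch of what the new `add_parsing_date` method does to a cleaned case record. The county name and dict contents below are illustrative, and `input_dict` is accepted but not used by the method as committed:

```python
import datetime as dt

class Cleaner:
    def __init__(self, county):
        self.county = county.lower()

    def add_parsing_date(self, input_dict: dict, out_file: dict) -> dict:
        # Stamp the cleaned record with the date it was parsed.
        # Note: input_dict is accepted but unused in the committed version.
        today_date = dt.datetime.today().strftime('%Y-%m-%d')
        out_file['parsing_date'] = today_date
        return out_file

# Illustrative usage: the cleaned record gains a 'parsing_date' key.
cleaner = Cleaner("Hays")  # hypothetical county name
out_file = cleaner.add_parsing_date({}, {"defense attorney": "9a3f..."})
print(out_file)  # e.g. {'defense attorney': '9a3f...', 'parsing_date': '2024-09-15'}
```
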
21 changes: 21 additions & 0 deletions src/scraper/README.md
@@ -0,0 +1,21 @@
```mermaid
graph TD
A[scrape] --> B[set_defaults: Initialize default values for parameters like county, wait time, dates, and case details]
B --> C[configure_logger: Set up logging for the scraping process]
C --> D[format_county: Normalize the county name to ensure consistent processing]
D --> E[create_session: Create a web session object for handling HTTP requests]
E --> F[make_directories: Create directories for storing scraped case data, if not already provided]
F --> G[get_ody_link: Retrieve base URL and Odyssey version information based on county]
G --> H[scrape_main_page: Fetch and parse the main page of the county's court site]
G <--> O[county_csv]
H --> I[scrape_search_page: Navigate to the search page and extract relevant content]
I --> J[get_hidden_values: Extract hidden form values required for subsequent searches]
J --> K{Is case_number provided?}
K -- Yes --> L[scrape_individual_case: Scrape data for a specific case number provided by the user]
L --> Q[county-specific scraper]
K -- No --> M[scrape_jo_list: Retrieve a list of judicial officers between the start and end dates]
M --> N[scrape_multiple_cases: Scrape data for multiple cases based on judicial officers and date range]
N -- loop through Judicial Officers per Day in Range --> R[county-specific scraper]
```
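
Read as code, the diagram corresponds roughly to the orchestration below. This is a hypothetical sketch: only the step names come from the diagram, and every signature and stub body is an assumption rather than the scraper's real API.

```python
# Hypothetical sketch of the pipeline in the diagram above. Only the step
# names come from the diagram; all signatures and stub bodies are assumed.
import logging

def set_defaults(county, start_date, end_date):
    # Fill in defaults for county, wait time, and the date range.
    return county or "hays", 0.5, start_date or "2024-01-01", end_date or "2024-01-02"

def configure_logger():
    logging.basicConfig(level=logging.INFO)
    return logging.getLogger("scraper")

def format_county(county):
    return county.lower().strip()  # normalize for consistent processing

def create_session():
    return object()  # stands in for an HTTP session object

def make_directories(county):
    return f"data/{county}/case_html"  # where scraped case data is stored

def get_ody_link(county):
    # The real step looks these up in the county CSV (county_csv in the diagram).
    return "https://example.test/", "2017"

def scrape_main_page(session, base_url):
    return "<main page html>"

def scrape_search_page(session, main_page):
    return "<search page html>"

def get_hidden_values(search_page):
    return {"__VIEWSTATE": "..."}  # hidden form values needed for searches

def scrape_individual_case(session, hidden_values, case_number):
    pass  # hands off to the county-specific scraper

def scrape_jo_list(search_page, start_date, end_date):
    return ["Judge A", "Judge B"]  # judicial officers active in the range

def scrape_multiple_cases(session, hidden_values, officers, start_date, end_date):
    pass  # loops through judicial officers per day in the range

def scrape(county=None, case_number=None, start_date=None, end_date=None):
    county, wait_time, start_date, end_date = set_defaults(county, start_date, end_date)
    logger = configure_logger()
    county = format_county(county)
    session = create_session()
    make_directories(county)
    base_url, odyssey_version = get_ody_link(county)
    main_page = scrape_main_page(session, base_url)
    search_page = scrape_search_page(session, main_page)
    hidden_values = get_hidden_values(search_page)
    if case_number:  # the "Is case_number provided?" branch
        scrape_individual_case(session, hidden_values, case_number)
    else:
        officers = scrape_jo_list(search_page, start_date, end_date)
        scrape_multiple_cases(session, hidden_values, officers, start_date, end_date)
```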
