Merge branch 'main' into github_actions
Matt343 authored Sep 15, 2024
2 parents 5c7d42d + 66f1f46 commit 599210d
Showing 2 changed files with 34 additions and 3 deletions.
16 changes: 13 additions & 3 deletions src/cleaner/__init__.py
```diff
@@ -1,12 +1,19 @@
-import json, argparse, os, datetime as dt, xxhash
-from azure.cosmos import CosmosClient, exceptions
-from dotenv import load_dotenv
+import json
+import os
+import datetime as dt
+import xxhash
 
 class Cleaner:
 
     def __init__(self, county):
         self.county = county.lower()
 
+    def add_parsing_date(self, input_dict: dict, out_file: dict) -> dict:
+        # This will add the date of parsing to the final cleaned json file
+        today_date = dt.datetime.today().strftime('%Y-%m-%d')
+        out_file['parsing_date'] = today_date
+        return out_file
+
     def clean(self):
 
         case_json_folder_path = os.path.join(
@@ -112,6 +119,9 @@ def contains_good_motion(motion, event):
         def_atty_hash = xxhash.xxh64(str(def_atty_unique_str)).hexdigest()
         out_file["defense attorney"] = def_atty_hash
 
+        # This adds the date of parsing to the final cleaned json
+        out_file = self.add_parsing_date(input_dict, out_file)
+
         # Original Format
         out_filepath = os.path.join(
             os.path.dirname(__file__), "..", "..", "data", self.county, "case_json_cleaned",case_json
```
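
For context, a minimal sketch of what the new `add_parsing_date` method does to a cleaned case record. The county name and dict contents below are illustrative, and `input_dict` is accepted but not used by the method as committed:

```python
import datetime as dt

class Cleaner:
    def __init__(self, county):
        self.county = county.lower()

    def add_parsing_date(self, input_dict: dict, out_file: dict) -> dict:
        # Stamp the cleaned record with the date it was parsed.
        # Note: input_dict is accepted but unused in the committed version.
        today_date = dt.datetime.today().strftime('%Y-%m-%d')
        out_file['parsing_date'] = today_date
        return out_file

# Illustrative usage: the cleaned record gains a 'parsing_date' key.
cleaner = Cleaner("Hays")  # hypothetical county name
out_file = cleaner.add_parsing_date({}, {"defense attorney": "9a3f..."})
print(out_file)  # e.g. {'defense attorney': '9a3f...', 'parsing_date': '2024-09-15'}
```
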
21 changes: 21 additions & 0 deletions src/scraper/README.md
@@ -0,0 +1,21 @@
```mermaid
graph TD
A[scrape] --> B[set_defaults: Initialize default values for parameters like county, wait time, dates, and case details]
B --> C[configure_logger: Set up logging for the scraping process]
C --> D[format_county: Normalize the county name to ensure consistent processing]
D --> E[create_session: Create a web session object for handling HTTP requests]
E --> F[make_directories: Create directories for storing scraped case data, if not already provided]
F --> G[get_ody_link: Retrieve base URL and Odyssey version information based on county]
G --> H[scrape_main_page: Fetch and parse the main page of the county's court site]
G <--> O[county_csv]
H --> I[scrape_search_page: Navigate to the search page and extract relevant content]
I --> J[get_hidden_values: Extract hidden form values required for subsequent searches]
J --> K{Is case_number provided?}
K -- Yes --> L[scrape_individual_case: Scrape data for a specific case number provided by the user]
L --> Q[county-specific scraper]
K -- No --> M[scrape_jo_list: Retrieve a list of judicial officers between the start and end dates]
M --> N[scrape_multiple_cases: Scrape data for multiple cases based on judicial officers and date range]
N -- loop through Judicial Officers per Day in Range --> R[county-specific scraper]
```
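
Read as code, the diagram corresponds roughly to the orchestration below. This is a hypothetical sketch: only the step names come from the diagram, and every signature and stub body is an assumption rather than the scraper's real API.

```python
# Hypothetical sketch of the pipeline in the diagram above. Only the step
# names come from the diagram; all signatures and stub bodies are assumed.
import logging

def set_defaults(county, start_date, end_date):
    # Fill in defaults for county, wait time, and the date range.
    return county or "hays", 0.5, start_date or "2024-01-01", end_date or "2024-01-02"

def configure_logger():
    logging.basicConfig(level=logging.INFO)
    return logging.getLogger("scraper")

def format_county(county):
    return county.lower().strip()  # normalize for consistent processing

def create_session():
    return object()  # stands in for an HTTP session object

def make_directories(county):
    return f"data/{county}/case_html"  # where scraped case data is stored

def get_ody_link(county):
    # The real step looks these up in the county CSV (county_csv in the diagram).
    return "https://example.test/", "2017"

def scrape_main_page(session, base_url):
    return "<main page html>"

def scrape_search_page(session, main_page):
    return "<search page html>"

def get_hidden_values(search_page):
    return {"__VIEWSTATE": "..."}  # hidden form values needed for searches

def scrape_individual_case(session, hidden_values, case_number):
    pass  # hands off to the county-specific scraper

def scrape_jo_list(search_page, start_date, end_date):
    return ["Judge A", "Judge B"]  # judicial officers active in the range

def scrape_multiple_cases(session, hidden_values, officers, start_date, end_date):
    pass  # loops through judicial officers per day in the range

def scrape(county=None, case_number=None, start_date=None, end_date=None):
    county, wait_time, start_date, end_date = set_defaults(county, start_date, end_date)
    logger = configure_logger()
    county = format_county(county)
    session = create_session()
    make_directories(county)
    base_url, odyssey_version = get_ody_link(county)
    main_page = scrape_main_page(session, base_url)
    search_page = scrape_search_page(session, main_page)
    hidden_values = get_hidden_values(search_page)
    if case_number:  # the "Is case_number provided?" branch
        scrape_individual_case(session, hidden_values, case_number)
    else:
        officers = scrape_jo_list(search_page, start_date, end_date)
        scrape_multiple_cases(session, hidden_values, officers, start_date, end_date)
```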
