Skip to content

Commit

Permalink
Merge pull request #4594 from freelawproject/update_cap_command
Browse files: browse the repository at this point in the history
Update cap crosswalk command
  • Loading branch information
mlissner authored Oct 18, 2024
2 parents a0c1465 + 11fe9ac commit 8e4aac4
Showing 1 changed file with 33 additions and 12 deletions.
45 changes: 33 additions & 12 deletions cl/search/management/commands/generate_cap_crosswalk.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
from botocore.exceptions import ClientError
from django.conf import settings
from django.core.management.base import BaseCommand
from eyecite.find import get_citations
from eyecite.models import FullCaseCitation
from tqdm import tqdm

from cl.lib.command_utils import CommandUtils
Expand Down Expand Up @@ -61,6 +59,11 @@ def add_arguments(self, parser):
help="Directory to save crosswalk files",
required=True,
)
parser.add_argument(
"--start-from-reporter",
type=str,
help="Process starting from this reporter slug",
)

def handle(self, *args: Any, **options: Any) -> None:
if options["verbose"]:
Expand All @@ -70,6 +73,7 @@ def handle(self, *args: Any, **options: Any) -> None:
self.single_reporter = options["reporter"]
self.single_volume = options["volume"]
self.output_dir = options["output_dir"]
self.start_from_reporter = options["start_from_reporter"]

if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
Expand Down Expand Up @@ -111,6 +115,27 @@ def generate_complete_crosswalk(self) -> None:
r for r in reporters if r["short_name"] == self.single_reporter
]

# Optionally start partway through the reporter list: when a slug was
# supplied, keep only that reporter and the ones listed after it.
if self.start_from_reporter:
    reporter_item_index = next(
        (
            index
            for index, item in enumerate(reporters)
            if item["slug"] == self.start_from_reporter
        ),
        None,
    )
    # Compare against None explicitly: index 0 (the first reporter in the
    # list) is a valid match but is falsy, so a bare truthiness test would
    # wrongly report it as an invalid slug.
    if reporter_item_index is None:
        # Invalid reporter slug
        raise ValueError(
            f"Invalid reporter slug to start from: {self.start_from_reporter}"
        )
    logger.info(f"Starting from reporter: {self.start_from_reporter}")
    reporters = reporters[reporter_item_index:]
    # The starting point has been applied; clear it so it is not reused.
    self.start_from_reporter = None

for i, reporter in enumerate(
tqdm(reporters, desc="Processing reporters")
):
Expand Down Expand Up @@ -164,10 +189,6 @@ def generate_crosswalk_for_reporter(
logger.info(
f"Match found: CAP ID {case_meta['id']} -> CL ID {cl_case.id}"
)
else:
logger.info(
f"No match found for CAP ID {case_meta['id']}"
)
else:
logger.warning(
f"Invalid case metadata for CAP ID {case_meta['id']}"
Expand All @@ -181,7 +202,7 @@ def generate_crosswalk_for_reporter(
)

logger.info(
f"Processed {self.total_cases_processed} cases for {reporter_name}, found {self.total_matches_found} matches"
f"Processed {self.total_cases_processed} cases for {reporter_name}({reporter_slug}), found {self.total_matches_found} matches"
)

def fetch_volumes_for_reporter(self, reporter_slug: str) -> List[str]:
Expand Down Expand Up @@ -235,17 +256,17 @@ def find_matching_case(
cap_case_id = str(case_meta["id"])
page = str(case_meta["first_page"])

query = f"{reporter_slug}.{volume}/{page}.{cap_case_id}"
query = f"law.free.cap.{reporter_slug}.{volume}/{page}.{cap_case_id}.json"
logger.debug(f"Searching for: {query}")

# Exact match of the file path in this format, e.g.:
# law.free.cap.wis-2d.369/658.6776082.json
matching_cluster = OpinionCluster.objects.filter(
filepath_json_harvard__icontains=query
filepath_json_harvard=query
).first()

if matching_cluster:
logger.info(
f"Match found: CAP ID {cap_case_id} -> CL ID {matching_cluster.id}"
)
# Match found, return object
return matching_cluster
else:
logger.info(
Expand Down

0 comments on commit 8e4aac4

Please sign in to comment.