From b4c1cae2c278c07589855c16aad0975b0413ee1e Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Thu, 5 Dec 2024 10:04:57 -0400 Subject: [PATCH 01/67] feat(scrapers): Removes Solr indexing for cloned items --- .../management/commands/clone_from_cl.py | 47 ------------------- 1 file changed, 47 deletions(-) diff --git a/cl/scrapers/management/commands/clone_from_cl.py b/cl/scrapers/management/commands/clone_from_cl.py index bc16ad0ecf..f1311e2b59 100644 --- a/cl/scrapers/management/commands/clone_from_cl.py +++ b/cl/scrapers/management/commands/clone_from_cl.py @@ -37,13 +37,6 @@ manage.py clone_from_cl --type people_db.Person --id 4173 --clone-person-positions manage.py clone_from_cl --type search.Docket --id 5377675 --clone-person-positions -Also, you can decide whether the cloned objects should be indexed in solr or not, -this only applies for OpinionCluster and Docket objects (In the future this will need -to be replaced with elasticsearch), for example: - -manage.py clone_from_cl --type search.OpinionCluster --id 1814616 --add-to-solr - - This is still work in progress, some data is not cloned yet. 
""" @@ -67,7 +60,6 @@ from cl.audio.models import Audio from cl.people_db.models import Person from cl.search.models import Citation, Court, Docket, Opinion, RECAPDocument -from cl.search.tasks import add_items_to_solr VALID_TYPES = ( "search.OpinionCluster", @@ -117,7 +109,6 @@ def clone_opinion_cluster( cluster_ids: list, download_cluster_files: bool, add_docket_entries: bool, - add_to_solr: bool = False, person_positions: bool = False, object_type="search.OpinionCluster", ): @@ -129,7 +120,6 @@ def clone_opinion_cluster( :param download_cluster_files: True if it should download cluster files :param add_docket_entries: flag to clone docket entries and recap docs :param person_positions: True if we should clone person positions - :param add_to_solr: True if we should add objects to solr :param object_type: OpinionCluster app name with model name :return: list of opinion cluster objects """ @@ -168,7 +158,6 @@ def clone_opinion_cluster( False, False, person_positions, - add_to_solr, )[0] citation_data = cluster_datum["citations"] panel_data = cluster_datum["panel"] @@ -334,16 +323,6 @@ def clone_opinion_cluster( reverse("view_case", args=[opinion_cluster.pk, docket.slug]), ) - if add_to_solr: - # Add opinions to search engine - add_items_to_solr.delay(added_opinions_ids, "search.Opinion") - - if add_to_solr: - # Add opinion clusters to search engine - add_items_to_solr.delay( - [oc.pk for oc in opinion_clusters], "search.OpinionCluster" - ) - return opinion_clusters @@ -354,7 +333,6 @@ def clone_docket( add_audio_files: bool, add_clusters: bool, person_positions: bool = False, - add_to_solr: bool = False, object_type="search.Docket", ): """Download docket data from courtlistener.com and add it to local @@ -369,7 +347,6 @@ def clone_docket( cloning a docket :param person_positions: True is we should clone person positions :param person_positions: True is we should clone person positions - :param add_to_solr: True if we should add objects to solr :param object_type: 
Docket app name with model name :return: list of docket objects """ @@ -492,10 +469,6 @@ def clone_docket( ), ) - if add_to_solr: - # Add dockets to search engine - add_items_to_solr.delay([doc.pk for doc in dockets], "search.Docket") - return dockets @@ -870,7 +843,6 @@ def clone_person( session: Session, people_ids: list, positions=False, - add_to_solr: bool = False, object_type="people_db.Person", ): """Download person data from courtlistener.com and add it to local @@ -879,7 +851,6 @@ def clone_person( :param session: a Requests session :param people_ids: a list of person ids :param positions: True if we should clone person positions - :param add_to_solr: True if we should add objects to solr :param object_type: Person app name with model name :return: list of person objects """ @@ -964,12 +935,6 @@ def clone_person( with transaction.atomic(): clone_position(session, position_ids, person_id) - if add_to_solr: - # Add people to search engine - add_items_to_solr.delay( - [person.pk for person in people], "people_db.Person" - ) - return people @@ -1068,7 +1033,6 @@ def __init__(self, *args, **kwargs): self.add_docket_entries = False self.add_audio_files = False self.clone_person_positions = False - self.add_to_solr = False self.s = requests.session() self.s.headers = { @@ -1138,20 +1102,12 @@ def add_arguments(self, parser): "calls.", ) - parser.add_argument( - "--add-to-solr", - action="store_true", - default=False, - help="Add cloned objects to solr search engine.", - ) - def handle(self, *args, **options): self.type = options.get("type") self.ids = options.get("ids") self.download_cluster_files = options.get("download_cluster_files") self.add_docket_entries = options.get("add_docket_entries") self.clone_person_positions = options.get("clone_person_positions") - self.add_to_solr = options.get("add_to_solr") if not os.environ.get("CL_API_TOKEN"): self.stdout.write("Error: CL_API_TOKEN not set in .env file") @@ -1171,7 +1127,6 @@ def handle(self, *args, 
**options): self.download_cluster_files, self.add_docket_entries, self.clone_person_positions, - self.add_to_solr, self.type, ) case "search.Docket": @@ -1182,7 +1137,6 @@ def handle(self, *args, **options): options["add_audio_files"], options["add_clusters"], self.clone_person_positions, - self.add_to_solr, self.type, ) case "people_db.Person": @@ -1190,7 +1144,6 @@ def handle(self, *args, **options): self.s, self.ids, self.clone_person_positions, - self.add_to_solr, self.type, ) case "search.Court": From cb1b45dba2637b2f0ec63951437f61a9c0c24658 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Thu, 5 Dec 2024 10:23:27 -0400 Subject: [PATCH 02/67] feat(search): Removes command for indexing data to Solr --- .../management/commands/cl_update_index.py | 404 ------------------ 1 file changed, 404 deletions(-) delete mode 100644 cl/search/management/commands/cl_update_index.py diff --git a/cl/search/management/commands/cl_update_index.py b/cl/search/management/commands/cl_update_index.py deleted file mode 100644 index ec7978fbfb..0000000000 --- a/cl/search/management/commands/cl_update_index.py +++ /dev/null @@ -1,404 +0,0 @@ -import ast -import sys -from typing import Iterable - -from django.apps import apps -from django.conf import settings -from requests import Session - -from cl.lib.argparse_types import valid_date_time -from cl.lib.celery_utils import CeleryThrottle -from cl.lib.command_utils import VerboseCommand -from cl.lib.scorched_utils import ExtraSolrInterface -from cl.lib.timer import print_timing -from cl.people_db.models import Person -from cl.search.models import Docket -from cl.search.tasks import add_items_to_solr, delete_items - -VALID_OBJ_TYPES = ( - "audio.Audio", - "people_db.Person", - "search.Opinion", - "search.RECAPDocument", - "search.Docket", -) - - -def proceed_with_deletion(out, count, noinput): - """ - Checks whether we want to proceed to delete (lots of) items - """ - if noinput: - return True - - proceed = True - out.write("\n") - 
yes_or_no = input( - f"WARNING: Are you **sure** you want to delete all {count} items? [y/N] " - ) - out.write("\n") - if not yes_or_no.lower().startswith("y"): - out.write("No action taken.\n") - proceed = False - - if count > 10000 and proceed is True: - # Double check...something might be off. - yes_or_no = input( - "Are you sure? There are an awful lot of items here? [y/N] " - ) - if not yes_or_no.lower().startswith("y"): - out.write("No action taken.\n") - proceed = False - - return proceed - - -class Command(VerboseCommand): - help = ( - "Adds, updates, deletes items in an index, committing changes and " - "optimizing it, if requested." - ) - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.solr_url = None - self.si = None - self.verbosity = None - self.options = [] - self.type = None - self.noinput = None - - def add_arguments(self, parser): - parser.add_argument( - "--type", - type=str, - choices=VALID_OBJ_TYPES, - help="Because the Solr indexes are loosely bound to the database, " - "commands require that the correct model is provided in this " - "argument. Current choices are %s" % ", ".join(VALID_OBJ_TYPES), - ) - parser.add_argument( - "--solr-url", - type=str, - help="When swapping cores, it can be valuable to use a temporary " - "Solr URL, overriding the default value that's in the " - "settings, e.g., http://127.0.0.1:8983/solr/swap_core", - ) - parser.add_argument( - "--noinput", - action="store_true", - help="Do NOT prompt the user for input of any kind. Useful in " - "tests, but can disable important warnings.", - ) - parser.add_argument( - "--queue", - type=str, - default="celery", - help="The celery queue where the tasks should be processed.", - ) - - actions_group = parser.add_mutually_exclusive_group() - actions_group.add_argument( - "--update", - action="store_true", - default=False, - help="Run the command in update mode. 
Use this to add or update " - "items.", - ) - actions_group.add_argument( - "--delete", - action="store_true", - default=False, - help="Run the command in delete mode. Use this to remove items " - "from the index. Note that this will not delete items from " - "the index that do not continue to exist in the database.", - ) - - parser.add_argument( - "--optimize", - action="store_true", - default=False, - help="Run the optimize command against the current index after " - "any updates or deletions are completed.", - ) - parser.add_argument( - "--optimize-everything", - action="store_true", - default=False, - help="Optimize all indexes that are registered with Solr.", - ) - parser.add_argument( - "--do-commit", - action="store_true", - default=False, - help="Performs a simple commit and nothing more.", - ) - - act_upon_group = parser.add_mutually_exclusive_group() - act_upon_group.add_argument( - "--everything", - action="store_true", - default=False, - help="Take action on everything in the database", - ) - act_upon_group.add_argument( - "--query", - type=str, - help="Take action on items fulfilling a query. 
Queries should be " - "formatted as Python dicts such as: \"{'court_id':'haw'}\"", - ) - act_upon_group.add_argument( - "--items", - type=int, - nargs="*", - help="Take action on a list of items using a single " - "Celery task", - ) - act_upon_group.add_argument( - "--datetime", - type=valid_date_time, - help="Take action on items newer than a date (YYYY-MM-DD) or a " - "date and time (YYYY-MM-DD HH:MM:SS)", - ) - - parser.add_argument( - "--start-at", - type=int, - default=0, - help="For use with the --everything flag, skip this many items " - "before starting the processing.", - ) - - def handle(self, *args, **options): - super().handle(*args, **options) - self.verbosity = int(options.get("verbosity", 1)) - self.options = options - self.noinput = options["noinput"] - if not self.options["optimize_everything"]: - self.solr_url = options["solr_url"] - self.si = ExtraSolrInterface(self.solr_url, mode="rw") - self.type = options["type"] - - if options["update"]: - if self.verbosity >= 1: - self.stdout.write("Running in update mode...\n") - if options.get("everything"): - self.add_or_update_all() - elif options.get("datetime"): - self.add_or_update_by_datetime(options["datetime"]) - elif options.get("query"): - self.stderr.write("Updating by query not implemented.") - sys.exit(1) - elif options.get("items"): - self.add_or_update(*options["items"]) - - elif options.get("delete"): - if self.verbosity >= 1: - self.stdout.write("Running in deletion mode...\n") - if options.get("everything"): - self.delete_all() - elif options.get("datetime"): - self.delete_by_datetime(options["datetime"]) - elif options.get("query"): - self.delete_by_query(options["query"]) - elif options.get("items"): - self.delete(*options["items"]) - - if options.get("do_commit"): - self.si.commit() - - if options.get("optimize"): - self.optimize() - - if options.get("optimize_everything"): - self.optimize_everything() - - self.si.conn.http_connection.close() - if not any( - [ - options["update"], - 
options.get("delete"), - options.get("do_commit"), - options.get("optimize"), - options.get("optimize_everything"), - ] - ): - self.stderr.write( - "Error: You must specify whether you wish to " - "update, delete, commit, or optimize your " - "index.\n" - ) - sys.exit(1) - - def process_queryset(self, iterable: Iterable, count: int) -> None: - """Chunks the queryset passed in, and dispatches it to Celery for - adding to the index. - - :param iterable: An iterable of items to add to Solr. - :param count: The number of items that will be processed. - """ - # The count to send in a single Celery task - chunk_size = 100 - - queue = self.options["queue"] - start_at = self.options["start_at"] - # Set low throttle. Higher values risk crashing Redis. - throttle = CeleryThrottle(queue_name=queue) - processed_count = 0 - chunk = [] - for item in iterable: - processed_count += 1 - if processed_count < start_at: - continue - last_item = count == processed_count - chunk.append(item) - if processed_count % chunk_size == 0 or last_item: - throttle.maybe_wait() - add_items_to_solr.apply_async( - args=(chunk, self.type), queue=queue - ) - chunk = [] - sys.stdout.write( - "\rProcessed {}/{} ({:.0%})".format( - processed_count, count, processed_count * 1.0 / count - ) - ) - self.stdout.flush() - self.stdout.write("\n") - - @print_timing - def delete(self, items): - """ - Given a list of items, delete them. - """ - self.stdout.write(f"Deleting items(s): {items}\n") - delete_items.delay(items, self.type) - - def delete_all(self): - """ - Deletes all items from the index. - """ - count = self.si.query("*").add_extra(caller="cl_update_index").count() - - if proceed_with_deletion(self.stdout, count, self.noinput): - self.stdout.write( - "Removing all items from your index because you said so.\n" - ) - self.stdout.write(" Marking all items as deleted...\n") - self.si.delete_all() - self.stdout.write(" Committing the deletion...\n") - self.si.commit() - self.stdout.write( - f"\nDone. 
The index located at: {self.solr_url}\nis now empty.\n" - ) - - @print_timing - def delete_by_datetime(self, dt): - """ - Given a datetime, deletes all items in the index newer than that time. - - Relies on the items still being in the database. - """ - model = apps.get_model(self.type) - qs = ( - model.objects.filter(date_created__gt=dt) - .order_by() - .values_list("pk", flat=True) - ) - count = qs.count() - if proceed_with_deletion(self.stdout, count, self.noinput): - self.stdout.write(f"Deleting all item(s) newer than {dt}\n") - self.si.delete(list(qs)) - self.si.commit() - - @print_timing - def delete_by_query(self, query): - """ - Given a query, deletes all the items that match that query. - """ - query_dict = ast.literal_eval(query) - count = self.si.query(self.si.Q(**query_dict)).count() - if proceed_with_deletion(self.stdout, count, self.noinput): - self.stdout.write( - f"Deleting all item(s) that match the query: {query}\n" - ) - self.si.delete(queries=self.si.Q(**query_dict)) - self.si.commit() - - @print_timing - def add_or_update(self, *items): - """ - Given an item, adds it to the index, or updates it if it's already - in the index. - """ - self.stdout.write(f"Adding or updating item(s): {list(items)}\n") - add_items_to_solr(items, self.type) - - @print_timing - def add_or_update_by_datetime(self, dt): - """ - Given a datetime, adds or updates all items newer than that time. - """ - self.stdout.write(f"Adding or updating items(s) newer than {dt}\n") - model = apps.get_model(self.type) - qs = ( - model.objects.filter(date_created__gte=dt) - .order_by() - .values_list("pk", flat=True) - ) - count = qs.count() - qs = qs.iterator() - self.process_queryset(qs, count) - - @print_timing - def add_or_update_all(self): - """ - Iterates over the entire corpus, adding it to the index. Can be run on - an empty index or an existing one. - - If run on an existing index, existing items will be updated, but no - items will be deleted. 
- """ - self.stdout.write("Adding or updating all items...\n") - model = apps.get_model(self.type) - if model == Person: - q = model.objects.filter(is_alias_of=None).prefetch_related( - "positions" - ) - # Filter out non-judges -- they don't get searched. - q = [item.pk for item in q if item.is_judge] - count = len(q) - elif model == Docket: - q = Docket.objects.filter( - source__in=Docket.RECAP_SOURCES() - ).values_list("pk", flat=True) - count = q.count() - q = q.iterator() - else: - q = model.objects.values_list("pk", flat=True) - count = q.count() - q = q.iterator() - self.process_queryset(q, count) - - @print_timing - def optimize(self): - """Runs the Solr optimize command.""" - self.stdout.write("Optimizing the index...") - self.si.optimize() - self.stdout.write("done.\n") - - @print_timing - def optimize_everything(self): - """Run the optimize command on all indexes.""" - urls = set(settings.SOLR_URLS.values()) - self.stdout.write(f"Found {len(urls)} indexes. Optimizing...\n") - with Session() as session: - for url in urls: - self.stdout.write(f" - {url}\n") - try: - si = ExtraSolrInterface(url, http_connection=session) - except EnvironmentError: - self.stderr.write(" Couldn't load schema!") - continue - si.optimize() - self.stdout.write("Done.\n") From 563e15a5f335e73e109e77095c2866cd07e2f44e Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Thu, 5 Dec 2024 10:24:22 -0400 Subject: [PATCH 03/67] feat(admin): Remove custom Solr indexing methods in admin classes --- cl/people_db/admin.py | 50 -------------------------------- cl/search/admin.py | 66 ------------------------------------------- 2 files changed, 116 deletions(-) diff --git a/cl/people_db/admin.py b/cl/people_db/admin.py index 70d7e90204..18ab755a3e 100644 --- a/cl/people_db/admin.py +++ b/cl/people_db/admin.py @@ -26,7 +26,6 @@ School, Source, ) -from cl.search.tasks import add_items_to_solr, delete_items class RetentionEventInline(admin.TabularInline): @@ -50,25 +49,6 @@ class 
PositionAdmin(admin.ModelAdmin): "person__name_first", ) - def save_model( - self, - request: HttpRequest, - obj: Position, - form: ModelForm, - change: bool, - ) -> None: - obj.save() - from cl.search.tasks import add_items_to_solr - - add_items_to_solr.delay([obj.person_id], "people_db.Person") - - def delete_model(self, request: HttpRequest, obj: Position) -> None: - # Update the person to remove the position from them. - obj.delete() - from cl.search.tasks import add_items_to_solr - - add_items_to_solr.delay([obj.person_id], "people_db.Person") - class PositionInline(admin.StackedInline): model = Position @@ -153,36 +133,6 @@ class PersonAdmin(admin.ModelAdmin, AdminTweaksMixin): readonly_fields = ("has_photo",) actions = ("update_in_solr", "delete_from_solr") - def save_model(self, request, obj, form, change): - obj.save() - from cl.search.tasks import add_items_to_solr - - add_items_to_solr.delay([obj.pk], "people_db.Person") - - def delete_model(self, request, obj): - obj.delete() - from cl.search.tasks import delete_items - - delete_items.delay([obj.pk], "people_db.Person") - - @admin.action(description="Update selected people in Solr") - def update_in_solr(self, request: HttpRequest, queryset: QuerySet) -> None: - add_items_to_solr.delay([p.pk for p in queryset], "people_db.Person") - self.message_user( - request, - f"Successfully updated {queryset.count()} people in Solr", - ) - - @admin.action(description="Delete selected people from Solr") - def delete_from_solr( - self, request: HttpRequest, queryset: QuerySet - ) -> None: - delete_items.delay([p.pk for p in queryset], "people_db.Person") - self.message_user( - request, - f"Successfully deleted {queryset.count()} people from Solr", - ) - @admin.register(Race) class RaceAdmin(admin.ModelAdmin): diff --git a/cl/search/admin.py b/cl/search/admin.py index 1fbe71cfdb..0c40e787ac 100644 --- a/cl/search/admin.py +++ b/cl/search/admin.py @@ -26,7 +26,6 @@ RECAPDocument, SearchQuery, ) -from cl.search.tasks 
import add_items_to_solr @admin.register(Opinion) @@ -47,18 +46,6 @@ class OpinionAdmin(CursorPaginatorAdmin): "date_modified", ) - def save_model(self, request, obj, form, change): - obj.save() - from cl.search.tasks import add_items_to_solr - - add_items_to_solr.delay([obj.pk], "search.Opinion") - - def delete_model(self, request, obj): - obj.delete() - from cl.search.tasks import delete_items - - delete_items.delay([obj.pk], "search.Opinion") - @admin.register(Citation) class CitationAdmin(CursorPaginatorAdmin): @@ -99,12 +86,6 @@ class OpinionClusterAdmin(CursorPaginatorAdmin): "date_created", ) - def save_model(self, request, obj, form, change): - obj.save() - from cl.search.tasks import add_items_to_solr - - add_items_to_solr.delay([obj.pk], "search.OpinionCluster") - @admin.register(Court) class CourtAdmin(admin.ModelAdmin): @@ -203,11 +184,6 @@ def seal_documents(self, request: HttpRequest, queryset: QuerySet) -> None: ocr_status=None, ) - # Update solr - add_items_to_solr.delay( - [rd.pk for rd in queryset], "search.RECAPDocument" - ) - # Do a CloudFront invalidation invalidate_cloudfront([f"/{path}" for path in deleted_filepaths]) @@ -236,11 +212,6 @@ class RECAPDocumentInline(admin.StackedInline): ) raw_id_fields = ("tags",) - # Essential so that we remove sealed content from Solr when updating it via - # admin interface. 
- def save_model(self, request, obj, form, change): - obj.save(index=True) - @admin.register(DocketEntry) class DocketEntryAdmin(CursorPaginatorAdmin): @@ -289,37 +260,6 @@ class DocketAdmin(CursorPaginatorAdmin): "parent_docket", ) - def save_model( - self, - request: HttpRequest, - obj: Docket, - form: ModelForm, - change: bool, - ) -> None: - obj.save() - from cl.search.tasks import add_items_to_solr - - ids = list( - RECAPDocument.objects.filter( - docket_entry__docket_id=obj.pk, - ).values_list("id", flat=True) - ) - add_items_to_solr.delay(ids, "search.RECAPDocument") - - def delete_model(self, request: HttpRequest, obj: Docket) -> None: - # Do the query before deleting the item. Otherwise, the query returns - # nothing. - ids = list( - RECAPDocument.objects.filter( - docket_entry__docket_id=obj.pk - ).values_list("id", flat=True) - ) - - from cl.search.tasks import delete_items - - delete_items.delay(ids, "search.RECAPDocument") - obj.delete() - @admin.register(OpinionsCited) class OpinionsCitedAdmin(CursorPaginatorAdmin): @@ -329,12 +269,6 @@ class OpinionsCitedAdmin(CursorPaginatorAdmin): ) search_fields = ("=citing_opinion__id",) - def save_model(self, request, obj, form, change): - obj.save() - from cl.search.tasks import add_items_to_solr - - add_items_to_solr.delay([obj.citing_opinion_id], "search.Opinion") - @admin.register(Parenthetical) class ParentheticalAdmin(CursorPaginatorAdmin): From 5e557989aac257c139aca6d3e26d8a2919d2f33f Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Thu, 5 Dec 2024 10:24:57 -0400 Subject: [PATCH 04/67] feat(models): Removes Solr indexing from model classes --- cl/audio/models.py | 18 ------------------ cl/recap_rss/models.py | 9 +-------- cl/search/models.py | 37 ------------------------------------- 3 files changed, 1 insertion(+), 63 deletions(-) diff --git a/cl/audio/models.py b/cl/audio/models.py index f1e87d2402..992c3a311f 100644 --- a/cl/audio/models.py +++ b/cl/audio/models.py @@ -212,24 +212,6 @@ def save( # 
type: ignore[override] indexing it? """ super().save(*args, **kwargs) # type: ignore - if index: - from cl.search.tasks import add_items_to_solr - - add_items_to_solr([self.pk], "audio.Audio", force_commit) - - def delete( # type: ignore[override] - self, - *args: List, - **kwargs: Dict, - ) -> None: - """ - Update the index as items are deleted. - """ - id_cache = self.pk - super().delete(*args, **kwargs) # type: ignore - from cl.search.tasks import delete_items - - delete_items.delay([id_cache], "audio.Audio") def as_search_dict(self) -> Dict[str, Union[int, List[int], str]]: """Create a dict that can be ingested by Solr""" diff --git a/cl/recap_rss/models.py b/cl/recap_rss/models.py index 4a2375d4be..1fb7f9eb81 100644 --- a/cl/recap_rss/models.py +++ b/cl/recap_rss/models.py @@ -111,14 +111,7 @@ def reprocess_item( when doing medata only since no entries are modified). """ from cl.recap_rss.tasks import merge_rss_feed_contents - from cl.search.tasks import add_items_to_solr rss_feed = PacerRssFeed(map_cl_to_pacer_id(self.court_id)) rss_feed._parse_text(self.file_contents) - response = merge_rss_feed_contents( - rss_feed.data, self.court_id, metadata_only - ) - if index: - add_items_to_solr( - response.get("rds_for_solr", []), "search.RECAPDocument" - ) + merge_rss_feed_contents(rss_feed.data, self.court_id, metadata_only) diff --git a/cl/search/models.py b/cl/search/models.py index e4c13e116b..191900bad7 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -1635,12 +1635,7 @@ def save( from cl.scrapers.tasks import extract_recap_pdf tasks.append(extract_recap_pdf.si(self.pk)) - if index: - from cl.search.tasks import add_items_to_solr - tasks.append( - add_items_to_solr.si([self.pk], "search.RECAPDocument") - ) if len(tasks) > 0: chain(*tasks)() @@ -1680,17 +1675,6 @@ def clean(self): logger.error(msg) raise ValidationError({"attachment_number": msg}) - def delete(self, *args, **kwargs): - """ - Note that this doesn't get called when an entire queryset 
- is deleted, but that should be OK. - """ - id_cache = self.pk - super().delete(*args, **kwargs) - from cl.search.tasks import delete_items - - delete_items.delay([id_cache], "search.RECAPDocument") - def get_docket_metadata(self): """The metadata for the item that comes from the Docket.""" docket = self.docket_entry.docket @@ -3084,12 +3068,6 @@ def save( if update_fields is not None: update_fields = {"slug"}.union(update_fields) super().save(update_fields=update_fields, *args, **kwargs) - if index: - from cl.search.tasks import add_items_to_solr - - add_items_to_solr.delay( - [self.pk], "search.OpinionCluster", force_commit - ) async def asave( self, @@ -3107,17 +3085,6 @@ async def asave( **kwargs, ) - def delete(self, *args, **kwargs): - """ - Note that this doesn't get called when an entire queryset - is deleted, but that should be OK. - """ - id_cache = self.pk - super().delete(*args, **kwargs) - from cl.search.tasks import delete_items - - delete_items.delay([id_cache], "search.Opinion") - def as_search_list(self): # IDs out = {} @@ -3589,10 +3556,6 @@ def save( ) -> None: self.clean() super().save(*args, **kwargs) - if index: - from cl.search.tasks import add_items_to_solr - - add_items_to_solr.delay([self.pk], "search.Opinion", force_commit) def as_search_dict(self) -> Dict[str, Any]: """Create a dict that can be ingested by Solr.""" From 3aa77f8d96c9c940aa9c6d07178e2f3f75b09d4d Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Thu, 5 Dec 2024 10:32:41 -0400 Subject: [PATCH 05/67] feat(corpus_importer): Removes Solr indexing logic from commands --- cl/corpus_importer/management/commands/760_project.py | 3 --- cl/corpus_importer/management/commands/adelman_david.py | 3 --- cl/corpus_importer/management/commands/anon_2020_import.py | 4 ---- cl/corpus_importer/management/commands/buchwald_project.py | 2 -- .../management/commands/buried_alive_project.py | 2 -- cl/corpus_importer/management/commands/everything_project.py | 2 -- 
cl/corpus_importer/management/commands/harvard_opinions.py | 4 ---- cl/corpus_importer/management/commands/import_patent.py | 3 --- cl/corpus_importer/management/commands/invoice_project.py | 2 -- cl/corpus_importer/management/commands/jackson_project.py | 2 -- cl/corpus_importer/management/commands/kessler_ilnb.py | 5 ----- cl/corpus_importer/management/commands/legal_robot.py | 2 -- cl/corpus_importer/management/commands/nos_700.py | 2 -- cl/corpus_importer/management/commands/nywb_chapter_7.py | 2 -- .../management/commands/scrape_pacer_free_opinions.py | 4 +--- cl/corpus_importer/management/commands/troller_bk.py | 3 --- 16 files changed, 1 insertion(+), 44 deletions(-) diff --git a/cl/corpus_importer/management/commands/760_project.py b/cl/corpus_importer/management/commands/760_project.py index b31c3a810c..90ecf391b7 100644 --- a/cl/corpus_importer/management/commands/760_project.py +++ b/cl/corpus_importer/management/commands/760_project.py @@ -15,7 +15,6 @@ from cl.lib.command_utils import VerboseCommand, logger from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.search.models import Court, RECAPDocument -from cl.search.tasks import add_or_update_recap_docket PACER_USERNAME = os.environ.get("PACER_USERNAME", settings.PACER_USERNAME) PACER_PASSWORD = os.environ.get("PACER_PASSWORD", settings.PACER_PASSWORD) @@ -68,7 +67,6 @@ def get_dockets(options): "show_caption": True, }, ).set(queue=q), - add_or_update_recap_docket.s().set(queue=q), ).apply_async() elif task == "district": chain( @@ -89,7 +87,6 @@ def get_dockets(options): "show_list_of_member_cases": True, }, ).set(queue=q), - add_or_update_recap_docket.s().set(queue=q), ).apply_async() diff --git a/cl/corpus_importer/management/commands/adelman_david.py b/cl/corpus_importer/management/commands/adelman_david.py index 25aa72db2f..4e0e8f72b1 100644 --- a/cl/corpus_importer/management/commands/adelman_david.py +++ b/cl/corpus_importer/management/commands/adelman_david.py @@ -13,7 +13,6 
@@ from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import CommandUtils, VerboseCommand, logger from cl.lib.pacer_session import ProxyPacerSession, SessionData -from cl.search.tasks import add_or_update_recap_docket PACER_USERNAME = os.environ.get("PACER_USERNAME", settings.PACER_USERNAME) PACER_PASSWORD = os.environ.get("PACER_PASSWORD", settings.PACER_PASSWORD) @@ -64,7 +63,6 @@ def download_dockets(options): "show_caption": True, }, ).set(queue=q), - add_or_update_recap_docket.s().set(queue=q), ).apply_async() else: chain( @@ -93,7 +91,6 @@ def download_dockets(options): "show_list_of_member_cases": True, }, ).set(queue=q), - add_or_update_recap_docket.s().set(queue=q), ).apply_async() f.close() diff --git a/cl/corpus_importer/management/commands/anon_2020_import.py b/cl/corpus_importer/management/commands/anon_2020_import.py index 55191317e5..e6c3ad6ce3 100644 --- a/cl/corpus_importer/management/commands/anon_2020_import.py +++ b/cl/corpus_importer/management/commands/anon_2020_import.py @@ -17,7 +17,6 @@ from cl.lib.command_utils import VerboseCommand, logger from cl.lib.string_utils import trunc from cl.search.models import SOURCES, Citation, Docket, Opinion, OpinionCluster -from cl.search.tasks import add_items_to_solr HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan") @@ -428,9 +427,6 @@ def import_anon_2020_db( court_id, ) - if make_searchable and docket: - add_items_to_solr.delay([docket.pk], "search.Docket") - class Command(VerboseCommand): help = "Import anon 2020 DB." 
diff --git a/cl/corpus_importer/management/commands/buchwald_project.py b/cl/corpus_importer/management/commands/buchwald_project.py index ba10538152..3bb5f2f292 100644 --- a/cl/corpus_importer/management/commands/buchwald_project.py +++ b/cl/corpus_importer/management/commands/buchwald_project.py @@ -15,7 +15,6 @@ from cl.lib.command_utils import VerboseCommand, logger from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.search.models import Docket -from cl.search.tasks import add_or_update_recap_docket PACER_USERNAME = os.environ.get("PACER_USERNAME", settings.PACER_USERNAME) PACER_PASSWORD = os.environ.get("PACER_PASSWORD", settings.PACER_PASSWORD) @@ -115,7 +114,6 @@ def get_dockets(options): "show_list_of_member_cases": False, }, ).set(queue=q), - add_or_update_recap_docket.s().set(queue=q), ).apply_async() diff --git a/cl/corpus_importer/management/commands/buried_alive_project.py b/cl/corpus_importer/management/commands/buried_alive_project.py index d81a4d2185..972ed405f0 100644 --- a/cl/corpus_importer/management/commands/buried_alive_project.py +++ b/cl/corpus_importer/management/commands/buried_alive_project.py @@ -11,7 +11,6 @@ from cl.lib.scorched_utils import ExtraSolrInterface from cl.lib.search_utils import build_main_query_from_query_string from cl.search.models import Docket -from cl.search.tasks import add_or_update_recap_docket # Do not order by score! 
QUERY_STRING = "q=entry_date_filed%3A%5B2018-05-01T00%3A00%3A00Z+TO+*%5D&type=r&order_by=dateFiled+asc&description=%22Vacat*%22+AND+2255+AND+%22Granted%22+NOT+%22Denied%22+NOT+%22Dismiss*%22&court=dcd+almd+alnd+alsd+akd+azd+ared+arwd+cacd+caed+cand+casd+cod+ctd+ded+flmd+flnd+flsd+gamd+gand+gasd+hid+idd+ilcd+ilnd+ilsd+innd+insd+iand+iasd+ksd+kyed+kywd+laed+lamd+lawd+med+mdd+mad+mied+miwd+mnd+msnd+mssd+moed+mowd+mtd+ned+nvd+nhd+njd+nmd+nyed+nynd+nysd+nywd+nced+ncmd+ncwd+ndd+ohnd+ohsd+oked+oknd+okwd+ord+paed+pamd+pawd+rid+scd+sdd+tned+tnmd+tnwd+txed+txnd+txsd+txwd+utd+vtd+vaed+vawd+waed+wawd+wvnd+wvsd+wied+wiwd+wyd+gud+nmid+prd+vid" @@ -75,7 +74,6 @@ def get_pacer_dockets(options, docket_pks, tags): "show_list_of_member_cases": False, }, ).set(queue=q), - add_or_update_recap_docket.s().set(queue=q), ).apply_async() diff --git a/cl/corpus_importer/management/commands/everything_project.py b/cl/corpus_importer/management/commands/everything_project.py index b48dd4a008..bbe84f8daf 100644 --- a/cl/corpus_importer/management/commands/everything_project.py +++ b/cl/corpus_importer/management/commands/everything_project.py @@ -31,7 +31,6 @@ SOCIAL_SECURITY, ) from cl.recap.models import FjcIntegratedDatabase -from cl.search.tasks import add_or_update_recap_docket PACER_USERNAME = os.environ.get("PACER_USERNAME", settings.PACER_USERNAME) PACER_PASSWORD = os.environ.get("PACER_PASSWORD", settings.PACER_PASSWORD) @@ -157,7 +156,6 @@ def get_dockets(options, items, tags, sample_size=0, doc_num_end=""): "doc_num_end": doc_num_end, }, ).set(queue=q), - add_or_update_recap_docket.s().set(queue=q), ).apply_async() diff --git a/cl/corpus_importer/management/commands/harvard_opinions.py b/cl/corpus_importer/management/commands/harvard_opinions.py index a3206f8639..01a9454ea1 100644 --- a/cl/corpus_importer/management/commands/harvard_opinions.py +++ b/cl/corpus_importer/management/commands/harvard_opinions.py @@ -32,7 +32,6 @@ from cl.people_db.lookup_utils import 
extract_judge_last_name from cl.scrapers.utils import update_or_create_docket from cl.search.models import SOURCES, Court, Docket, Opinion, OpinionCluster -from cl.search.tasks import add_items_to_solr HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan") @@ -549,9 +548,6 @@ def add_new_case( ) new_op_pks = add_opinions(soup, cluster.id, citation) - if make_searchable: - add_items_to_solr.delay(new_op_pks, "search.Opinion") - logger.info("Finished: %s", citation.corrected_citation()) logger.info( f"Finished adding case at https://www.courtlistener.com/opinion/{cluster.id}/{cluster.slug}" diff --git a/cl/corpus_importer/management/commands/import_patent.py b/cl/corpus_importer/management/commands/import_patent.py index b6956f0406..045654a0d6 100644 --- a/cl/corpus_importer/management/commands/import_patent.py +++ b/cl/corpus_importer/management/commands/import_patent.py @@ -15,7 +15,6 @@ from cl.recap.constants import PATENT, PATENT_ANDA from cl.recap.models import FjcIntegratedDatabase from cl.search.models import Docket -from cl.search.tasks import add_or_update_recap_docket PACER_USERNAME = os.environ.get("PACER_USERNAME", settings.PACER_USERNAME) PACER_PASSWORD = os.environ.get("PACER_PASSWORD", settings.PACER_PASSWORD) @@ -92,7 +91,6 @@ def get_dockets(options: dict) -> None: "doc_num_end": "", # No end doc num }, ).set(queue=q), - add_or_update_recap_docket.s().set(queue=q), ).apply_async() else: d = dockets[0] @@ -110,7 +108,6 @@ def get_dockets(options: dict) -> None: "show_list_of_member_cases": False, }, ).set(queue=q), - add_or_update_recap_docket.s().set(queue=q), ).apply_async() diff --git a/cl/corpus_importer/management/commands/invoice_project.py b/cl/corpus_importer/management/commands/invoice_project.py index 1a48d80f25..f1a59b4e68 100644 --- a/cl/corpus_importer/management/commands/invoice_project.py +++ b/cl/corpus_importer/management/commands/invoice_project.py @@ -20,7 +20,6 @@ from cl.recap.tasks import process_recap_attachment from 
cl.scrapers.tasks import extract_recap_pdf from cl.search.models import RECAPDocument -from cl.search.tasks import add_items_to_solr PACER_USERNAME = os.environ["PACER_USERNAME"] PACER_PASSWORD = os.environ["PACER_PASSWORD"] @@ -157,7 +156,6 @@ def get_documents(options): tag=TAG_PHASE_2, ).set(queue=q), extract_recap_pdf.si(rd.pk).set(queue=q), - add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q), ).apply_async() i += 1 diff --git a/cl/corpus_importer/management/commands/jackson_project.py b/cl/corpus_importer/management/commands/jackson_project.py index d5afc22f02..97ca50d29b 100644 --- a/cl/corpus_importer/management/commands/jackson_project.py +++ b/cl/corpus_importer/management/commands/jackson_project.py @@ -8,7 +8,6 @@ from cl.lib.command_utils import VerboseCommand, logger from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.search.models import Docket -from cl.search.tasks import add_or_update_recap_docket PACER_USERNAME = os.environ.get("PACER_USERNAME", settings.PACER_USERNAME) PACER_PASSWORD = os.environ.get("PACER_PASSWORD", settings.PACER_PASSWORD) @@ -52,7 +51,6 @@ def get_dockets(options): "show_list_of_member_cases": False, }, ).set(queue=q), - add_or_update_recap_docket.s().set(queue=q), ).apply_async() diff --git a/cl/corpus_importer/management/commands/kessler_ilnb.py b/cl/corpus_importer/management/commands/kessler_ilnb.py index 2c16d3c5d2..9a84767722 100644 --- a/cl/corpus_importer/management/commands/kessler_ilnb.py +++ b/cl/corpus_importer/management/commands/kessler_ilnb.py @@ -19,7 +19,6 @@ from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.scrapers.tasks import extract_recap_pdf from cl.search.models import DocketEntry, RECAPDocument -from cl.search.tasks import add_items_to_solr, add_or_update_recap_docket PACER_USERNAME = os.environ.get("PACER_USERNAME", settings.PACER_USERNAME) PACER_PASSWORD = os.environ.get("PACER_PASSWORD", settings.PACER_PASSWORD) @@ -77,7 +76,6 @@ def 
get_dockets(options): "show_list_of_member_cases": True, }, ).set(queue=q), - add_or_update_recap_docket.s().set(queue=q), ).apply_async() @@ -128,9 +126,6 @@ def get_final_docs(options): tag=TAG_FINALS, ).set(queue=q), extract_recap_pdf.si(rd_pk).set(queue=q), - add_items_to_solr.si([rd_pk], "search.RECAPDocument").set( - queue=q - ), ).apply_async() diff --git a/cl/corpus_importer/management/commands/legal_robot.py b/cl/corpus_importer/management/commands/legal_robot.py index c435e5780b..8c9ec4ab8e 100644 --- a/cl/corpus_importer/management/commands/legal_robot.py +++ b/cl/corpus_importer/management/commands/legal_robot.py @@ -12,7 +12,6 @@ from cl.lib.search_utils import build_main_query_from_query_string from cl.scrapers.tasks import extract_recap_pdf from cl.search.models import RECAPDocument -from cl.search.tasks import add_items_to_solr PACER_USERNAME = os.environ["PACER_USERNAME"] PACER_PASSWORD = os.environ["PACER_PASSWORD"] @@ -85,7 +84,6 @@ def get_documents(options): tag=TAG, ).set(queue=q), extract_recap_pdf.si(rd.pk).set(queue=q), - add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q), ).apply_async() diff --git a/cl/corpus_importer/management/commands/nos_700.py b/cl/corpus_importer/management/commands/nos_700.py index b95c663891..6d383ffe95 100644 --- a/cl/corpus_importer/management/commands/nos_700.py +++ b/cl/corpus_importer/management/commands/nos_700.py @@ -77,7 +77,6 @@ ) from cl.recap.models import FjcIntegratedDatabase from cl.search.models import RECAPDocument -from cl.search.tasks import add_or_update_recap_docket PACER_USERNAME = os.environ.get("PACER_USERNAME", settings.PACER_USERNAME) PACER_PASSWORD = os.environ.get("PACER_PASSWORD", settings.PACER_PASSWORD) @@ -272,7 +271,6 @@ def get_dockets(options, items, tags, sample_size=0): "show_list_of_member_cases": True, }, ).set(queue=q), - add_or_update_recap_docket.s().set(queue=q), ).apply_async() diff --git a/cl/corpus_importer/management/commands/nywb_chapter_7.py 
b/cl/corpus_importer/management/commands/nywb_chapter_7.py index 72aaa914c7..9fa791a63c 100644 --- a/cl/corpus_importer/management/commands/nywb_chapter_7.py +++ b/cl/corpus_importer/management/commands/nywb_chapter_7.py @@ -15,7 +15,6 @@ from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger from cl.lib.pacer_session import ProxyPacerSession, SessionData -from cl.search.tasks import add_or_update_recap_docket PACER_USERNAME = os.environ.get("PACER_USERNAME", "UNKNOWN!") PACER_PASSWORD = os.environ.get("PACER_PASSWORD", "UNKNOWN!") @@ -75,7 +74,6 @@ def get_dockets(options): "show_list_of_member_cases": False, }, ).set(queue=q), - add_or_update_recap_docket.s().set(queue=q), ).apply_async() diff --git a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py index 08b2de837d..5d7e28c94a 100644 --- a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py +++ b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py @@ -32,7 +32,7 @@ from cl.scrapers.models import PACERFreeDocumentLog, PACERFreeDocumentRow from cl.scrapers.tasks import extract_recap_pdf from cl.search.models import Court, RECAPDocument -from cl.search.tasks import add_docket_to_solr_by_rds, add_items_to_solr +from cl.search.tasks import add_docket_to_solr_by_rds def get_last_complete_date( @@ -351,8 +351,6 @@ def get_pdfs( delete_pacer_row.s(row.pk).set(queue=q), ) - if index: - c = c | add_items_to_solr.s("search.RECAPDocument").set(queue=q) c.apply_async() completed += 1 if completed % 1000 == 0: diff --git a/cl/corpus_importer/management/commands/troller_bk.py b/cl/corpus_importer/management/commands/troller_bk.py index 6da419151a..054d9e4682 100644 --- a/cl/corpus_importer/management/commands/troller_bk.py +++ b/cl/corpus_importer/management/commands/troller_bk.py @@ -41,7 +41,6 @@ is_cached, ) from cl.search.models import Court, Docket, 
DocketEntry, RECAPDocument -from cl.search.tasks import add_items_to_solr FILES_BUFFER_THRESHOLD = 3 @@ -611,8 +610,6 @@ def iterate_and_import_files( feed_data, court_id, build_date ) - add_items_to_solr.delay(rds_for_solr, "search.RECAPDocument") - total_dockets_created += dockets_created total_rds_created += len(rds_for_solr) From b210e9389269d79fcad1084dd4d5ffc874c00423 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Thu, 5 Dec 2024 10:33:17 -0400 Subject: [PATCH 06/67] feat(corpus_importer): Removes Solr indexing logic from utils --- cl/corpus_importer/bulk_utils.py | 2 -- cl/corpus_importer/task_canvases.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/cl/corpus_importer/bulk_utils.py b/cl/corpus_importer/bulk_utils.py index 66e45fdc86..230a62d6e8 100644 --- a/cl/corpus_importer/bulk_utils.py +++ b/cl/corpus_importer/bulk_utils.py @@ -11,7 +11,6 @@ from cl.lib.search_utils import build_main_query_from_query_string from cl.scrapers.tasks import extract_recap_pdf from cl.search.models import RECAPDocument -from cl.search.tasks import add_items_to_solr def docket_pks_for_query(query_string): @@ -101,5 +100,4 @@ def get_petitions( tag=tag_petitions, ).set(queue=q), extract_recap_pdf.si(rd_pk).set(queue=q), - add_items_to_solr.si([rd_pk], "search.RECAPDocument").set(queue=q), ).apply_async() diff --git a/cl/corpus_importer/task_canvases.py b/cl/corpus_importer/task_canvases.py index 01ace71b32..58da086029 100644 --- a/cl/corpus_importer/task_canvases.py +++ b/cl/corpus_importer/task_canvases.py @@ -15,7 +15,6 @@ ) from cl.lib.celery_utils import CeleryThrottle from cl.recap.tasks import process_recap_attachment -from cl.search.tasks import add_or_update_recap_docket def get_docket_and_claims( @@ -46,7 +45,6 @@ def get_docket_and_claims( get_bankr_claims_registry.s( session_data=cookies_data, tag_names=tags ).set(queue=q), - add_or_update_recap_docket.s().set(queue=q), ).apply_async() From 6a8e2c9087a7232108bea6fe2c26f16fa7e76158 Mon Sep 17 00:00:00 2001 
From: Eduardo Rosendo Date: Thu, 5 Dec 2024 10:33:35 -0400 Subject: [PATCH 07/67] feat(recap): Removes Solr indexing logic from commands --- cl/recap/management/commands/reprocess_recap_dockets.py | 2 -- cl/recap_rss/management/commands/scrape_rss.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/cl/recap/management/commands/reprocess_recap_dockets.py b/cl/recap/management/commands/reprocess_recap_dockets.py index 3f19a29193..87d12ab97f 100644 --- a/cl/recap/management/commands/reprocess_recap_dockets.py +++ b/cl/recap/management/commands/reprocess_recap_dockets.py @@ -9,7 +9,6 @@ from cl.lib.command_utils import VerboseCommand from cl.scrapers.tasks import extract_recap_pdf from cl.search.models import Docket, RECAPDocument -from cl.search.tasks import add_items_to_solr def extract_unextracted_rds_and_add_to_solr(queue: str) -> None: @@ -44,7 +43,6 @@ def extract_unextracted_rds_and_add_to_solr(queue: str) -> None: throttle.maybe_wait() chain( extract_recap_pdf.si(chunk).set(queue=queue), - add_items_to_solr.s("search.RECAPDocument").set(queue=queue), ).apply_async() chunk = [] sys.stdout.write( diff --git a/cl/recap_rss/management/commands/scrape_rss.py b/cl/recap_rss/management/commands/scrape_rss.py index 69bbdbc257..1140080aa5 100644 --- a/cl/recap_rss/management/commands/scrape_rss.py +++ b/cl/recap_rss/management/commands/scrape_rss.py @@ -16,7 +16,6 @@ trim_rss_data, ) from cl.search.models import Court -from cl.search.tasks import add_items_to_solr class Command(VerboseCommand): @@ -155,7 +154,6 @@ def handle(self, *args, **options): # docket information from the RSS feeds. RSS feeds also # have information about hundreds or thousands of # dockets. Updating them all would be very bad. 
- add_items_to_solr.s("search.RECAPDocument"), mark_status_successful.si(new_status.pk), ).apply_async() From 61c921338815e96f977deb4fa8ce0709ecae3dcf Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Thu, 5 Dec 2024 10:33:51 -0400 Subject: [PATCH 08/67] feat(recap): Removes Solr indexing logic from mergers --- cl/recap/mergers.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py index 7eb9866e1d..6fd3f6bf4a 100644 --- a/cl/recap/mergers.py +++ b/cl/recap/mergers.py @@ -56,7 +56,7 @@ RECAPDocument, Tag, ) -from cl.search.tasks import add_items_to_solr, index_docket_parties_in_es +from cl.search.tasks import index_docket_parties_in_es logger = logging.getLogger(__name__) @@ -1903,9 +1903,6 @@ async def merge_attachment_page_data( pass await rd.asave() - # Do *not* do this async — that can cause race conditions. - await sync_to_async(add_items_to_solr)([rd.pk], "search.RECAPDocument") - if not is_acms_attachment: await clean_duplicate_attachment_entries(de, attachment_dicts) await mark_ia_upload_needed(de.docket, save_docket=True) @@ -1951,8 +1948,6 @@ def save_iquery_to_docket( raise self.retry(exc=exc) async_to_sync(add_tags_to_objs)(tag_names, [d]) - if add_to_solr: - add_items_to_solr([d.pk], "search.Docket") logger.info(f"Created/updated docket: {d}") # Add the CASE_QUERY_PAGE to the docket in case we need it someday. 
@@ -2042,7 +2037,6 @@ def process_case_query_report( d.avoid_trigger_signal = avoid_trigger_signal d.save() add_bankruptcy_data_to_docket(d, report_data) - add_items_to_solr([d.pk], "search.Docket") logger.info( f"Created/updated docket: {d} from court: {court_id} and pacer_case_id {pacer_case_id}" ) From 29d5b4465966ba998932d6840882b9ba70a2e906 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Thu, 5 Dec 2024 10:34:12 -0400 Subject: [PATCH 09/67] feat(tasks): Remove Solr indexing logic from task execution --- cl/citations/tasks.py | 12 +-- cl/corpus_importer/tasks.py | 6 -- cl/recap/tasks.py | 14 +--- cl/search/tasks.py | 145 ------------------------------------ 4 files changed, 2 insertions(+), 175 deletions(-) diff --git a/cl/citations/tasks.py b/cl/citations/tasks.py index 335af6a04f..189f2a3bbf 100644 --- a/cl/citations/tasks.py +++ b/cl/citations/tasks.py @@ -32,7 +32,7 @@ Parenthetical, RECAPDocument, ) -from cl.search.tasks import add_items_to_solr, index_related_cites_fields +from cl.search.tasks import index_related_cites_fields # This is the distance two reporter abbreviations can be from each other if # they are considered parallel reporters. For example, @@ -128,11 +128,6 @@ def find_citations_and_parentheticals_for_opinion_by_pks( # Threading problem in httplib, which is used in the Solr query. 
raise self.retry(exc=e, countdown=2) - # If a Solr update was requested, do a single one at the end with all the - # pks of the passed opinions - if index: - add_items_to_solr.delay(opinion_pks, "search.Opinion") - def store_opinion_citations_and_update_parentheticals( opinion: Opinion, index: bool @@ -224,11 +219,6 @@ def store_opinion_citations_and_update_parentheticals( citation_count=F("citation_count") + 1 ) - if index: - add_items_to_solr.delay( - opinion_clusters_to_update.values_list("pk", flat=True), - "search.OpinionCluster", - ) # Nuke existing citations and parentheticals OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete() Parenthetical.objects.filter(describing_opinion_id=opinion.pk).delete() diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index bfa21e43b5..fdea523675 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -127,7 +127,6 @@ RECAPDocument, Tag, ) -from cl.search.tasks import add_items_to_solr HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan") @@ -2441,7 +2440,6 @@ def get_pacer_doc_by_rd_and_description( # Skip OCR for now. It'll happen in a second step. 
async_to_sync(extract_recap_pdf_base)(rd.pk, ocr_available=False) - add_items_to_solr([rd.pk], "search.RECAPDocument") @app.task( @@ -2835,10 +2833,6 @@ def recap_document_into_opinions( extracted_by_ocr=r["extracted_by_ocr"], ) - if add_to_solr: - # Add opinions to solr - add_items_to_solr.delay([opinion.id], "search.Opinion") - logger.info( "Successfully imported https://www.courtlistener.com/opinion/{}/decision/".format( cluster.id diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index aad67f138f..d4780763d4 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -95,11 +95,7 @@ ) from cl.scrapers.tasks import extract_recap_pdf, extract_recap_pdf_base from cl.search.models import Court, Docket, DocketEntry, RECAPDocument -from cl.search.tasks import ( - add_items_to_solr, - add_or_update_recap_docket, - index_docket_parties_in_es, -) +from cl.search.tasks import index_docket_parties_in_es logger = logging.getLogger(__name__) cnt = CaseNameTweaker() @@ -113,17 +109,14 @@ async def process_recap_upload(pq: ProcessingQueue) -> None: """ if pq.upload_type == UPLOAD_TYPE.DOCKET: docket = await process_recap_docket(pq.pk) - await sync_to_async(add_or_update_recap_docket.delay)(docket) elif pq.upload_type == UPLOAD_TYPE.ATTACHMENT_PAGE: await process_recap_attachment(pq.pk) elif pq.upload_type == UPLOAD_TYPE.PDF: await process_recap_pdf(pq.pk) elif pq.upload_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT: docket = await process_recap_docket_history_report(pq.pk) - await sync_to_async(add_or_update_recap_docket.delay)(docket) elif pq.upload_type == UPLOAD_TYPE.APPELLATE_DOCKET: docket = await process_recap_appellate_docket(pq.pk) - await sync_to_async(add_or_update_recap_docket.delay)(docket) elif pq.upload_type == UPLOAD_TYPE.APPELLATE_ATTACHMENT_PAGE: await process_recap_appellate_attachment(pq.pk) elif pq.upload_type == UPLOAD_TYPE.CLAIMS_REGISTER: @@ -132,7 +125,6 @@ async def process_recap_upload(pq: ProcessingQueue) -> None: await process_recap_zip(pq.pk) elif 
pq.upload_type == UPLOAD_TYPE.CASE_QUERY_PAGE: docket = await process_case_query_page(pq.pk) - await sync_to_async(add_or_update_recap_docket.delay)(docket) elif pq.upload_type == UPLOAD_TYPE.APPELLATE_CASE_QUERY_PAGE: await sync_to_async(process_recap_appellate_case_query_page)(pq.pk) elif pq.upload_type == UPLOAD_TYPE.CASE_QUERY_RESULT_PAGE: @@ -158,7 +150,6 @@ def do_pacer_fetch(fq: PacerFetchQueue): # Request by docket_id c = chain( fetch_docket.si(fq.pk), - add_or_update_recap_docket.s(), mark_fq_successful.si(fq.pk), ) result = c.apply_async() @@ -168,7 +159,6 @@ def do_pacer_fetch(fq: PacerFetchQueue): result = chain( fetch_pacer_doc_by_rd.si(rd_pk, fq.pk), extract_recap_pdf.si(rd_pk), - add_items_to_solr.si([rd_pk], "search.RECAPDocument"), mark_fq_successful.si(fq.pk), ).apply_async() elif fq.request_type == REQUEST_TYPE.ATTACHMENT_PAGE: @@ -438,7 +428,6 @@ async def process_recap_pdf(pk): await sync_to_async( chain( extract_recap_pdf.si(rd.pk), - add_items_to_solr.s("search.RECAPDocument"), ).apply_async )() @@ -2791,5 +2780,4 @@ def do_recap_document_fetch(epq: EmailProcessingQueue, user: User) -> None: return chain( process_recap_email.si(epq.pk, user.pk), extract_recap_pdf.s(), - add_items_to_solr.s("search.RECAPDocument"), ).apply_async() diff --git a/cl/search/tasks.py b/cl/search/tasks.py index e2522036bf..deccc77fa9 100644 --- a/cl/search/tasks.py +++ b/cl/search/tasks.py @@ -83,151 +83,6 @@ es_document_module = import_module("cl.search.documents") -@app.task -def add_items_to_solr(item_pks, app_label, force_commit=False): - """Add a list of items to Solr - - :param item_pks: An iterable list of item PKs that you wish to add to Solr. - :param app_label: The type of item that you are adding. - :param force_commit: Whether to send a commit to Solr after your addition. - This is generally not advised and is mostly used for testing. 
- """ - search_dicts = [] - model = apps.get_model(app_label) - items = model.objects.filter(pk__in=item_pks).order_by() - for item in items: - try: - if model in [OpinionCluster, Docket]: - # Dockets make a list of items; extend, don't append - search_dicts.extend(item.as_search_list()) - else: - search_dicts.append(item.as_search_dict()) - except AttributeError as e: - print(f"AttributeError trying to add: {item}\n {e}") - except ValueError as e: - print(f"ValueError trying to add: {item}\n {e}") - except InvalidDocumentError: - print(f"Unable to parse: {item}") - - with Session() as session: - si = scorched.SolrInterface( - settings.SOLR_URLS[app_label], http_connection=session, mode="w" - ) - try: - si.add(search_dicts) - if force_commit: - si.commit() - except (socket.error, SolrError) as exc: - add_items_to_solr.retry(exc=exc, countdown=30) - else: - # Mark dockets as updated if needed - if model == Docket: - items.update(date_modified=now(), date_last_index=now()) - - -@app.task(ignore_resutls=True) -def add_or_update_recap_docket( - data, force_commit=False, update_threshold=60 * 60 -): - """Add an entire docket to Solr or update it if it's already there. - - This is an expensive operation because to add or update a RECAP docket in - Solr means updating every document that's a part of it. So if a docket has - 10,000 documents, we'll have to pull them *all* from the database, and - re-index them all. It'd be nice to not have to do this, but because Solr is - de-normalized, every document in the RECAP Solr index has a copy of every - field in Solr. For example, if the name of the case changes, that has to get - reflected in every document in the docket in Solr. - - To deal with this mess, we have a field on the docket that says when we last - updated it in Solr. If that date is after a threshold, we just don't do the - update unless we know the docket has something new. 
- - :param data: A dictionary containing the a key for 'docket_pk' and - 'content_updated'. 'docket_pk' will be used to find the docket to modify. - 'content_updated' is a boolean indicating whether the docket must be - updated. - :param force_commit: Whether to send a commit to Solr (this is usually not - needed). - :param update_threshold: Items staler than this number of seconds will be - updated. Items fresher than this number will be a no-op. - """ - if data is None: - return - - with Session() as session: - si = scorched.SolrInterface( - settings.SOLR_RECAP_URL, http_connection=session, mode="w" - ) - some_time_ago = now() - timedelta(seconds=update_threshold) - d = Docket.objects.get(pk=data["docket_pk"]) - too_fresh = d.date_last_index is not None and ( - d.date_last_index > some_time_ago - ) - update_not_required = not data.get("content_updated", False) - if all([too_fresh, update_not_required]): - return - else: - try: - si.add(d.as_search_list()) - if force_commit: - si.commit() - except SolrError as exc: - add_or_update_recap_docket.retry(exc=exc, countdown=30) - else: - d.date_last_index = now() - d.save() - - -@app.task -def add_docket_to_solr_by_rds(item_pks, force_commit=False): - """Add RECAPDocuments from a single Docket to Solr. - - This is a performance enhancement that can be used when adding many RECAP - Documents from a single docket to Solr. Instead of pulling the same docket - metadata for these items over and over (adding potentially thousands of - queries on a large docket), just pull the metadata once and cache it for - every document that's added. - - :param item_pks: RECAPDocument pks to add or update in Solr. - :param force_commit: Whether to send a commit to Solr (this is usually not - needed). 
- :return: None - """ - with Session() as session: - si = scorched.SolrInterface( - settings.SOLR_RECAP_URL, http_connection=session, mode="w" - ) - rds = RECAPDocument.objects.filter(pk__in=item_pks).order_by() - try: - metadata = rds[0].get_docket_metadata() - except IndexError: - metadata = None - - try: - si.add( - [item.as_search_dict(docket_metadata=metadata) for item in rds] - ) - if force_commit: - si.commit() - except SolrError as exc: - add_docket_to_solr_by_rds.retry(exc=exc, countdown=30) - - -@app.task -def delete_items(items, app_label, force_commit=False): - with Session() as session: - si = scorched.SolrInterface( - settings.SOLR_URLS[app_label], http_connection=session, mode="w" - ) - try: - si.delete_by_ids(list(items)) - if force_commit: - si.commit() - except SolrError as exc: - delete_items.retry(exc=exc, countdown=30) - - def person_first_time_indexing(parent_id: int, position: Position) -> None: """Index a person and their no judiciary positions into Elasticsearch. 
From 7f9664b1b51e303754926d6c50005be5f7b86e96 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Thu, 5 Dec 2024 11:29:37 -0400 Subject: [PATCH 10/67] feat(stats): Removes Solr health check --- cl/stats/utils.py | 11 ----------- cl/stats/views.py | 3 --- 2 files changed, 14 deletions(-) diff --git a/cl/stats/utils.py b/cl/stats/utils.py index e8dd59f1cf..41dc88e0cf 100644 --- a/cl/stats/utils.py +++ b/cl/stats/utils.py @@ -91,17 +91,6 @@ def check_postgresql() -> bool: return True -def check_solr() -> bool: - """Check if we can connect to Solr""" - s = requests.Session() - for domain in {settings.SOLR_HOST, settings.SOLR_RECAP_HOST}: - try: - s.get(f"{domain}/solr/admin/ping?wt=json", timeout=2) - except ConnectionError: - return False - return True - - def get_replication_statuses() -> dict[str, list[dict[str, str | int]]]: """Return the replication status information for all publishers diff --git a/cl/stats/views.py b/cl/stats/views.py index 2dd0581dac..56ef4e5ff9 100644 --- a/cl/stats/views.py +++ b/cl/stats/views.py @@ -8,7 +8,6 @@ from cl.stats.utils import ( check_postgresql, check_redis, - check_solr, get_replication_statuses, ) @@ -17,7 +16,6 @@ def health_check(request: HttpRequest) -> JsonResponse: """Check if we can connect to various services.""" is_redis_up = check_redis() is_postgresql_up = check_postgresql() - is_solr_up = check_solr() status = HTTPStatus.OK if not all([is_redis_up, is_postgresql_up, is_solr_up]): @@ -25,7 +23,6 @@ def health_check(request: HttpRequest) -> JsonResponse: return JsonResponse( { - "is_solr_up": is_solr_up, "is_postgresql_up": is_postgresql_up, "is_redis_up": is_redis_up, }, From 2fe31930ee4a965fd2ade43248de2c1c87604afe Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 17:42:24 -0400 Subject: [PATCH 11/67] feat(settings): Clean up SOLR env vars --- cl/settings/project/search.py | 44 ----------------------------------- 1 file changed, 44 deletions(-) diff --git a/cl/settings/project/search.py 
b/cl/settings/project/search.py index e580108dee..cd44ec9534 100644 --- a/cl/settings/project/search.py +++ b/cl/settings/project/search.py @@ -4,50 +4,6 @@ env = environ.FileAwareEnv() -SOLR_HOST = env("SOLR_HOST", default="http://cl-solr:8983") -SOLR_RECAP_HOST = env("SOLR_RECAP_HOST", default="http://cl-solr:8983") -SOLR_PAGERANK_DEST_DIR = env("SOLR_PAGERANK_DEST_DIR", default="/tmp/") - -######## -# Solr # -######## -SOLR_OPINION_URL = f"{SOLR_HOST}/solr/collection1" -SOLR_AUDIO_URL = f"{SOLR_HOST}/solr/audio" -SOLR_PEOPLE_URL = f"{SOLR_HOST}/solr/person" -SOLR_RECAP_URL = f"{SOLR_RECAP_HOST}/solr/recap" -SOLR_URLS = { - "audio.Audio": SOLR_AUDIO_URL, - "people_db.Person": SOLR_PEOPLE_URL, - "search.Docket": SOLR_RECAP_URL, - "search.RECAPDocument": SOLR_RECAP_URL, - "search.Opinion": SOLR_OPINION_URL, - "search.OpinionCluster": SOLR_OPINION_URL, -} - -SOLR_OPINION_TEST_CORE_NAME = "opinion_test" -SOLR_AUDIO_TEST_CORE_NAME = "audio_test" -SOLR_PEOPLE_TEST_CORE_NAME = "person_test" -SOLR_RECAP_TEST_CORE_NAME = "recap_test" - -SOLR_OPINION_TEST_URL = f"{SOLR_HOST}/solr/opinion_test" -SOLR_AUDIO_TEST_URL = f"{SOLR_HOST}/solr/audio_test" -SOLR_PEOPLE_TEST_URL = f"{SOLR_HOST}/solr/person_test" -SOLR_RECAP_TEST_URL = f"{SOLR_RECAP_HOST}/solr/recap_test" -SOLR_TEST_URLS = { - "audio.Audio": SOLR_AUDIO_TEST_URL, - "people_db.Person": SOLR_PEOPLE_TEST_URL, - "search.Docket": SOLR_RECAP_TEST_URL, - "search.RECAPDocument": SOLR_RECAP_TEST_URL, - "search.Opinion": SOLR_OPINION_TEST_URL, - "search.OpinionCluster": SOLR_OPINION_TEST_URL, -} -SOLR_EXAMPLE_CORE_PATH = os.path.join( - os.sep, "usr", "local", "solr", "example", "solr", "collection1" -) -SOLR_TEMP_CORE_PATH_LOCAL = os.path.join(os.sep, "tmp", "solr") -SOLR_TEMP_CORE_PATH_DOCKER = os.path.join(os.sep, "tmp", "solr") - - ################### # Related content # ################### From 33f418137ecef9c17dc1917f89f1c7fee0815f06 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 17:45:59 -0400 
Subject: [PATCH 12/67] build(deps): Removes scorched --- poetry.lock | 25 +------------------------ pyproject.toml | 1 - 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/poetry.lock b/poetry.lock index d1cd5764ce..313f784276 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4435,29 +4435,6 @@ dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodest doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.13.1)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<=7.3.7)", "sphinx-design (>=0.4.0)"] test = ["Cython", "array-api-strict (>=2.0)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] -[[package]] -name = "scorched" -version = "0.13.1.dev0" -description = "" -optional = false -python-versions = "*" -files = [] -develop = false - -[package.dependencies] -pytz = "*" -requests = "*" -setuptools = "*" - -[package.extras] -test = ["coverage", "mock", "nose"] - -[package.source] -type = "git" -url = "https://github.com/freelawproject/scorched.git" -reference = "main" -resolved_reference = "0632024e72e22a71e17cdb778805561f7cdd33d8" - [[package]] name = "seal-rookery" version = "2.2.5" @@ -5713,4 +5690,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.13, <3.14" -content-hash = "35fd59ce49427641e8af1c9af7bc587e4bd35d8b6ecb2af5e6b18a058e19ecd5" +content-hash = "638849e8b93312af48bcd4bae74b198fde89a59a666e21b546ada9dae6a656c4" diff --git a/pyproject.toml b/pyproject.toml index 5406c05f3d..03f2c0b45c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,7 +75,6 @@ simplejson = "^3.19.3" timeout-decorator = "*" unidecode = "*" usaddress = "^0.5.11" -scorched = {git = "https://github.com/freelawproject/scorched.git", branch="main"} djangorestframework-filters = "1.0.0.dev2" 
gunicorn = "^23.0.0" django-hCaptcha = "^0.2.0" From 34566dbc87741da7085b27cdcf171cab38aaf201 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 17:47:25 -0400 Subject: [PATCH 13/67] feat(corpus_importer): Removes unused import --- .../management/commands/scrape_pacer_free_opinions.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py index 5d7e28c94a..4fd020e42b 100644 --- a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py +++ b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py @@ -32,7 +32,6 @@ from cl.scrapers.models import PACERFreeDocumentLog, PACERFreeDocumentRow from cl.scrapers.tasks import extract_recap_pdf from cl.search.models import Court, RECAPDocument -from cl.search.tasks import add_docket_to_solr_by_rds def get_last_complete_date( @@ -381,9 +380,8 @@ def ocr_available(queue: str, index: bool) -> None: queue=q ).apply_async() else: - chain( - extract_recap_pdf.si(pk, ocr_available=True).set(queue=q), - add_docket_to_solr_by_rds.s().set(queue=q), + extract_recap_pdf.si(pk, ocr_available=True).set( + queue=q ).apply_async() if i % 1000 == 0: logger.info(f"Sent {i + 1}/{count} tasks to celery so far.") From abfc40b6797024c17140810548ab65f181bce074 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 19:07:01 -0400 Subject: [PATCH 14/67] feat(recap): Remove add_to_solr argument from save_iquery_to_docket --- cl/corpus_importer/tasks.py | 1 - cl/recap/mergers.py | 2 -- cl/scrapers/tasks.py | 1 - 3 files changed, 4 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index fdea523675..d0760cb2e9 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -1299,7 +1299,6 @@ def make_docket_by_iquery_base( report_text, d, tag_names, - add_to_solr=True, 
avoid_trigger_signal=avoid_trigger_signal, ) diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py index 6fd3f6bf4a..158bb9643a 100644 --- a/cl/recap/mergers.py +++ b/cl/recap/mergers.py @@ -1918,7 +1918,6 @@ def save_iquery_to_docket( iquery_text: str, d: Docket, tag_names: Optional[List[str]], - add_to_solr: bool = False, avoid_trigger_signal: bool = False, ) -> Optional[int]: """Merge iquery results into a docket @@ -1928,7 +1927,6 @@ def save_iquery_to_docket( :param iquery_text: The HTML text data from a successful iquery response :param d: A docket object to work with :param tag_names: Tags to add to the items - :param add_to_solr: Whether to save the completed docket to solr :param avoid_trigger_signal: Whether to avoid triggering the iquery sweep signal. Useful for ignoring reports added by the probe daemon or the iquery sweep itself. diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py index 7bbc8bb40b..b45dcd922e 100644 --- a/cl/scrapers/tasks.py +++ b/cl/scrapers/tasks.py @@ -448,5 +448,4 @@ def update_docket_info_iquery(self, d_pk: int, court_id: str) -> None: report.response.text, d, tag_names=None, - add_to_solr=True, ) From 31f73fc222fc945d5794303ecf05aac7b0d4e822 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 19:23:20 -0400 Subject: [PATCH 15/67] feat(audio): Simplifies model by removing custom save logic --- cl/audio/factories.py | 9 --------- cl/audio/models.py | 17 ----------------- .../management/commands/clone_from_cl.py | 3 +-- cl/stats/views.py | 2 +- 4 files changed, 2 insertions(+), 29 deletions(-) diff --git a/cl/audio/factories.py b/cl/audio/factories.py index 6c6d1c594c..af689715e0 100644 --- a/cl/audio/factories.py +++ b/cl/audio/factories.py @@ -16,15 +16,6 @@ class Meta: sha1 = Faker("sha1") download_url = Faker("url") - @classmethod - def _create(cls, model_class, *args, **kwargs): - """Creates an instance of the model class without indexing.""" - obj = model_class(*args, **kwargs) - # explicitly sets 
`index=False` to prevent it from being indexed in SOLR. - # Once Solr is removed, we can just remove this method completely. - obj.save(index=False) - return obj - """ These hooks are necessary to make this factory compatible with the `make_dev_command`. by delegating the file creation to the hooks, we prevent diff --git a/cl/audio/models.py b/cl/audio/models.py index 992c3a311f..0ea5438de2 100644 --- a/cl/audio/models.py +++ b/cl/audio/models.py @@ -196,23 +196,6 @@ def __str__(self) -> str: def get_absolute_url(self) -> str: return reverse("view_audio_file", args=[self.pk, self.docket.slug]) - def save( # type: ignore[override] - self, - index: bool = True, - force_commit: bool = False, - *args: List, - **kwargs: Dict, - ) -> None: - """ - Overrides the normal save method, but provides integration with the - bulk files and with Solr indexing. - - :param index: Should the item be added to the Solr index? - :param force_commit: Should a commit be performed in solr after - indexing it? - """ - super().save(*args, **kwargs) # type: ignore - def as_search_dict(self) -> Dict[str, Union[int, List[int], str]]: """Create a dict that can be ingested by Solr""" # IDs diff --git a/cl/scrapers/management/commands/clone_from_cl.py b/cl/scrapers/management/commands/clone_from_cl.py index f1311e2b59..5401011184 100644 --- a/cl/scrapers/management/commands/clone_from_cl.py +++ b/cl/scrapers/management/commands/clone_from_cl.py @@ -527,8 +527,7 @@ def clone_audio_files( audio.local_path_mp3.save(file_name, cf, save=False) with transaction.atomic(): - # Prevent solr from indexing the file - audio.save(index=False) + audio.save() print(f"Cloned audio with id {audio_id}") diff --git a/cl/stats/views.py b/cl/stats/views.py index 56ef4e5ff9..3bdd8e344c 100644 --- a/cl/stats/views.py +++ b/cl/stats/views.py @@ -18,7 +18,7 @@ def health_check(request: HttpRequest) -> JsonResponse: is_postgresql_up = check_postgresql() status = HTTPStatus.OK - if not all([is_redis_up, is_postgresql_up, 
is_solr_up]): + if not all([is_redis_up, is_postgresql_up]): status = HTTPStatus.INTERNAL_SERVER_ERROR return JsonResponse( From 37baaf48683ce97346b86b5ebcac44296601d765 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 19:46:54 -0400 Subject: [PATCH 16/67] feat(audio): Remove as_search_dict method --- cl/audio/models.py | 68 +--------------------------------------------- 1 file changed, 1 insertion(+), 67 deletions(-) diff --git a/cl/audio/models.py b/cl/audio/models.py index 0ea5438de2..7ec48d8a5b 100644 --- a/cl/audio/models.py +++ b/cl/audio/models.py @@ -2,21 +2,12 @@ import pghistory from django.db import models -from django.template import loader -from django.urls import NoReverseMatch, reverse +from django.urls import reverse from model_utils import FieldTracker -from cl.custom_filters.templatetags.text_filters import best_case_name -from cl.lib.date_time import midnight_pt from cl.lib.model_helpers import make_upload_path from cl.lib.models import AbstractDateTimeModel, s3_warning_note -from cl.lib.search_index_utils import ( - InvalidDocumentError, - normalize_search_dicts, - null_map, -) from cl.lib.storage import IncrementingAWSMediaStorage -from cl.lib.utils import deepgetattr from cl.people_db.models import Person from cl.search.models import SOURCES, Docket @@ -196,63 +187,6 @@ def __str__(self) -> str: def get_absolute_url(self) -> str: return reverse("view_audio_file", args=[self.pk, self.docket.slug]) - def as_search_dict(self) -> Dict[str, Union[int, List[int], str]]: - """Create a dict that can be ingested by Solr""" - # IDs - out = { - "id": self.pk, - "docket_id": self.docket_id, - "court_id": self.docket.court_id, - } - - # Docket - docket = {"docketNumber": self.docket.docket_number} - if self.docket.date_argued is not None: - docket["dateArgued"] = midnight_pt(self.docket.date_argued) - if self.docket.date_reargued is not None: - docket["dateReargued"] = midnight_pt(self.docket.date_reargued) - if 
self.docket.date_reargument_denied is not None: - docket["dateReargumentDenied"] = midnight_pt( - self.docket.date_reargument_denied - ) - out.update(docket) - - # Court - out.update( - { - "court": self.docket.court.full_name, - "court_citation_string": self.docket.court.citation_string, - "court_exact": self.docket.court_id, # For faceting - } - ) - - # Audio File - out.update( - { - "caseName": best_case_name(self), - "panel_ids": [judge.pk for judge in self.panel.all()], - "judge": self.judges, - "file_size_mp3": deepgetattr( - self, "local_path_mp3.size", None - ), - "duration": self.duration, - "source": self.source, - "download_url": self.download_url, - "local_path": deepgetattr(self, "local_path_mp3.name", None), - } - ) - try: - out["absolute_url"] = self.get_absolute_url() - except NoReverseMatch: - raise InvalidDocumentError( - f"Unable to save to index due to missing absolute_url: {self.pk}" - ) - - text_template = loader.get_template("indexes/audio_text.txt") - out["text"] = text_template.render({"item": self}).translate(null_map) - - return normalize_search_dicts(out) - @pghistory.track( pghistory.InsertEvent(), pghistory.DeleteEvent(), obj_field=None From fb7f45943390fe4f427d3ec6088aaa3baa65544e Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 19:47:08 -0400 Subject: [PATCH 17/67] feat(people_db): Remove as_search_dict method --- cl/people_db/models.py | 135 ----------------------------------------- 1 file changed, 135 deletions(-) diff --git a/cl/people_db/models.py b/cl/people_db/models.py index 25e9f9bec1..75c8346694 100644 --- a/cl/people_db/models.py +++ b/cl/people_db/models.py @@ -1,6 +1,5 @@ import pghistory from django.db import models -from django.template import loader from django.urls import reverse from django.utils.text import slugify from localflavor.us.models import ( @@ -11,7 +10,6 @@ from model_utils import FieldTracker from cl.custom_filters.templatetags.extras import granular_date -from cl.lib.date_time import 
midnight_pt from cl.lib.model_helpers import ( make_choices_group_lookup, validate_all_or_none, @@ -25,11 +23,6 @@ validate_supervisor, ) from cl.lib.models import AbstractDateTimeModel -from cl.lib.search_index_utils import ( - normalize_search_dicts, - null_map, - solr_list, -) from cl.lib.string_utils import trunc from cl.search.models import Court @@ -279,134 +272,6 @@ def is_judge(self) -> bool: position.is_judicial_position for position in self.positions.all() ) - def as_search_dict(self): - """Create a dict that can be ingested by Solr""" - out = { - "id": self.pk, - "fjc_id": self.fjc_id, - "cl_id": "none", # Deprecated, but required by Solr - "alias_ids": [alias.pk for alias in self.aliases.all()], - "races": [r.get_race_display() for r in self.race.all()], - "gender": self.get_gender_display(), - "religion": self.religion, - "name": self.name_full, - "name_reverse": self.name_full_reverse, - "date_granularity_dob": self.date_granularity_dob, - "date_granularity_dod": self.date_granularity_dod, - "dob_city": self.dob_city, - "dob_state": self.get_dob_state_display(), - "dob_state_id": self.dob_state, - "absolute_url": self.get_absolute_url(), - "school": [e.school.name for e in self.educations.all()], - "political_affiliation": [ - pa.get_political_party_display() - for pa in self.political_affiliations.all() - if pa - ], - "political_affiliation_id": [ - pa.political_party - for pa in self.political_affiliations.all() - if pa - ], - "aba_rating": [ - r.get_rating_display() for r in self.aba_ratings.all() if r - ], - } - - # Dates - if self.date_dob is not None: - out["dob"] = midnight_pt(self.date_dob) - if self.date_dod is not None: - out["dod"] = midnight_pt(self.date_dod) - - # Joined Values. Brace yourself. 
- positions = self.positions.all() - if positions.count() > 0: - p_out = { - "court": [p.court.short_name for p in positions if p.court], - "court_exact": [p.court.pk for p in positions if p.court], - "position_type": [ - p.get_position_type_display() for p in positions - ], - "appointer": [ - p.appointer.person.name_full_reverse - for p in positions - if p.appointer - ], - "supervisor": [ - p.supervisor.name_full_reverse - for p in positions - if p.supervisor - ], - "predecessor": [ - p.predecessor.name_full_reverse - for p in positions - if p.predecessor - ], - "date_nominated": solr_list(positions, "date_nominated"), - "date_elected": solr_list(positions, "date_elected"), - "date_recess_appointment": solr_list( - positions, - "date_recess_appointment", - ), - "date_referred_to_judicial_committee": solr_list( - positions, - "date_referred_to_judicial_committee", - ), - "date_judicial_committee_action": solr_list( - positions, - "date_judicial_committee_action", - ), - "date_hearing": solr_list(positions, "date_hearing"), - "date_confirmation": solr_list(positions, "date_confirmation"), - "date_start": solr_list(positions, "date_start"), - "date_granularity_start": solr_list( - positions, - "date_granularity_start", - ), - "date_retirement": solr_list( - positions, - "date_retirement", - ), - "date_termination": solr_list( - positions, - "date_termination", - ), - "date_granularity_termination": solr_list( - positions, - "date_granularity_termination", - ), - "judicial_committee_action": [ - p.get_judicial_committee_action_display() - for p in positions - if p.judicial_committee_action - ], - "nomination_process": [ - p.get_nomination_process_display() - for p in positions - if p.nomination_process - ], - "selection_method": [ - p.get_how_selected_display() - for p in positions - if p.how_selected - ], - "selection_method_id": [ - p.how_selected for p in positions if p.how_selected - ], - "termination_reason": [ - p.get_termination_reason_display() - for p in 
positions - if p.termination_reason - ], - } - out.update(p_out) - - text_template = loader.get_template("indexes/person_text.txt") - out["text"] = text_template.render({"item": self}).translate(null_map) - - return normalize_search_dicts(out) - @pghistory.track() class School(AbstractDateTimeModel): From 8b68c392e14de9d87b5b32794a5a1a31141b483e Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 19:47:20 -0400 Subject: [PATCH 18/67] feat(search): Removes as_search_dict method --- cl/search/models.py | 377 +------------------------------------------- 1 file changed, 1 insertion(+), 376 deletions(-) diff --git a/cl/search/models.py b/cl/search/models.py index 5a4eb4a6fc..c84cc0c907 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -14,7 +14,6 @@ from django.db import IntegrityError, models, transaction from django.db.models import Q, QuerySet from django.db.models.functions import MD5 -from django.template import loader from django.urls import NoReverseMatch, reverse from django.utils import timezone from django.utils.encoding import force_str @@ -38,14 +37,9 @@ make_upload_path, ) from cl.lib.models import AbstractDateTimeModel, AbstractPDF, s3_warning_note -from cl.lib.search_index_utils import ( - InvalidDocumentError, - normalize_search_dicts, - null_map, -) +from cl.lib.search_index_utils import InvalidDocumentError from cl.lib.storage import IncrementingAWSMediaStorage from cl.lib.string_utils import trunc -from cl.lib.utils import deepgetattr from cl.search.docket_sources import DocketSources from cl.users.models import User @@ -993,117 +987,6 @@ def pacer_status_url(self): def pacer_view_doc_url(self): return self.pacer_district_url("qryDocument.pl") - def as_search_list(self): - """Create list of search dicts from a single docket. This should be - faster than creating a search dict per document on the docket. 
- """ - search_list = [] - - # Docket - out = { - "docketNumber": self.docket_number, - "caseName": best_case_name(self), - "suitNature": self.nature_of_suit, - "cause": self.cause, - "juryDemand": self.jury_demand, - "jurisdictionType": self.jurisdiction_type, - } - if self.date_argued is not None: - out["dateArgued"] = midnight_pt(self.date_argued) - if self.date_filed is not None: - out["dateFiled"] = midnight_pt(self.date_filed) - if self.date_terminated is not None: - out["dateTerminated"] = midnight_pt(self.date_terminated) - try: - out["docket_absolute_url"] = self.get_absolute_url() - except NoReverseMatch: - raise InvalidDocumentError( - f"Unable to save to index due to missing absolute_url: {self.pk}" - ) - - # Judges - if self.assigned_to is not None: - out["assignedTo"] = self.assigned_to.name_full - elif self.assigned_to_str: - out["assignedTo"] = self.assigned_to_str - if self.referred_to is not None: - out["referredTo"] = self.referred_to.name_full - elif self.referred_to_str: - out["referredTo"] = self.referred_to_str - - # Court - out.update( - { - "court": self.court.full_name, - "court_exact": self.court_id, # For faceting - "court_citation_string": self.court.citation_string, - } - ) - - # Do RECAPDocument and Docket Entries in a nested loop - for de in self.docket_entries.all().iterator(): - # Docket Entry - de_out = { - "description": de.description, - } - if de.entry_number is not None: - de_out["entry_number"] = de.entry_number - if de.date_filed is not None: - de_out["entry_date_filed"] = midnight_pt(de.date_filed) - rds = de.recap_documents.all() - - if len(rds) == 0: - # Minute entry or other entry that lacks docs. - # For now, we punt. 
- # https://github.com/freelawproject/courtlistener/issues/784 - continue - - for rd in rds: - # IDs - rd_out = { - "id": rd.pk, - "docket_entry_id": de.pk, - "docket_id": self.pk, - "court_id": self.court.pk, - "assigned_to_id": getattr(self.assigned_to, "pk", None), - "referred_to_id": getattr(self.referred_to, "pk", None), - } - - # RECAPDocument - rd_out.update( - { - "short_description": rd.description, - "document_type": rd.get_document_type_display(), - "document_number": rd.document_number or None, - "attachment_number": rd.attachment_number, - "is_available": rd.is_available, - "page_count": rd.page_count, - } - ) - if rd.filepath_local: - rd_out["filepath_local"] = rd.filepath_local.name - try: - rd_out["absolute_url"] = rd.get_absolute_url() - except NoReverseMatch: - raise InvalidDocumentError( - "Unable to save to index due to missing " - f"absolute_url: {self.pk}" - ) - - text_template = loader.get_template("indexes/dockets_text.txt") - rd_out["text"] = text_template.render({"item": rd}).translate( - null_map - ) - - # Ensure that loops to bleed into each other - out_copy = out.copy() - out_copy.update(rd_out) - out_copy.update(de_out) - - search_list.append(normalize_search_dicts(out_copy)) - - return search_list - def reprocess_recap_content(self, do_original_xml: bool = False) -> None: """Go over any associated RECAP files and reprocess them. @@ -1730,57 +1613,6 @@ def get_docket_metadata(self): ) return out - def as_search_dict(self, docket_metadata=None): - """Create a dict that can be ingested by Solr. - - Search results are presented as Dockets, but they're indexed as - RECAPDocument's, which are then grouped back together in search results - to form Dockets. - - Since it's common to update an entire docket, there's a shortcut, - get_docket_metadata that lets you query that information first and then - pass it in as an argument so that it doesn't have to be queried for - every RECAPDocument on the docket. 
This can provide big performance - boosts. - """ - out = docket_metadata or self.get_docket_metadata() - - # IDs - out.update({"id": self.pk, "docket_entry_id": self.docket_entry.pk}) - - # RECAPDocument - out.update( - { - "short_description": self.description, - "document_type": self.get_document_type_display(), - "document_number": self.document_number or None, - "attachment_number": self.attachment_number, - "is_available": self.is_available, - "page_count": self.page_count, - } - ) - if self.filepath_local: - out["filepath_local"] = self.filepath_local.name - - try: - out["absolute_url"] = self.get_absolute_url() - except NoReverseMatch: - raise InvalidDocumentError( - f"Unable to save to index due to missing absolute_url: {self.pk}" - ) - - # Docket Entry - out["description"] = self.docket_entry.description - if self.docket_entry.entry_number is not None: - out["entry_number"] = self.docket_entry.entry_number - if self.docket_entry.date_filed is not None: - out["entry_date_filed"] = midnight_pt(self.docket_entry.date_filed) - - text_template = loader.get_template("indexes/dockets_text.txt") - out["text"] = text_template.render({"item": self}).translate(null_map) - - return normalize_search_dicts(out) - def get_csv_columns(self, get_column_name=False): columns = [ "id", @@ -3085,112 +2917,6 @@ async def asave( **kwargs, ) - def as_search_list(self): - # IDs - out = {} - - # Court - court = { - "court_id": self.docket.court.pk, - "court": self.docket.court.full_name, - "court_citation_string": self.docket.court.citation_string, - "court_exact": self.docket.court_id, - } - out.update(court) - - # Docket - docket = { - "docket_id": self.docket_id, - "docketNumber": self.docket.docket_number, - } - if self.docket.date_argued is not None: - docket["dateArgued"] = midnight_pt(self.docket.date_argued) - if self.docket.date_reargued is not None: - docket["dateReargued"] = midnight_pt(self.docket.date_reargued) - if self.docket.date_reargument_denied is not None: - 
docket["dateReargumentDenied"] = midnight_pt( - self.docket.date_reargument_denied - ) - out.update(docket) - - # Cluster - out.update( - { - "cluster_id": self.pk, - "caseName": best_case_name(self), - "caseNameShort": self.case_name_short, - "panel_ids": [judge.pk for judge in self.panel.all()], - "non_participating_judge_ids": [ - judge.pk for judge in self.non_participating_judges.all() - ], - "judge": self.judges, - "citation": [str(cite) for cite in self.citations.all()], - "scdb_id": self.scdb_id, - "source": self.source, - "attorney": self.attorneys, - "suitNature": self.nature_of_suit, - "citeCount": self.citation_count, - "status": self.get_precedential_status_display(), - "status_exact": self.get_precedential_status_display(), - "sibling_ids": [ - sibling.pk for sibling in self.sub_opinions.all() - ], - } - ) - try: - out["lexisCite"] = str( - self.citations.filter(type=Citation.LEXIS)[0] - ) - except IndexError: - pass - try: - out["neutralCite"] = str( - self.citations.filter(type=Citation.NEUTRAL)[0] - ) - except IndexError: - pass - - if self.date_filed is not None: - out["dateFiled"] = midnight_pt(self.date_filed) - try: - out["absolute_url"] = self.get_absolute_url() - except NoReverseMatch: - raise InvalidDocumentError( - "Unable to save to index due to missing absolute_url " - "(court_id: %s, item.pk: %s). Might the court have in_use set " - "to False?" % (self.docket.court_id, self.pk) - ) - - # Opinion - search_list = [] - text_template = loader.get_template("indexes/opinion_text.txt") - for opinion in self.sub_opinions.all(): - # Always make a copy to get a fresh version above metadata. Failure - # to do this pushes metadata from previous iterations to objects - # where it doesn't belong. 
- out_copy = out.copy() - out_copy.update( - { - "id": opinion.pk, - "cites": [o.pk for o in opinion.opinions_cited.all()], - "author_id": getattr(opinion.author, "pk", None), - "joined_by_ids": [j.pk for j in opinion.joined_by.all()], - "type": opinion.type, - "download_url": opinion.download_url or None, - "local_path": deepgetattr(self, "local_path.name", None), - "text": text_template.render( - { - "item": opinion, - "citation_string": self.citation_string, - } - ).translate(null_map), - } - ) - - search_list.append(normalize_search_dicts(out_copy)) - - return search_list - @pghistory.track( pghistory.InsertEvent(), pghistory.DeleteEvent(), obj_field=None @@ -3564,107 +3290,6 @@ def save( self.clean() super().save(*args, **kwargs) - def as_search_dict(self) -> Dict[str, Any]: - """Create a dict that can be ingested by Solr.""" - # IDs - out = { - "id": self.pk, - "docket_id": self.cluster.docket.pk, - "cluster_id": self.cluster.pk, - "court_id": self.cluster.docket.court.pk, - } - - # Opinion - out.update( - { - "cites": [opinion.pk for opinion in self.opinions_cited.all()], - "author_id": getattr(self.author, "pk", None), - # 'per_curiam': self.per_curiam, - "joined_by_ids": [judge.pk for judge in self.joined_by.all()], - "type": self.type, - "download_url": self.download_url or None, - "local_path": deepgetattr(self, "local_path.name", None), - } - ) - - # Cluster - out.update( - { - "caseName": best_case_name(self.cluster), - "caseNameShort": self.cluster.case_name_short, - "sibling_ids": [sibling.pk for sibling in self.siblings.all()], - "panel_ids": [judge.pk for judge in self.cluster.panel.all()], - "non_participating_judge_ids": [ - judge.pk - for judge in self.cluster.non_participating_judges.all() - ], - "judge": self.cluster.judges, - "citation": [ - str(cite) for cite in self.cluster.citations.all() - ], - "scdb_id": self.cluster.scdb_id, - "source": self.cluster.source, - "attorney": self.cluster.attorneys, - "suitNature": 
self.cluster.nature_of_suit, - "citeCount": self.cluster.citation_count, - "status": self.cluster.get_precedential_status_display(), - "status_exact": self.cluster.get_precedential_status_display(), - } - ) - try: - out["lexisCite"] = str( - self.cluster.citations.filter(type=Citation.LEXIS)[0] - ) - except IndexError: - pass - - try: - out["neutralCite"] = str( - self.cluster.citations.filter(type=Citation.NEUTRAL)[0] - ) - except IndexError: - pass - - if self.cluster.date_filed is not None: - out["dateFiled"] = midnight_pt(self.cluster.date_filed) - try: - out["absolute_url"] = self.cluster.get_absolute_url() - except NoReverseMatch: - raise InvalidDocumentError( - "Unable to save to index due to missing absolute_url " - "(court_id: %s, item.pk: %s). Might the court have in_use set " - "to False?" % (self.cluster.docket.court_id, self.pk) - ) - - # Docket - docket = {"docketNumber": self.cluster.docket.docket_number} - if self.cluster.docket.date_argued is not None: - docket["dateArgued"] = midnight_pt(self.cluster.docket.date_argued) - if self.cluster.docket.date_reargued is not None: - docket["dateReargued"] = midnight_pt( - self.cluster.docket.date_reargued - ) - if self.cluster.docket.date_reargument_denied is not None: - docket["dateReargumentDenied"] = midnight_pt( - self.cluster.docket.date_reargument_denied - ) - out.update(docket) - - court = { - "court": self.cluster.docket.court.full_name, - "court_citation_string": self.cluster.docket.court.citation_string, - "court_exact": self.cluster.docket.court_id, # For faceting - } - out.update(court) - - # Load the document text using a template for cleanup and concatenation - text_template = loader.get_template("indexes/opinion_text.txt") - out["text"] = text_template.render( - {"item": self, "citation_string": self.cluster.citation_string} - ).translate(null_map) - - return normalize_search_dicts(out) - @pghistory.track( pghistory.InsertEvent(), pghistory.DeleteEvent(), obj_field=None From 
5f859b44b1036cedaf44a7d63f928a48422ca9b2 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 19:51:17 -0400 Subject: [PATCH 19/67] feat(lib): Clean up module by removing scorched utils --- cl/lib/scorched_utils.py | 155 --------------------------------------- 1 file changed, 155 deletions(-) delete mode 100644 cl/lib/scorched_utils.py diff --git a/cl/lib/scorched_utils.py b/cl/lib/scorched_utils.py deleted file mode 100644 index f564673fc2..0000000000 --- a/cl/lib/scorched_utils.py +++ /dev/null @@ -1,155 +0,0 @@ -from scorched import SolrInterface -from scorched.exc import SolrError -from scorched.search import Options, SolrSearch - - -class ExtraSolrInterface(SolrInterface): - """Extends the SolrInterface class so that it uses the ExtraSolrSearch - class. - """ - - hl_fields = None - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def query(self, *args, **kwargs): - """ - :returns: SolrSearch -- A solrsearch. - - Build a solr query - """ - # Change this line to hit our class instead of SolrSearch. All the rest - # of this class is the same. - q = ExtraSolrSearch(self) - if len(args) + len(kwargs) > 0: - return q.query(*args, **kwargs) - else: - return q - - def mlt_query(self, hl_fields, *args, **kwargs): - """ - :returns: MoreLikeThisHighlightsSolrSearch -- A MoreLikeThis search with highlights. 
- - Build a solr MLT query - """ - self.hl_fields = hl_fields - q = MoreLikeThisHighlightsSolrSearch(self) - - if len(args) + len(kwargs) > 0: - res = q.query(*args, **kwargs) - else: - res = q - - return res - - -class ExtraSolrSearch(SolrSearch): - """Base class for common search options management""" - - option_modules = ( - "query_obj", - "filter_obj", - "paginator", - "more_like_this", - "highlighter", - "postings_highlighter", - "faceter", - "grouper", - "sorter", - "facet_querier", - "debugger", - "spellchecker", - "requesthandler", - "field_limiter", - "parser", - "pivoter", - "facet_ranger", - "term_vectors", - "stat", - "extra", - ) - - def _init_common_modules(self): - super()._init_common_modules() - self.extra = ExtraOptions() - - def add_extra(self, **kwargs): - newself = self.clone() - newself.extra.update(kwargs) - return newself - - _count = None - - def count(self): - if self._count is None: - # We haven't gotten the count yet. Get it. Clone self for this - # query or else we'll set rows=0 for remainder. - newself = self.clone() - r = newself.add_extra(rows=0).execute() - if r.groups: - total = getattr(r.groups, r.group_field)["ngroups"] - else: - total = r.result.numFound - - # Set the cache - self._count = total - return self._count - - -class ExtraOptions(Options): - def __init__(self, original=None): - if original is None: - self.option_dict = {} - else: - self.option_dict = original.option_dict.copy() - - def update(self, extra_options): - self.option_dict.update(extra_options) - - def options(self): - return self.option_dict - - -class MoreLikeThisHighlightsSolrSearch(ExtraSolrSearch): - """ - By default Solr MoreLikeThis queries do not support highlighting. Thus, we need to produce the highlights in Python. 
- - A MoreLikeThis search with highlight fields that are taken directly from search results - """ - - # Limit length of text field - text_max_length = 500 - - def execute(self, constructor=None): - """ - Execute MLT-query and add highlighting to MLT search results. - """ - - try: - ret = self.interface.mlt_search(**self.options()) - except TypeError: - # Catch exception when seed is not available - raise SolrError( - "Seed documents for MoreLikeThis query do not exist" - ) - - # Add solr_highlighting to MLT results - for doc in ret: - # Initialize empty highlights dict - doc["solr_highlights"] = {} - - # Copy each highlight field - for field_name in self.interface.hl_fields: - if field_name in doc: - if field_name == "text": # max text length - doc[field_name] = doc[field_name][ - : self.text_max_length - ] - - doc["solr_highlights"][field_name] = [doc[field_name]] - - if constructor: - ret = self.constructor(ret, constructor) - - return ret From 559d657233e31a872f50db62d4da48595e49267a Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 19:58:05 -0400 Subject: [PATCH 20/67] feat(citations): Removes command to add parallel citations --- .../commands/add_parallel_citations.py | 332 ------------------ 1 file changed, 332 deletions(-) delete mode 100644 cl/citations/management/commands/add_parallel_citations.py diff --git a/cl/citations/management/commands/add_parallel_citations.py b/cl/citations/management/commands/add_parallel_citations.py deleted file mode 100644 index 1874fd6971..0000000000 --- a/cl/citations/management/commands/add_parallel_citations.py +++ /dev/null @@ -1,332 +0,0 @@ -import sys - -import networkx as nx -from celery.canvas import group -from django.conf import settings -from django.core.management import CommandError, call_command -from django.db import IntegrityError -from eyecite.find import get_citations -from eyecite.tokenizers import HyperscanTokenizer - -from cl.citations.annotate_citations import 
get_and_clean_opinion_text -from cl.citations.match_citations import build_date_range -from cl.citations.tasks import identify_parallel_citations -from cl.citations.utils import get_years_from_reporter -from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.scorched_utils import ExtraSolrInterface -from cl.search.models import Opinion, OpinionCluster - -HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan") - -# Parallel citations need to be identified this many times before they should -# be added to the database. -EDGE_RELEVANCE_THRESHOLD = 20 - - -def make_edge_list(group): - """Convert a list of parallel citations into a list of tuples. - - This satisfied networkx. - """ - out = [] - for i, citation in enumerate(group): - try: - t = (citation, group[i + 1]) - except IndexError: - # End of the list - break - else: - out.append(t) - return out - - -class Command(VerboseCommand): - help = ( - "Parse the entire corpus, identifying parallel citations. Add them " - "to the database if sufficiently accurate and requested by the " - "user." 
- ) - - def __init__(self, stdout=None, stderr=None, no_color=False): - super().__init__(stdout=None, stderr=None, no_color=False) - self.g = nx.Graph() - self.conn = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r") - self.update_count = 0 - - def add_arguments(self, parser): - parser.add_argument( - "--update_database", - action="store_true", - default=False, - help="Save changes to the database", - ) - parser.add_argument( - "--update_solr", - action="store_true", - default=False, - help="Update Solr after updating the database", - ) - parser.add_argument( - "--all", - action="store_true", - default=False, - help="Parse citations for all items", - ) - parser.add_argument( - "--doc-id", - type=int, - nargs="*", - help="ids of citing opinions", - ) - - def match_on_citation(self, citation): - """Attempt to identify the item referred to by the citation.""" - main_params = { - "fq": [ - "status:Precedential", - f'citation:("{citation.corrected_citation()}"~5)', - ], - "caller": "citation.add_parallel_citations", - } - - if citation.year: - start_year = end_year = citation.year - else: - start_year, end_year = get_years_from_reporter(citation) - main_params["fq"].append( - f"dateFiled:{build_date_range(start_year, end_year)}" - ) - - if citation.court: - main_params["fq"].append(f"court_exact:{citation.court}") - - # Query Solr - return self.conn.query().add_extra(**main_params).execute() - - def handle_subgraph(self, sub_graph, options): - """Add edges to the database if significant. - - An earlier version of the code simply looked at each edge, but this - looks at sub_graphs within the main graph. This is different (and - better) because the main graph might have multiple nodes like so: - - A <-- (22 US 33): This node is in the DB already - | - B <-- (2013 LEXIS 223948): This node is not yet in the DB - | - C <-- (2013 WL 3808347): This node is not yet in the DB - | - D <-- This node can be disregarded because it has low edge weight. 
- - If we handled this edge by edge, we might process B --> C before doing - A --> B. If we did that, we'd get zero results for B and C, and we'd - add nothing. That'd be bad, since there's a strong edge between A, B, - and C. - - Instead, we process this as a graph, looking at all the nodes at once. - """ - # Remove nodes that are only connected weakly. - for node in sub_graph.nodes(): - has_good_edge = False - for a, b, data in sub_graph.edges([node], data=True): - if data["weight"] > EDGE_RELEVANCE_THRESHOLD: - has_good_edge = True - break - if not has_good_edge: - sub_graph.remove_node(node) - - if len(sub_graph.nodes()) == 0: - logger.info(" No strong edges found. Pass.\n") - return - - # Look up all remaining nodes in Solr, and make a (node, results) pair. - result_sets = [] - for node in sub_graph.nodes(): - result_sets.append((node, self.match_on_citation(node))) - - if sum(len(results) for node, results in result_sets) == 0: - logger.info(" Got no results for any citation. Pass.\n") - return - - if all(len(results) > 0 for node, results in result_sets): - logger.info(" Got results for all citations. Pass.\n") - return - - # Remove any node-results pairs with more than than one result. - result_sets = list( - filter( - lambda n, r: len(r) > 1, - result_sets, - ) - ) - - # For result_sets with more than 0 results, do all the citations have - # the same ID? - unique_results = { - results[0]["cluster_id"] - for node, results in result_sets - if len(results) > 0 - } - if len(unique_results) > 1: - logger.info(" Got multiple IDs for the citations. Pass.\n") - return - - # Are the number of unique reporters equal to the number of results? - if len( - {node.edition_guess.reporter for node, results in result_sets} - ) != len(result_sets): - logger.info(" Got duplicated reporter in citations. Pass.\n") - return - - # Get the cluster. By now we know all results have either 0 or 1 item. 
- oc = None - for node, results in result_sets: - if len(results) > 0: - oc = OpinionCluster.objects.get(pk=results[0]["cluster_id"]) - break - - if oc is not None: - # Update the cluster with all the nodes that had no results. - for node, results in result_sets: - if len(results) != 0: - continue - - # Create citation objects - c = node.to_model() - c.cluster = oc - self.update_count += 1 - if not options["update_database"]: - continue - - try: - c.save() - except IntegrityError: - logger.info( - "Unable to save '%s' to cluster '%s' due to " - "an IntegrityError. Probably the cluster " - "already has this citation", - c, - oc, - ) - - def add_groups_to_network(self, citation_groups): - """Add the citation groups from an opinion to the global network - object, normalizing the Citation objects. - """ - for group in citation_groups: - edge_list = make_edge_list(group) - for edge in edge_list: - for e in edge: - # Alas, Idaho can be abbreviated as Id. This creates lots of - # problems, so if made a match on "Id." we simple move on. - # Ditto for Cr. (short for Cranch) - if e.groups["reporter"] in ["Id.", "Cr."]: - return - - if self.g.has_edge(*edge): - # Increment the weight of the edge. - self.g[edge[0]][edge[1]]["weight"] += 1 - else: - self.g.add_edge(*edge, weight=1) - - @staticmethod - def do_solr(options): - """Update Solr if requested, or report if not.""" - if options["update_solr"]: - # fmt: off - call_command( - 'cl_update_index', - '--type', 'search.Opinion', - '--solr-url', settings.SOLR_OPINION_URL, - '--noinput', - '--update', - '--everything', - '--do-commit', - ) - # fmt: on - else: - logger.info( - "\nSolr index not updated. You may want to do so " - "manually.\n" - ) - - def handle(self, *args, **options): - """Identify parallel citations and save them as requested. - - This process proceeds in two phases. The first phase is to work through - the entire corpus, identifying citations that occur very near to each - other. 
These are considered parallel citations, and they are built into - a graph data structure where citations are nodes and each parallel - citation is an edge. The weight of each edge is determined by the - number of times a parallel citation has been identified between two - citations. This should solve problems like typos or other issues with - our heuristic approach. - - The second phase of this process is to update the database with the - high quality citations. This can only be done by matching the citations - with actual items in the database and then updating them with parallel - citations that are sufficiently likely to be good. - """ - super().handle(*args, **options) - no_option = not any([options.get("doc_id"), options.get("all")]) - if no_option: - raise CommandError( - "Please specify if you want all items or a specific item." - ) - if not options["update_database"]: - logger.info( - "--update_database is not set. No changes will be made to the " - "database." - ) - - logger.info( - "## Entering phase one: Building a network object of " - "all citations.\n" - ) - opinions = Opinion.objects.all() - if options.get("doc_id"): - opinions = opinions.filter(pk__in=options["doc_id"]) - count = opinions.count() - - node_count = edge_count = completed = 0 - subtasks = [] - for o in opinions.iterator(): - subtasks.append( - identify_parallel_citations.s( - get_citations( - get_and_clean_opinion_text(o).cleaned_text, - tokenizer=HYPERSCAN_TOKENIZER, - ) - ) - ) - last_item = count == completed + 1 - if (completed % 50 == 0) or last_item: - job = group(subtasks) - result = job.apply_async().join() - [ - self.add_groups_to_network(citation_groups) - for citation_groups in result - ] - subtasks = [] - - completed += 1 - if completed % 250 == 0 or last_item: - # Only do this once in a while. - node_count = len(self.g.nodes()) - edge_count = len(self.g.edges()) - sys.stdout.write( - "\r Completed %s of %s. 
(%s nodes, %s edges)" - % (completed, count, node_count, edge_count) - ) - sys.stdout.flush() - - logger.info( - "\n\n## Entering phase two: Saving the best edges to " - "the database.\n\n" - ) - for sub_graph in nx.connected_component_subgraphs(self.g): - self.handle_subgraph(sub_graph, options) - - logger.info(f"\n\n## Done. Added {self.update_count} new citations.") - - self.do_solr(options) From c3e28022e76b7f357a5e68da142938040f650458 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 20:03:28 -0400 Subject: [PATCH 21/67] feat(people_db): Removes command to use FTM API --- .../management/commands/cl_get_ftm_ids.py | 240 ------------------ 1 file changed, 240 deletions(-) delete mode 100644 cl/people_db/management/commands/cl_get_ftm_ids.py diff --git a/cl/people_db/management/commands/cl_get_ftm_ids.py b/cl/people_db/management/commands/cl_get_ftm_ids.py deleted file mode 100644 index 8e145374f8..0000000000 --- a/cl/people_db/management/commands/cl_get_ftm_ids.py +++ /dev/null @@ -1,240 +0,0 @@ -import os -import pickle -from collections import defaultdict - -import requests -from django.conf import settings -from django.utils.timezone import now - -from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.scorched_utils import ExtraSolrInterface -from cl.people_db.import_judges.courtid_levels import courtid2statelevel -from cl.people_db.models import Person - -leveldict = { - "H": "J", # State Supreme Court - "M": "K", # State Appellate Court - "L": "D", # Lower Court -} - -url_template = ( - "https://api.followthemoney.org/?" 
- "f-core=1&" - "c-exi=1&" - "gro=c-t-id&" - "mode=json&" - "APIKey={key}&" - "s={state}&" - "c-r-ot={level}&" - "y={year}" -) - - -def make_dict_of_ftm_eids(use_pickle=True): - """Build up a dictionary mapping jurisdiction IDs to candidates in those - locations - """ - pickle_location = "/tmp/eid_lists.pkl" - if use_pickle: - if os.path.isfile(pickle_location): - with open(pickle_location, "r") as f: - logger.info( - "Loading pickled candidate list. Read the command " - "documentation if this is not desired." - ) - return pickle.load(f) - else: - logger.info("Unable to find pickle file.") - - candidate_eid_lists = defaultdict(list) - - for courtid, (state, level) in courtid2statelevel.items(): - if level != "H": - # We only want high courts. - continue - for year in range(1989, 2017): - url = url_template.format( - key=settings.FTM_KEY, - state=state, - level=leveldict[level], - year=year, - ) - logger.info(f"Getting url at: {url}") - data = requests.get(url, timeout=30).json() - - if data["records"] == ["No Records"]: - logger.info( - f" No records found in court {courtid} and year {year}." - ) - continue - logger.info( - " Found %s records in court %s and year %s" - % (len(data["records"]), courtid, year) - ) - - for item in data["records"]: - # add an eid, name, year tuple to this court's list - candidate_eid_lists[courtid].append( - { - "eid": item["Candidate_Entity"]["id"], - "name": item["Candidate_Entity"]["Candidate_Entity"], - "total": float(item["Total_$"]["Total_$"]), - "year": year, - } - ) - - if use_pickle: - with open(pickle_location, "w") as f: - logger.info(f"Creating pickle file at: {pickle_location}") - pickle.dump(candidate_eid_lists, f) - return candidate_eid_lists - - -def clear_old_values(do_it, debug): - """Clear out the old values in the ftm fields. If debug or do_it is False, - don't clear the values. 
- """ - if not do_it or debug: - return - logger.info("Clearing out all old values in FTM fields.") - Person.objects.all().update( - date_modified=now(), ftm_eid="", ftm_total_received=None - ) - - -def print_stats(match_stats, candidate_eid_lists): - """Print the stats.""" - logger.info("#########") - logger.info("# Stats #") - logger.info("#########") - logger.info("Finished matching judges:") - for k, v in match_stats.items(): - logger.info(f" - {v} had {k} matches") - ftm_judge_count = 0 - for v in candidate_eid_lists.values(): - ftm_judge_count += len(v) - logger.info( - f"There were {ftm_judge_count} judges in FTM that we matched against." - ) - - -def update_judges_by_solr(candidate_id_map, debug): - """Update judges by looking up each entity from FTM in Solr.""" - with requests.Session() as session: - conn = ExtraSolrInterface( - settings.SOLR_PEOPLE_URL, http_connection=session, mode="r" - ) - match_stats = defaultdict(int) - # These IDs are ones that cannot be updated due to being identified as - # problematic in FTM's data. - denylisted_ips = defaultdict(set) - for court_id, candidate_list in candidate_id_map.items(): - for candidate in candidate_list: - # Look up the candidate in Solr. - logger.info(f"Doing: {candidate['name']}") - name = ( - " AND ".join( - [ - word - for word in candidate["name"].split() - if len(word) > 1 - ] - ) - ).replace(",", "") - results = ( - conn.query() - .add_extra( - **{ - "caller": "ftm_update_judges_by_solr", - "fq": [ - f"name:({name})", - f"court_exact:{court_id}", - # This filters out Sr/Jr problems by insisting on recent - # positions. 1980 is arbitrary, based on testing. - "date_start:[1980-12-31T23:59:59Z TO *]", - ], - "q": "*", - } - ) - .execute() - ) - - if len(results) == 0: - match_stats[len(results)] += 1 - logger.info("Found no matches.") - - elif len(results) == 1: - match_stats[len(results)] += 1 - logger.info(f"Found one match: {results[0]['name']}") - - # Get the person from the DB and update them. 
- pk = results[0]["id"] - if pk in denylisted_ips: - continue - p = Person.objects.get(pk=pk) - if p.ftm_eid: - if p.ftm_eid != candidate["eid"]: - logger.info( - " Found values in ftm database fields. " - "This indicates a duplicate in FTM." - ) - - denylisted_ips[p.pk].add(candidate["eid"]) - denylisted_ips[p.pk].add(p.ftm_eid) - p.ftm_eid = "" - p.ftm_total_received = None - else: - logger.info( - "Found values with matching EID. Adding " - "amounts, since this indicates multiple " - "jurisdictions that the judge was in." - ) - p.ftm_total_received += candidate["total"] - if not debug: - p.save() - else: - # No major problems. Proceed. - p.ftm_eid = candidate["eid"] - p.ftm_total_received = candidate["total"] - if not debug: - p.save() - - elif len(results) > 1: - match_stats[len(results)] += 1 - logger.info(f" Found more than one match: {results}") - - print_stats(match_stats, candidate_id_map) - logger.info(f"Denylisted IDs: {denylisted_ips}") - - -class Command(VerboseCommand): - help = ( - "Use the Follow the Money API to lookup judges by name and " - "jurisdiction. Once looked up, save the ID to the DB." 
- ) - - def add_arguments(self, parser): - parser.add_argument( - "--debug", - action="store_true", - default=False, - help="Don't change the data.", - ) - parser.add_argument( - "--dont-use-pickle", - action="store_false", - default=True, - help="Don't use a pickle file if one exists.", - ) - parser.add_argument( - "--clear-old-values", - action="store_true", - default=False, - help="Clear out the old values before beginning.", - ) - - def handle(self, *args, **options): - super().handle(*args, **options) - candidate_id_map = make_dict_of_ftm_eids(options["dont_use_pickle"]) - clear_old_values(options["clear_old_values"], options["debug"]) - update_judges_by_solr(candidate_id_map, debug=options["debug"]) From 3105545f84ef38dfd57921b8370398b3892fc91c Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 20:05:59 -0400 Subject: [PATCH 22/67] feat(lib): Removes helpers related to SOLR administration --- cl/lib/solr_core_admin.py | 105 -------------------------------------- 1 file changed, 105 deletions(-) delete mode 100644 cl/lib/solr_core_admin.py diff --git a/cl/lib/solr_core_admin.py b/cl/lib/solr_core_admin.py deleted file mode 100644 index 70a3a1553b..0000000000 --- a/cl/lib/solr_core_admin.py +++ /dev/null @@ -1,105 +0,0 @@ -import io -import json -from typing import Dict, List, Union, cast - -import lxml -import requests -from django.conf import settings -from lxml.etree import _ElementTree -from scorched.exc import SolrError - - -def swap_solr_core( - current_core: str, - desired_core: str, - url: str = settings.SOLR_HOST, -) -> None: - """Swap cores, keeping on on deck for easy reversion. - - @current_core is the core you are currently using which will be swapped OUT. - @desired_core is the core you intend to make live which will be swapped IN. 
- """ - params = { - "wt": "json", - "action": "SWAP", - "core": current_core, - "other": desired_core, - } - r = requests.get(f"{url}/solr/admin/cores", params=params, timeout=30) - if r.status_code != 200: - print( - "Problem swapping cores. Got status_code of %s. " - "Check the Solr logs for details." % r.status_code - ) - - -def get_solr_core_status( - core: str = "all", - url: str = settings.SOLR_HOST, -) -> _ElementTree: - """Get the status for the solr core as an XML document.""" - if core == "all": - core_query = "" - else: - core_query = f"&core={core}" - r = requests.get( - f"{url}/solr/admin/cores?action=STATUS{core_query}", - timeout=10, - ) - if r.status_code != 200: - print( - "Problem getting the core status. Got status_code of %s. " - "Check the Solr logs for details." % r.status_code - ) - - try: - solr_config = lxml.etree.parse(io.BytesIO(r.content)) - except lxml.etree.XMLSyntaxError as e: - raise SolrError(f"Invalid XML in schema:\n{e.args[0]}") - - return solr_config - - -def get_term_frequency( - count: int = 500, - result_type: str = "dict", - field: str = "text", - url: str = settings.SOLR_HOST, -) -> Union[Dict[str, int], List[str]]: - """Get the term frequency in the index. - - result_type can be json, list or dict. - """ - params = { - "fl": field, - "numTerms": str(count), - "wt": "json", - } - r = requests.get(f"{url}/solr/admin/luke", params=params, timeout=10) - content_as_json = json.loads(r.content) - if result_type == "list": - if len(content_as_json["fields"]) == 0: - return [] - else: - top_terms = [] - for result in content_as_json["fields"]["text"]["topTerms"]: - # Top terms is a list of alternating terms and counts. Their - # types are different, so we'll use that. 
- if isinstance(result, str): - top_terms.append(result) - return top_terms - elif result_type == "dict": - if len(content_as_json["fields"]) == 0: - return {} - else: - top_terms_dict = {} - for result in content_as_json["fields"]["text"]["topTerms"]: - # We set aside the term until we reach its count, then we add - # them as a k,v pair - if isinstance(result, str): - key = result - else: - top_terms_dict[key] = result - return top_terms_dict - else: - raise ValueError("Unknown output type!") From 8feb6c37b3f5c09ed2aec9f8bbce2ddeacb3d9f4 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 20:13:06 -0400 Subject: [PATCH 23/67] feat(lib): Remove unused normalize_search_dicts helper --- cl/lib/search_index_utils.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/cl/lib/search_index_utils.py b/cl/lib/search_index_utils.py index 8e4653cb5b..4e837f85d7 100644 --- a/cl/lib/search_index_utils.py +++ b/cl/lib/search_index_utils.py @@ -28,32 +28,6 @@ def __init__(self, message): ) -def normalize_search_dicts(d): - """Prepare search dicts for indexing by solr. - - 1. Remove any kv from a dictionary if v is None - - This is needed to send dictionaries to Scorched, instead of - sending objects, and should provide a performance improvement. If you try - to send None values to integer fields (for example), things break, b/c - integer fields shouldn't be getting None values. Fair 'nuf. - - 2. Convert any sets to lists. - - This is needed because sets aren't JSON serializable, but they're - convenient to use when building up a search object. - """ - new_dict = {} - for k, v in d.items(): - if v is None: - continue - if isinstance(v, set): - new_dict[k] = list(v) - else: - new_dict[k] = v - return new_dict - - def get_parties_from_case_name(case_name: str) -> list[str]: """Extracts the parties from case_name by splitting on common case_name separators. 
From 92b87c81cb3829f8c9b423f3444ad9feca0e9b11 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 21:03:56 -0400 Subject: [PATCH 24/67] refactor(recap_rss): Simplify merge_rss_feed_contents This commit streamlines the merge_rss_feed_contents helper by removing the unnecessary return of Recap document IDs. It now returns only the relevant tuples containing docket IDs. --- cl/recap_rss/tasks.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cl/recap_rss/tasks.py b/cl/recap_rss/tasks.py index 82b5cd2f24..c80ac96198 100644 --- a/cl/recap_rss/tasks.py +++ b/cl/recap_rss/tasks.py @@ -311,7 +311,9 @@ async def cache_hash(item_hash): @app.task(bind=True, max_retries=1) -def merge_rss_feed_contents(self, feed_data, court_pk, metadata_only=False): +def merge_rss_feed_contents( + self, feed_data, court_pk, metadata_only=False +) -> list[tuple[int, datetime]]: """Merge the rss feed contents into CourtListener :param self: The Celery task @@ -319,9 +321,7 @@ def merge_rss_feed_contents(self, feed_data, court_pk, metadata_only=False): already queried the feed and been parsed. :param court_pk: The CourtListener court ID. :param metadata_only: Whether to only do metadata and skip docket entries. 
- :returns Dict containing keys: - d_pks_to_alert: A list of (docket, alert_time) tuples for sending alerts - rds_for_solr: A list of RECAPDocument PKs for updating in Solr + :returns A list of (docket ids, alert_time) tuples for sending alerts """ start_time = now() @@ -374,13 +374,13 @@ def merge_rss_feed_contents(self, feed_data, court_pk, metadata_only=False): all_rds_created.extend([rd.pk for rd in rds_created]) logger.info( - "%s: Sending %s new RECAP documents to Solr for indexing and " + "%s: Sending %s new RECAP documents for indexing and " "sending %s dockets for alerts.", court_pk, len(all_rds_created), len(d_pks_to_alert), ) - return {"d_pks_to_alert": d_pks_to_alert, "rds_for_solr": all_rds_created} + return d_pks_to_alert @app.task From 725152621a699729757c1389c92cce178f009ec9 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 21:10:53 -0400 Subject: [PATCH 25/67] feat(alerts): Simplify send_alerts_and_webhooks This commit streamlines the send_alerts_and_webhooks method by removing the return of Recap document IDs. It now returns an empty list to indicate successful completion. --- cl/alerts/tasks.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/cl/alerts/tasks.py b/cl/alerts/tasks.py index e183a0c914..ac20fa6f58 100644 --- a/cl/alerts/tasks.py +++ b/cl/alerts/tasks.py @@ -373,27 +373,20 @@ def send_alert_and_webhook( @app.task(ignore_result=True) -def send_alerts_and_webhooks( - data: Dict[str, Union[List[Tuple], List[int]]] -) -> List[int]: +def send_alerts_and_webhooks(data: list[tuple[int, datetime]]) -> List[int]: """Send many docket alerts at one time without making numerous calls to the send_alert_and_webhook function. - :param data: A dict with up to two keys: + :param data: A list of tuples. Each tuple contains the docket ID, and + a time. The time indicates that alerts should be sent for + items *after* that point. - d_pks_to_alert: A list of tuples. 
Each tuple contains the docket ID, and - a time. The time indicates that alerts should be sent for - items *after* that point. - rds_for_solr: A list of RECAPDocument ids that need to be sent to Solr - to be made searchable. - :returns: Simply passes through the rds_for_solr list, in case it is - consumed by the next task. If rds_for_solr is not provided, returns an - empty list. + :returns: An empty list """ - for args in data["d_pks_to_alert"]: + for args in data: send_alert_and_webhook(*args) - return cast(List[int], data.get("rds_for_solr", [])) + return [] @app.task(ignore_result=True) From f782e7841744420aebf8f44dd878026197201955 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 21:16:40 -0400 Subject: [PATCH 26/67] feat(citations): Clean up Scorched imports This commit removes the unnecessary type import from Scorched. --- cl/citations/match_citations.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cl/citations/match_citations.py b/cl/citations/match_citations.py index c561518f28..d113770ae4 100644 --- a/cl/citations/match_citations.py +++ b/cl/citations/match_citations.py @@ -16,7 +16,6 @@ ) from eyecite.test_factories import case_citation from eyecite.utils import strip_punct -from scorched.response import SolrResponse from cl.citations.match_citations_queries import es_search_db_for_full_citation from cl.citations.types import ( @@ -65,7 +64,7 @@ def resolve_fullcase_citation( ) -> MatchedResourceType: # Case 1: FullCaseCitation if type(full_citation) is FullCaseCitation: - db_search_results: SolrResponse | list[Hit] + db_search_results: list[Hit] db_search_results, _ = es_search_db_for_full_citation(full_citation) # If there is one search result, try to return it if len(db_search_results) == 1: From 7189f877126cf15e06ef4a1c1c434fd52fb1a622 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 21:18:17 -0400 Subject: [PATCH 27/67] feat(citations): Remove unused SOLR date range helper This commit 
removes the helper method for building SOLR date ranges, as it's no longer needed. --- cl/citations/match_citations.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cl/citations/match_citations.py b/cl/citations/match_citations.py index d113770ae4..84570e405d 100644 --- a/cl/citations/match_citations.py +++ b/cl/citations/match_citations.py @@ -32,14 +32,6 @@ NO_MATCH_RESOURCE = Resource(case_citation(source_text="UNMATCHED_CITATION")) -def build_date_range(start_year: int, end_year: int) -> str: - """Build a date range to be handed off to a solr query.""" - start = datetime(start_year, 1, 1) - end = datetime(end_year, 12, 31) - date_range = f"[{start.isoformat()}Z TO {end.isoformat()}Z]" - return date_range - - def filter_by_matching_antecedent( opinion_candidates: Iterable[Opinion], antecedent_guess: Optional[str], From 292344598ee4ad2c6e831899ce77db32072398a4 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 21:25:13 -0400 Subject: [PATCH 28/67] feat(citations): Remove SOLR dependency from count_citations command --- .../management/commands/count_citations.py | 54 +------------------ 1 file changed, 1 insertion(+), 53 deletions(-) diff --git a/cl/citations/management/commands/count_citations.py b/cl/citations/management/commands/count_citations.py index decac005a5..01fdd01ead 100644 --- a/cl/citations/management/commands/count_citations.py +++ b/cl/citations/management/commands/count_citations.py @@ -1,8 +1,3 @@ -import sys - -from django.conf import settings -from django.core.management import call_command - from cl.lib.command_utils import VerboseCommand from cl.search.models import OpinionCluster @@ -17,48 +12,6 @@ def add_arguments(self, parser): nargs="*", help="ids to process one by one, if desired", ) - parser.add_argument( - "--index", - type=str, - default="all-at-end", - choices=("all-at-end", "concurrently", "False"), - help=( - "When/if to save changes to the Solr index. 
Options are " - "all-at-end, concurrently or False. Saving 'concurrently' " - "is least efficient, since each document is updated once " - "for each citation to it, however this setting will show " - "changes in the index in realtime. Saving 'all-at-end' can " - "be considerably more efficient, but will not show changes " - "until the process has finished and the index has been " - "completely regenerated from the database. Setting this to " - "False disables changes to Solr, if that is what's desired. " - "Finally, only 'concurrently' will avoid reindexing the " - "entire collection. If you are only updating a subset of " - "the opinions, it is thus generally wise to use " - "'concurrently'." - ), - ) - - @staticmethod - def do_solr(options): - """Update Solr if requested, or report if not.""" - if options["index"] == "all-at-end": - # fmt: off - call_command( - 'cl_update_index', - '--type', 'search.Opinion', - '--solr-url', settings.SOLR_OPINION_URL, - '--noinput', - '--update', - '--everything', - '--do-commit', - ) - # fmt: on - elif options["index"] == "False": - sys.stdout.write( - "Solr index not updated after running citation " - "finder. You may want to do so manually." - ) def handle(self, *args, **options): """ @@ -66,9 +19,6 @@ def handle(self, *args, **options): count based on the DB. 
""" super().handle(*args, **options) - index_during_processing = False - if options["index"] == "concurrently": - index_during_processing = True clusters = OpinionCluster.objects.filter(citation_count__gt=0) if options.get("doc_id"): @@ -80,6 +30,4 @@ def handle(self, *args, **options): count += sub_opinion.citing_opinions.all().count() cluster.citation_count = count - cluster.save(index=index_during_processing) - - self.do_solr(options) + cluster.save() From 117e53266f62a14b723a2512f331eb96a3b1c8fb Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 21:35:27 -0400 Subject: [PATCH 29/67] feat(citations): Removes SOLR code from find_citations command --- .../management/commands/find_citations.py | 52 +------------------ 1 file changed, 2 insertions(+), 50 deletions(-) diff --git a/cl/citations/management/commands/find_citations.py b/cl/citations/management/commands/find_citations.py index 4935c1a3f8..ec23e3e913 100644 --- a/cl/citations/management/commands/find_citations.py +++ b/cl/citations/management/commands/find_citations.py @@ -2,8 +2,7 @@ import time from typing import Iterable, List, cast -from django.conf import settings -from django.core.management import CommandError, call_command +from django.core.management import CommandError from django.core.management.base import CommandParser from cl.citations.tasks import ( @@ -60,27 +59,6 @@ def add_arguments(self, parser: CommandParser) -> None: default=False, help="Parse citations for all items", ) - parser.add_argument( - "--index", - type=str, - default="all-at-end", - choices=("all-at-end", "concurrently", "False"), - help=( - "When/if to save changes to the Solr index. Options are " - "all-at-end, concurrently or False. Saving 'concurrently' " - "is least efficient, since each document is updated once " - "for each citation to it, however this setting will show " - "changes in the index in realtime. 
Saving 'all-at-end' can " - "be considerably more efficient, but will not show changes " - "until the process has finished and the index has been " - "completely regenerated from the database. Setting this to " - "False disables changes to Solr, if that is what's desired. " - "Finally, only 'concurrently' will avoid reindexing the " - "entire collection. If you are only updating a subset of " - "the opinions, it is thus generally wise to use " - "'concurrently'." - ), - ) parser.add_argument( "--queue", default="batch1", @@ -114,8 +92,6 @@ def handle(self, *args: List[str], **options: OptionsType) -> None: "everything." ) - self.index = options["index"] - # Use query chaining to build the query query = Opinion.objects.all().order_by("pk") if options.get("doc_id"): @@ -141,7 +117,6 @@ def handle(self, *args: List[str], **options: OptionsType) -> None: self.timings: List[float] = [] opinion_pks = query.values_list("pk", flat=True).iterator() self.update_documents(opinion_pks, cast(str, options["queue"])) - self.add_to_solr(cast(str, options["queue"])) def log_progress(self, processed_count: int, last_pk: int) -> None: if processed_count % 1000 == 1: @@ -171,10 +146,6 @@ def update_documents(self, opinion_pks: Iterable, queue_name: str) -> None: sys.stdout.write(f"Graph size is {self.count:d} nodes.\n") sys.stdout.flush() - index_during_subtask = False - if self.index == "concurrently": - index_during_subtask = True - chunk = [] chunk_size = 100 processed_count = 0 @@ -186,28 +157,9 @@ def update_documents(self, opinion_pks: Iterable, queue_name: str) -> None: chunk.append(opinion_pk) if processed_count % chunk_size == 0 or last_item: find_citations_and_parentheticals_for_opinion_by_pks.apply_async( - args=(chunk, index_during_subtask), + args=(chunk, True), queue=queue_name, ) chunk = [] self.log_progress(processed_count, opinion_pk) - - def add_to_solr(self, queue_name: str) -> None: - if self.index == "all-at-end": - # fmt: off - call_command( - 'cl_update_index', 
- '--type', 'search.Opinion', - '--solr-url', settings.SOLR_OPINION_URL, - '--noinput', - '--update', - '--everything', - '--queue', queue_name, - ) - # fmt: on - elif self.index == "False": - sys.stdout.write( - "Solr index not updated after running citation " - "finder. You may want to do so manually." - ) From fffdbb4e9e1b8e7a067a9c0f7d454799d758ebb6 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 21:40:30 -0400 Subject: [PATCH 30/67] feat(corpus_importer): Simplify recap_document_into_opinions helper This commit removes the unnecessary `add_to_solr` argument from the `recap_document_into_opinions` helper --- cl/corpus_importer/management/commands/recap_into_opinions.py | 2 +- cl/corpus_importer/tasks.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cl/corpus_importer/management/commands/recap_into_opinions.py b/cl/corpus_importer/management/commands/recap_into_opinions.py index 95b6d5c5fe..9cb4054c22 100644 --- a/cl/corpus_importer/management/commands/recap_into_opinions.py +++ b/cl/corpus_importer/management/commands/recap_into_opinions.py @@ -84,7 +84,7 @@ def import_opinions_from_recap( ) throttle.maybe_wait() recap_document_into_opinions.apply_async( - args=[{}, recap_document.id, add_to_solr], queue=queue + args=[{}, recap_document.id], queue=queue ) count += 1 if total_count > 0 and count >= total_count: diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index d0760cb2e9..a00a5e4448 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -2739,7 +2739,6 @@ def recap_document_into_opinions( self, task_data: Optional[TaskData] = None, recap_document_id: Optional[int] = None, - add_to_solr: bool = False, ) -> Optional[TaskData]: """Ingest recap document into Opinions @@ -2748,7 +2747,6 @@ def recap_document_into_opinions( command. 
This task should be chained after the PDF has been downloaded from PACER :param recap_document_id: The document id to inspect and import - :param add_to_solr: Whether to add to solr :return: The same `task_data` that came as input """ From 52df1b4a45956557f2a6bbe0706e8df01c15c40d Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 21:44:12 -0400 Subject: [PATCH 31/67] feat(corpus_import): Simplify anon_2020_import command This commit removes the unused flag from the `anon_2020_import` command --- .../management/commands/anon_2020_import.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/cl/corpus_importer/management/commands/anon_2020_import.py b/cl/corpus_importer/management/commands/anon_2020_import.py index e6c3ad6ce3..9c0f98cbc8 100644 --- a/cl/corpus_importer/management/commands/anon_2020_import.py +++ b/cl/corpus_importer/management/commands/anon_2020_import.py @@ -359,7 +359,6 @@ def process_dates( def import_anon_2020_db( import_dir: str, skip_until: Optional[str], - make_searchable: Optional[bool], ) -> None: """Import data from anon 2020 DB into our system. @@ -371,7 +370,6 @@ def import_anon_2020_db( :param import_dir: Location of directory of import data. :param skip_until: ID for case we should begin processing, if any. - :param make_searchable: Should we add content to SOLR. :return: None. """ directories = iglob(f"{import_dir}/*/????-*.json") @@ -432,12 +430,6 @@ class Command(VerboseCommand): help = "Import anon 2020 DB." def add_arguments(self, parser): - parser.add_argument( - "--make-searchable", - action="store_true", - help="Add items to solr as we create opinions. 
" - "Items are not searchable unless flag is raised.", - ) parser.add_argument( "--import-dir", default="cl/assets/media/x-db/all_dir/", @@ -455,5 +447,4 @@ def add_arguments(self, parser): def handle(self, *args, **options): skip_until = options["skip_until"] import_dir = options["import_dir"] - make_searchable = options["make_searchable"] - import_anon_2020_db(import_dir, skip_until, make_searchable) + import_anon_2020_db(import_dir, skip_until) From 45a1567d50c27c9943cf4d8865678d2c740ca2bc Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 21:48:36 -0400 Subject: [PATCH 32/67] feat(corpus_importer): Simplify harvard_opinions command This commit removes the `make_searchable` argument from the `harvard_opinions` command --- .../management/commands/harvard_opinions.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/cl/corpus_importer/management/commands/harvard_opinions.py b/cl/corpus_importer/management/commands/harvard_opinions.py index 01a9454ea1..322ac787d5 100644 --- a/cl/corpus_importer/management/commands/harvard_opinions.py +++ b/cl/corpus_importer/management/commands/harvard_opinions.py @@ -228,7 +228,6 @@ class OptionsType(TypedDict): page: str court_id: Optional[str] location: Optional[str] - make_searchable: bool bankruptcy: bool @@ -292,7 +291,7 @@ def parse_harvard_opinions(options: OptionsType) -> None: If neither is provided, code will cycle through all downloaded files. 
:param options: The command line options including (reporter, - volume court_id and make_searchable) + volume and court_id) :return: None """ @@ -300,7 +299,6 @@ def parse_harvard_opinions(options: OptionsType) -> None: volumes = options["volumes"] page = options["page"] court_id = options["court_id"] - make_searchable = options["make_searchable"] is_bankruptcy = options["bankruptcy"] if not reporter and volumes: @@ -420,7 +418,6 @@ def parse_harvard_opinions(options: OptionsType) -> None: citation, court_id, file_path, - make_searchable, ) @@ -435,7 +432,6 @@ def add_new_case( citation: FullCaseCitation, court_id: Optional[str], file_path: str, - make_searchable: bool, ) -> None: """Add new case to Courtlistener.com @@ -449,7 +445,6 @@ def add_new_case( :param citation: The citation we use in logging and first citation parsed :param court_id: The CL Court ID :param file_path: The path to the Harvard JSON - :param make_searchable: Should we add this case to SOLR :return: None """ soup = BeautifulSoup(case_body, "lxml") @@ -729,12 +724,6 @@ def add_arguments(self, parser): required=False, default=None, ) - parser.add_argument( - "--make-searchable", - action="store_true", - help="Add items to solr as we create opinions. 
" - "Items are not searchable unless flag is raised.", - ) parser.add_argument( "--bankruptcy", action="store_true", From 834d487ced2d41bf038959033cc8a10aa0d0fe08 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 21:51:54 -0400 Subject: [PATCH 33/67] feat(corpus_importer): Tweaks recap_into_opinions command This commit removes the add-to-solr argument from the recap_into_opinions command --- .../management/commands/recap_into_opinions.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/cl/corpus_importer/management/commands/recap_into_opinions.py b/cl/corpus_importer/management/commands/recap_into_opinions.py index 9cb4054c22..ef3182d3c4 100644 --- a/cl/corpus_importer/management/commands/recap_into_opinions.py +++ b/cl/corpus_importer/management/commands/recap_into_opinions.py @@ -15,7 +15,6 @@ def import_opinions_from_recap( total_count: int = 0, queue: str = "batch1", db_connection: str = "default", - add_to_solr: bool = False, ) -> None: """Import recap documents into opinion db @@ -25,7 +24,6 @@ def import_opinions_from_recap( :param total_count: The number of new opinions to add :param queue: The queue to use for celery :param db_connection: The db to use - :param add_to_solr: Whether to add to solr :return: None """ court_query = Court.objects.using(db_connection) @@ -140,12 +138,6 @@ def add_arguments(self, parser): default=False, help="Use this flag to run the queries in the replica db", ) - parser.add_argument( - "--add-to-solr", - action="store_true", - default=False, - help="Use this flag to add items to solr", - ) def handle(self, *args, **options): jurisdiction = options.get("jurisdiction") @@ -153,7 +145,6 @@ def handle(self, *args, **options): skip_until = options.get("skip_until") total_count = options.get("total") queue = options.get("queue") - add_to_solr = options.get("add_to_solr") db_connection = ( "replica" if options.get("use_replica") and "replica" in settings.DATABASES @@ -167,5 +158,4 @@ def handle(self, 
*args, **options): total_count, queue, db_connection, - add_to_solr, ) From 0736a8c949bb760466c0da8a4aace16712fc84ad Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 21:54:19 -0400 Subject: [PATCH 34/67] docs(corpus_importer): Improve get_docket_and_claims docstring --- cl/corpus_importer/task_canvases.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cl/corpus_importer/task_canvases.py b/cl/corpus_importer/task_canvases.py index 58da086029..e662f1a81d 100644 --- a/cl/corpus_importer/task_canvases.py +++ b/cl/corpus_importer/task_canvases.py @@ -20,8 +20,8 @@ def get_docket_and_claims( docket_number, court, case_name, cookies_data, tags, q ): - """Get the docket report, claims history report, and save it all to the DB - and Solr + """ + Get the docket report, claims history report, and save it all to the DB """ chain( get_pacer_case_id_and_title.s( From f1930a0cb5796c5a6b99ef953dd1dda2a80bbb65 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 22:05:01 -0400 Subject: [PATCH 35/67] feat(corpus_importer): Simplifies scrape_pacer_free_opinions This commit removes the `index` argument from the `scrape_pacer_free_opinions` command. --- .../commands/scrape_pacer_free_opinions.py | 33 ++++--------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py index 4fd020e42b..11df582d9b 100644 --- a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py +++ b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py @@ -252,7 +252,6 @@ def get_pdfs( courts: list[Optional[str]], date_start: datetime.date, date_end: datetime.date, - index: bool, queue: str, ) -> None: """Get PDFs for the results of the Free Document Report queries. 
@@ -269,7 +268,6 @@ def get_pdfs( courts :param date_end: optionally an end date to query all the specified courts or all courts - :param index: true if we should index as we process the data or do it later :param queue: the queue name :return: None """ @@ -304,8 +302,6 @@ def get_pdfs( ) count = rows.count() task_name = "downloading" - if index: - task_name += " and indexing" logger.info( f"{task_name} {count} items from PACER from {date_start} to {date_end}." ) @@ -358,11 +354,10 @@ def get_pdfs( ) -def ocr_available(queue: str, index: bool) -> None: - """Do the OCR for any items that need it, then save to the solr index. +def ocr_available(queue: str) -> None: + """Do the OCR for any items that need it, then save to the ES index. :param queue: the queue name - :param index: true if we should index as we process the data or do it later """ q = cast(str, queue) rds = ( @@ -375,19 +370,12 @@ def ocr_available(queue: str, index: bool) -> None: throttle = CeleryThrottle(queue_name=q) for i, pk in enumerate(rds): throttle.maybe_wait() - if index: - extract_recap_pdf.si(pk, ocr_available=True).set( - queue=q - ).apply_async() - else: - extract_recap_pdf.si(pk, ocr_available=True).set( - queue=q - ).apply_async() + extract_recap_pdf.si(pk, ocr_available=True).set(queue=q).apply_async() if i % 1000 == 0: logger.info(f"Sent {i + 1}/{count} tasks to celery so far.") -def do_everything(courts, date_start, date_end, index, queue): +def do_everything(courts, date_start, date_end, queue): """Execute the entire process of obtaining the metadata of the free documents, downloading them and ingesting them into the system @@ -396,15 +384,14 @@ def do_everything(courts, date_start, date_end, index, queue): courts :param date_end: optionally an end date to query all the specified courts or all courts - :param index: true if we should index as we process the data or do it later :param queue: the queue name """ logger.info("Running and compiling free document reports.") 
get_and_save_free_document_reports(courts, date_start, date_end) logger.info("Getting PDFs from free document reports") - get_pdfs(courts, date_start, date_end, index, queue) - logger.info("Doing OCR and saving items to Solr.") - ocr_available(queue, index) + get_pdfs(courts, date_start, date_end, queue) + logger.info("Doing OCR and saving items.") + ocr_available(queue) class Command(VerboseCommand): @@ -465,12 +452,6 @@ def add_arguments(self, parser: argparse.ArgumentParser) -> None: default="pacerdoc1", help="The celery queue where the tasks should be processed.", ) - parser.add_argument( - "--index", - action="store_true", - default=False, - help="Do we index as we go, or leave that to be done later?", - ) parser.add_argument( "--courts", type=str, From 7193269c839db6602cf0951e19cc1601d80c5105 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 22:20:56 -0400 Subject: [PATCH 36/67] feat(citations): Simplifies citation storage and parenthetical update This commit streamlines the helper method for storing citations and updating parentheticals by removing the `index` argument. --- cl/citations/tasks.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/cl/citations/tasks.py b/cl/citations/tasks.py index 189f2a3bbf..aa2f002334 100644 --- a/cl/citations/tasks.py +++ b/cl/citations/tasks.py @@ -102,7 +102,7 @@ def find_citations_and_parantheticals_for_recap_documents( try: store_recap_citations(d) except ResponseNotReady as e: - # Threading problem in httplib, which is used in the Solr query. + # Threading problem in httplib. raise self.retry(exc=e, countdown=2) @@ -110,12 +110,10 @@ def find_citations_and_parantheticals_for_recap_documents( def find_citations_and_parentheticals_for_opinion_by_pks( self, opinion_pks: List[int], - index: bool = True, ) -> None: """Find citations and authored parentheticals for search.Opinion objects. 
:param opinion_pks: An iterable of search.Opinion PKs - :param index: Whether to add the items to Solr :return: None """ opinions: QuerySet[Opinion, Opinion] = Opinion.objects.filter( @@ -123,20 +121,19 @@ def find_citations_and_parentheticals_for_opinion_by_pks( ) for opinion in opinions: try: - store_opinion_citations_and_update_parentheticals(opinion, index) + store_opinion_citations_and_update_parentheticals(opinion) except ResponseNotReady as e: - # Threading problem in httplib, which is used in the Solr query. + # Threading problem in httplib. raise self.retry(exc=e, countdown=2) def store_opinion_citations_and_update_parentheticals( - opinion: Opinion, index: bool + opinion: Opinion, ) -> None: """ Updates counts of citations to other opinions within a given court opinion, as well as parenthetical info for the cited opinions. :param opinion: A search.Opinion object. - :param index: Whether to add the item to Solr :return: None """ From b8ad666a8f04ec1719d70a11933118234e5f31c3 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 9 Dec 2024 22:29:53 -0400 Subject: [PATCH 37/67] feat(Opinion): Remove index argument from save method --- cl/citations/tasks.py | 6 +++--- cl/corpus_importer/import_columbia/populate_opinions.py | 2 +- .../management/commands/harvard_opinions.py | 3 +-- cl/scrapers/management/commands/cl_scrape_opinions.py | 2 +- cl/scrapers/tasks.py | 8 ++++---- cl/search/models.py | 2 -- 6 files changed, 10 insertions(+), 13 deletions(-) diff --git a/cl/citations/tasks.py b/cl/citations/tasks.py index aa2f002334..dfd0d273c2 100644 --- a/cl/citations/tasks.py +++ b/cl/citations/tasks.py @@ -206,7 +206,7 @@ def store_opinion_citations_and_update_parentheticals( ) # Finally, commit these changes to the database in a single - # transcation block. Trigger a single Solr update as well, if + # transcation block. Trigger a single update as well, if # required. 
with transaction.atomic(): opinion_clusters_to_update = OpinionCluster.objects.filter( @@ -240,8 +240,8 @@ def store_opinion_citations_and_update_parentheticals( OpinionCluster.objects.get(pk=cluster_id) ) - # Save all the changes to the citing opinion (send to solr later) - opinion.save(index=False) + # Save all the changes to the citing opinion + opinion.save() # Update changes in ES. cluster_ids_to_update = list( diff --git a/cl/corpus_importer/import_columbia/populate_opinions.py b/cl/corpus_importer/import_columbia/populate_opinions.py index 2a73b401db..dfbbac11e9 100644 --- a/cl/corpus_importer/import_columbia/populate_opinions.py +++ b/cl/corpus_importer/import_columbia/populate_opinions.py @@ -452,7 +452,7 @@ def make_and_save( cluster.panel.add(member) for opinion, joined_by in opinions: opinion.cluster = cluster - opinion.save(index=False) + opinion.save() for joiner in joined_by: opinion.joined_by.add(joiner) if settings.DEBUG: diff --git a/cl/corpus_importer/management/commands/harvard_opinions.py b/cl/corpus_importer/management/commands/harvard_opinions.py index 322ac787d5..257bee0854 100644 --- a/cl/corpus_importer/management/commands/harvard_opinions.py +++ b/cl/corpus_importer/management/commands/harvard_opinions.py @@ -597,8 +597,7 @@ def add_opinions( per_curiam=per_curiam, extracted_by_ocr=True, ) - # Don't index now; do so later if desired - op.save(index=False) + op.save() new_op_pks.append(op.pk) return new_op_pks diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py index 8fe42e893a..181216d3fc 100644 --- a/cl/scrapers/management/commands/cl_scrape_opinions.py +++ b/cl/scrapers/management/commands/cl_scrape_opinions.py @@ -210,7 +210,7 @@ def save_everything( cluster.panel.add(candidate) opinion.cluster = cluster - opinion.save(index=index) + opinion.save() if not backscrape: RealTimeQueue.objects.create( item_type=SEARCH_TYPES.OPINION, item_pk=opinion.pk diff --git 
a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py index b45dcd922e..a6cea9c986 100644 --- a/cl/scrapers/tasks.py +++ b/cl/scrapers/tasks.py @@ -194,19 +194,19 @@ def extract_doc_content( ) return - # Save item, and index Solr if needed. + # Save item # noinspection PyBroadException try: opinion.cluster.docket.save() - opinion.cluster.save(index=False) + opinion.cluster.save() if not citation_jitter: # No waiting around. Save to the database now, but don't bother # with the index yet because citations are being done imminently. - opinion.save(index=False) + opinion.save() else: # Save to the index now, citations come later, commit comes # according to schedule - opinion.save(index=True) + opinion.save() except Exception: logger.error( "****Error saving text to the db for: %s****\n%s", diff --git a/cl/search/models.py b/cl/search/models.py index c84cc0c907..6c99398d95 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -3282,8 +3282,6 @@ def clean(self) -> None: def save( self, - index: bool = True, - force_commit: bool = False, *args: List, **kwargs: Dict, ) -> None: From 6f8b5faec175f8818f14d8e9ee3d43e7f09de2be Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 00:43:03 -0400 Subject: [PATCH 38/67] docs(api): Updates helper method docstring --- cl/api/tasks.py | 4 ++-- cl/api/webhooks.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cl/api/tasks.py b/cl/api/tasks.py index 39c5fe7533..fb1d0a45a3 100644 --- a/cl/api/tasks.py +++ b/cl/api/tasks.py @@ -93,7 +93,7 @@ def send_es_search_alert_webhook( """Send a search alert webhook event containing search results from a search alert object. - :param results: The search results returned by SOLR for this alert. + :param results: The search results returned for this alert. :param webhook_pk: The webhook endpoint ID object to send the event to. :param alert: The search alert object. 
""" @@ -134,7 +134,7 @@ def send_search_alert_webhook_es( """Send a search alert webhook event containing search results from a search alert object. - :param results: The search results returned by SOLR for this alert. + :param results: The search results returned for this alert. :param webhook_pk: The webhook endpoint ID object to send the event to. :param alert_pk: The search alert ID. """ diff --git a/cl/api/webhooks.py b/cl/api/webhooks.py index fd5267f60b..6c985cd5aa 100644 --- a/cl/api/webhooks.py +++ b/cl/api/webhooks.py @@ -166,7 +166,7 @@ def send_search_alert_webhook( """Send a search alert webhook event containing search results from a search alert object. - :param results: The search results returned by SOLR for this alert. + :param results: The search results returned for this alert. :param webhook: The webhook endpoint object to send the event to. :param alert: The search alert object. """ From ee956aece127f88d4736f1926bdeb611f4f6abed Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 00:43:43 -0400 Subject: [PATCH 39/67] docs(search): Tweaks helper function comments --- cl/search/constants.py | 2 +- cl/search/forms.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cl/search/constants.py b/cl/search/constants.py index 7dc0ccb03e..f9d8b610f3 100644 --- a/cl/search/constants.py +++ b/cl/search/constants.py @@ -1,4 +1,4 @@ -# Solr fields that are used for highlighting or other output in the search results +# fields that are used for highlighting or other output in the search results import re from typing import Dict diff --git a/cl/search/forms.py b/cl/search/forms.py index 07bc8b77db..b218b88510 100644 --- a/cl/search/forms.py +++ b/cl/search/forms.py @@ -634,7 +634,7 @@ def clean(self): cleaned_data["_court_count"] = len(court_bools) cleaned_data["_stat_count"] = len(stat_bools) - # 4. Strip any whitespace, otherwise it crashes Solr. + # 4. Strip any whitespace, otherwise it crashes. 
for k, v in cleaned_data.items(): if isinstance(v, str): cleaned_data[k] = v.strip() From 6d2816bde154cfe26301e164d8e55c3e1b25a85f Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 00:48:44 -0400 Subject: [PATCH 40/67] docs(citations): Updates make_name_param comment --- cl/citations/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cl/citations/utils.py b/cl/citations/utils.py index a555cf1e84..d7e699a85a 100644 --- a/cl/citations/utils.py +++ b/cl/citations/utils.py @@ -84,7 +84,7 @@ def make_name_param( if plaintiff: token_list.extend(plaintiff.split()) - # Strip out punctuation, which Solr doesn't like + # Strip out punctuation query_words = [strip_punct(t) for t in token_list] return " ".join(query_words), len(query_words) From 4b698f36e3d97122f5dc2449c33d59d325d4e2f0 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 00:49:15 -0400 Subject: [PATCH 41/67] feat(scrapers): Refines extract_doc_content method --- cl/scrapers/tasks.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py index a6cea9c986..b53fb9a419 100644 --- a/cl/scrapers/tasks.py +++ b/cl/scrapers/tasks.py @@ -199,14 +199,7 @@ def extract_doc_content( try: opinion.cluster.docket.save() opinion.cluster.save() - if not citation_jitter: - # No waiting around. Save to the database now, but don't bother - # with the index yet because citations are being done imminently. 
- opinion.save() - else: - # Save to the index now, citations come later, commit comes - # according to schedule - opinion.save() + opinion.save() except Exception: logger.error( "****Error saving text to the db for: %s****\n%s", From 9cffcbb1cb23ad2ee865a06dff612e482a982623 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 00:49:56 -0400 Subject: [PATCH 42/67] docs(recap): Refines helper methods docstring --- cl/recap/tasks.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index d4780763d4..b9461eb130 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -548,8 +548,7 @@ async def process_recap_docket(pk): // The PK of the docket that's created or updated 'docket_pk': 22, // A boolean indicating whether a new docket entry or - // recap document was created (implying a Solr needs - // updating). + // recap document was created 'content_updated': True, } @@ -849,7 +848,7 @@ async def process_recap_docket_history_report(pk): """Process the docket history report. :param pk: The primary key of the processing queue item you want to work on - :returns: A dict indicating whether the docket needs Solr re-indexing. + :returns: A dict indicating whether the docket needs re-indexing. """ start_time = now() pq = await ProcessingQueue.objects.aget(pk=pk) @@ -967,7 +966,7 @@ async def process_case_query_page(pk): """Process the case query (iquery.pl) page. :param pk: The primary key of the processing queue item you want to work on - :returns: A dict indicating whether the docket needs Solr re-indexing. + :returns: A dict indicating whether the docket needs re-indexing. 
""" pq = await ProcessingQueue.objects.aget(pk=pk) @@ -1023,7 +1022,7 @@ async def process_case_query_page(pk): d.add_recap_source() await update_docket_metadata(d, data) - # Update the docket in SOLR if the case name has changed and contains + # Update the docket if the case name has changed and contains # docket entries content_updated = False if current_case_name != d.case_name and d.pk: @@ -1103,8 +1102,7 @@ async def process_recap_appellate_docket(pk): // The PK of the docket that's created or updated 'docket_pk': 22, // A boolean indicating whether a new docket entry or - // recap document was created (implying a Solr needs - // updating). + // recap document was created 'content_updated': True, } @@ -1217,8 +1215,7 @@ async def process_recap_acms_docket(pk): // The PK of the docket that's created or updated 'docket_pk': 22, // A boolean indicating whether a new docket entry or - // recap document was created (implying a Solr needs - // updating). + // recap document was created. 'content_updated': True, } From df1a826af2d567e4bd9e88703e940248e1877fc9 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 00:51:42 -0400 Subject: [PATCH 43/67] docs(lib): Updates add_depth_counts docstring --- cl/lib/search_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cl/lib/search_utils.py b/cl/lib/search_utils.py index b0709ff50e..15bc810c9a 100644 --- a/cl/lib/search_utils.py +++ b/cl/lib/search_utils.py @@ -159,13 +159,13 @@ async def add_depth_counts( search_results: Page, ) -> OpinionCluster | None: """If the search data contains a single "cites" term (e.g., "cites:(123)"), - calculate and append the citation depth information between each Solr/ES + calculate and append the citation depth information between each ES result and the cited OpinionCluster. We only do this for *single* "cites" terms to avoid the complexity of trying to render multiple depth relationships for all the possible result-citation combinations. 
:param search_data: The cleaned search form data - :param search_results: The paginated Solr/ES results + :param search_results: The paginated ES results :return The OpinionCluster if the lookup was successful """ From d529104ab592c1321d9686fde06adc1a2afd8e8f Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 01:14:28 -0400 Subject: [PATCH 44/67] feat(OpinionCluster): Remove indexing-related arguments from save method --- cl/corpus_importer/management/commands/anon_2020_import.py | 2 +- cl/corpus_importer/management/commands/harvard_opinions.py | 2 +- cl/opinion_page/views.py | 2 +- cl/scrapers/management/commands/cl_scrape_opinions.py | 2 +- cl/search/models.py | 2 -- 5 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cl/corpus_importer/management/commands/anon_2020_import.py b/cl/corpus_importer/management/commands/anon_2020_import.py index 9c0f98cbc8..88cc05aa2a 100644 --- a/cl/corpus_importer/management/commands/anon_2020_import.py +++ b/cl/corpus_importer/management/commands/anon_2020_import.py @@ -207,7 +207,7 @@ def add_new_records( correction=data["publication_status_note"] or "", judges=judges.replace("{", "").replace("}", "") or "", ) - cluster.save(index=False) + cluster.save() for citation in found_citations: logger.info("Adding citation for: %s", citation.corrected_citation()) diff --git a/cl/corpus_importer/management/commands/harvard_opinions.py b/cl/corpus_importer/management/commands/harvard_opinions.py index 257bee0854..25ea66db37 100644 --- a/cl/corpus_importer/management/commands/harvard_opinions.py +++ b/cl/corpus_importer/management/commands/harvard_opinions.py @@ -534,7 +534,7 @@ def add_new_case( judges=judges, filepath_json_harvard=file_path, ) - cluster.save(index=False) + cluster.save() logger.info("Saving cluster for: %s", cluster.id) logger.info("Adding citation for: %s", citation.corrected_citation()) diff --git a/cl/opinion_page/views.py b/cl/opinion_page/views.py index 741cda6a03..a787a74c5f 100644 --- 
a/cl/opinion_page/views.py +++ b/cl/opinion_page/views.py @@ -1699,7 +1699,7 @@ async def block_item(request: HttpRequest) -> HttpResponse: if cluster is not None: cluster.blocked = True cluster.date_blocked = now() - await cluster.asave(index=False) + await cluster.asave() docket_pk = ( pk diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py index 181216d3fc..d26ad612af 100644 --- a/cl/scrapers/management/commands/cl_scrape_opinions.py +++ b/cl/scrapers/management/commands/cl_scrape_opinions.py @@ -185,7 +185,7 @@ def save_everything( opinion, citations = items["opinion"], items["citations"] docket.save() cluster.docket = docket - cluster.save(index=False) # Index only when the opinion is associated. + cluster.save() for citation in citations: citation.cluster_id = cluster.pk diff --git a/cl/search/models.py b/cl/search/models.py index 6c99398d95..029b9b016b 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -2891,8 +2891,6 @@ def ordered_opinions(self): def save( self, update_fields=None, - index=True, - force_commit=False, *args, **kwargs, ): From 4d7c603ee6b5d4eeb6f1d7f61dd726ee865065e3 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 01:19:03 -0400 Subject: [PATCH 45/67] feat(people_db): Removes custom actions from PersonAdmin class --- cl/people_db/admin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cl/people_db/admin.py b/cl/people_db/admin.py index 18ab755a3e..411a4cb8f9 100644 --- a/cl/people_db/admin.py +++ b/cl/people_db/admin.py @@ -131,7 +131,6 @@ class PersonAdmin(admin.ModelAdmin, AdminTweaksMixin): ) raw_id_fields = ("is_alias_of",) readonly_fields = ("has_photo",) - actions = ("update_in_solr", "delete_from_solr") @admin.register(Race) From eafd29519d650a6dd9d634895dadf92e30687420 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 01:20:38 -0400 Subject: [PATCH 46/67] refactor(recap_rss): Remove index argument from 
reprocess_item --- cl/recap_rss/models.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cl/recap_rss/models.py b/cl/recap_rss/models.py index 1fb7f9eb81..a765d5cc57 100644 --- a/cl/recap_rss/models.py +++ b/cl/recap_rss/models.py @@ -101,13 +101,11 @@ def print_file_contents(self) -> None: def reprocess_item( self, metadata_only: bool = False, - index: bool = True, ) -> None: """Reprocess the RSS feed :param metadata_only: If True, only do the metadata, not the docket entries. - :param index: Whether to save to Solr (note that none will be sent when doing medata only since no entries are modified). """ from cl.recap_rss.tasks import merge_rss_feed_contents From 0e769711c1ced9860b16cce9a5ed894f0433b173 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 01:26:05 -0400 Subject: [PATCH 47/67] refactor(search): Remove scorched import --- cl/search/tasks.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cl/search/tasks.py b/cl/search/tasks.py index da98f63003..bffc72e119 100644 --- a/cl/search/tasks.py +++ b/cl/search/tasks.py @@ -5,7 +5,6 @@ from random import randint from typing import Any, Generator -import scorched from celery import Task from celery.canvas import chain from django.apps import apps @@ -29,7 +28,6 @@ ) from elasticsearch_dsl import Document, Q, UpdateByQuery, connections from requests import Session -from scorched.exc import SolrError from cl.alerts.tasks import ( percolator_response_processing, From 4e7ad07ae6fca710a38b1aafa86e06c97847f33b Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 08:59:59 -0400 Subject: [PATCH 48/67] Refactor(lib): Rename helper function and improve docstring Renamed `solr_list` to `extract_field_values` to better reflect its purpose. Improved the docstring to provide a clearer explanation of the function's behavior, including input parameters, output, and specific handling of `datetime.date` objects. 
--- cl/lib/search_index_utils.py | 15 ++++++++++++++- cl/search/tests/tests_es_person.py | 30 +++++++++++++++++++----------- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/cl/lib/search_index_utils.py b/cl/lib/search_index_utils.py index 4e837f85d7..7d6d7feb5b 100644 --- a/cl/lib/search_index_utils.py +++ b/cl/lib/search_index_utils.py @@ -3,7 +3,20 @@ from cl.lib.date_time import midnight_pt -def solr_list(m2m_list, field): +def extract_field_values(m2m_list, field): + """Extracts values from a list of objects. + + This function iterates over a list of objects, extracts the specified field value + from each object, and returns a new list of values. + If the field value is a `datetime.date` object, it is converted to midnight Pacific Time. + + Args: + m2m_list: A list of objects. + field_name: The name of the field to extract values from. + + Returns: + A list of extracted field values + """ new_list = [] for obj in m2m_list: obj = getattr(obj, field) diff --git a/cl/search/tests/tests_es_person.py b/cl/search/tests/tests_es_person.py index eb82285286..fe728adcfc 100644 --- a/cl/search/tests/tests_es_person.py +++ b/cl/search/tests/tests_es_person.py @@ -12,7 +12,7 @@ from lxml import html from cl.lib.elasticsearch_utils import build_es_base_query, build_es_main_query -from cl.lib.search_index_utils import solr_list +from cl.lib.search_index_utils import extract_field_values from cl.lib.test_helpers import ( CourtTestCase, PeopleTestCase, @@ -464,39 +464,47 @@ def test_merge_unavailable_fields_api(self) -> None: positions = self.person_2.positions.all() self.assertEqual( Counter(r.data["results"][0]["date_nominated"]), - Counter(solr_list(positions, "date_nominated")), + Counter(extract_field_values(positions, "date_nominated")), ) self.assertEqual( Counter(r.data["results"][0]["date_elected"]), - Counter(solr_list(positions, "date_elected")), + Counter(extract_field_values(positions, "date_elected")), ) self.assertEqual( 
Counter(r.data["results"][0]["date_recess_appointment"]), - Counter(solr_list(positions, "date_recess_appointment")), + Counter( + extract_field_values(positions, "date_recess_appointment") + ), ) self.assertEqual( Counter( r.data["results"][0]["date_referred_to_judicial_committee"] ), Counter( - solr_list(positions, "date_referred_to_judicial_committee") + extract_field_values( + positions, "date_referred_to_judicial_committee" + ) ), ) self.assertEqual( Counter(r.data["results"][0]["date_judicial_committee_action"]), - Counter(solr_list(positions, "date_judicial_committee_action")), + Counter( + extract_field_values( + positions, "date_judicial_committee_action" + ) + ), ) self.assertEqual( Counter(r.data["results"][0]["date_hearing"]), - Counter(solr_list(positions, "date_hearing")), + Counter(extract_field_values(positions, "date_hearing")), ) self.assertEqual( Counter(r.data["results"][0]["date_confirmation"]), - Counter(solr_list(positions, "date_confirmation")), + Counter(extract_field_values(positions, "date_confirmation")), ) self.assertEqual( Counter(r.data["results"][0]["date_start"]), - Counter(solr_list(positions, "date_start")), + Counter(extract_field_values(positions, "date_start")), ) self.assertEqual( Counter(r.data["results"][0]["date_granularity_start"]), @@ -510,11 +518,11 @@ def test_merge_unavailable_fields_api(self) -> None: ) self.assertEqual( Counter(r.data["results"][0]["date_retirement"]), - Counter(solr_list(positions, "date_retirement")), + Counter(extract_field_values(positions, "date_retirement")), ) self.assertEqual( Counter(r.data["results"][0]["date_termination"]), - Counter(solr_list(positions, "date_termination")), + Counter(extract_field_values(positions, "date_termination")), ) self.assertEqual( Counter(r.data["results"][0]["date_granularity_termination"]), From b4ef22368f4999f1a7d4fca9d540616a0be45a4d Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 09:27:41 -0400 Subject: [PATCH 49/67] fix(audio): 
Removes index argument from postgeneration hook --- cl/audio/factories.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cl/audio/factories.py b/cl/audio/factories.py index af689715e0..a068bc8047 100644 --- a/cl/audio/factories.py +++ b/cl/audio/factories.py @@ -51,7 +51,6 @@ def _after_postgeneration(cls, instance, create, results=None): if create and results: # Some post-generation hooks ran, and may have modified the instance. instance.save( - index=False, update_fields=["local_path_mp3", "local_path_original_file"], ) From 0126936d4f3b486f912a259bd24ef375d389c794 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 09:32:49 -0400 Subject: [PATCH 50/67] refactor(scraper): Tweaks save_everything helper in scrape_opinions - Removed the `index` argument from `save_everything` - Updated usage of `save_everything` to reflect the change --- cl/corpus_importer/management/commands/import_tn.py | 3 +-- cl/opinion_page/forms.py | 3 +-- cl/scrapers/management/commands/cl_back_scrape_opinions.py | 4 ++-- cl/scrapers/management/commands/cl_scrape_opinions.py | 4 +--- cl/scrapers/management/commands/cl_scrape_oral_arguments.py | 4 +--- 5 files changed, 6 insertions(+), 12 deletions(-) diff --git a/cl/corpus_importer/management/commands/import_tn.py b/cl/corpus_importer/management/commands/import_tn.py index 59c31ed5b5..826911e3ef 100644 --- a/cl/corpus_importer/management/commands/import_tn.py +++ b/cl/corpus_importer/management/commands/import_tn.py @@ -132,8 +132,7 @@ def import_tn_corpus( "opinion": opinion, "cluster": cluster, "citations": citations, - }, - index=False, + } ) extract_doc_content.delay( diff --git a/cl/opinion_page/forms.py b/cl/opinion_page/forms.py index c4eefd75c1..5d9008b251 100644 --- a/cl/opinion_page/forms.py +++ b/cl/opinion_page/forms.py @@ -513,8 +513,7 @@ def save(self) -> OpinionCluster: "opinion": opinion, "cluster": cluster, "citations": citations, - }, - index=False, + } ) if 
self.cleaned_data.get("lower_court_docket_number"): diff --git a/cl/scrapers/management/commands/cl_back_scrape_opinions.py b/cl/scrapers/management/commands/cl_back_scrape_opinions.py index e077552da9..c30ed3cbeb 100644 --- a/cl/scrapers/management/commands/cl_back_scrape_opinions.py +++ b/cl/scrapers/management/commands/cl_back_scrape_opinions.py @@ -85,5 +85,5 @@ def parse_and_scrape_site( ) time.sleep(wait) - def save_everything(self, items, index=False, backscrape=True): - super().save_everything(items, index, backscrape) + def save_everything(self, items, backscrape=True): + super().save_everything(items, backscrape) diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py index d26ad612af..9200537ec7 100644 --- a/cl/scrapers/management/commands/cl_scrape_opinions.py +++ b/cl/scrapers/management/commands/cl_scrape_opinions.py @@ -177,7 +177,6 @@ def make_objects( @transaction.atomic def save_everything( items: Dict[str, Any], - index: bool = False, backscrape: bool = False, ) -> None: """Saves all the sub items and associates them as appropriate.""" @@ -368,8 +367,7 @@ def ingest_a_case( "opinion": opinion, "cluster": cluster, "citations": citations, - }, - index=False, + } ) extract_doc_content.delay( opinion.pk, diff --git a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py index 62377a98ec..c2f59250e9 100644 --- a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py +++ b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py @@ -30,13 +30,12 @@ @transaction.atomic def save_everything( items: Dict[str, Union[Docket, Audio]], - index: bool = False, backscrape: bool = False, ) -> None: docket, af = items["docket"], items["audio_file"] docket.save() af.docket = docket - af.save(index=index) + af.save() candidate_judges = [] if af.docket.court_id != "scotus": if af.judges: @@ -143,7 +142,6 @@ def ingest_a_case( 
save_everything( items={"docket": docket, "audio_file": audio_file}, - index=False, backscrape=backscrape, ) process_audio_file.delay(audio_file.pk) From ec065ee537eba23358cb3a7c6a37c63f03e5d8bd Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 09:35:03 -0400 Subject: [PATCH 51/67] feat(OpinionCluster): Removes index argument from async save --- cl/corpus_importer/import_columbia/populate_opinions.py | 2 +- cl/search/models.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/cl/corpus_importer/import_columbia/populate_opinions.py b/cl/corpus_importer/import_columbia/populate_opinions.py index dfbbac11e9..86721b3e77 100644 --- a/cl/corpus_importer/import_columbia/populate_opinions.py +++ b/cl/corpus_importer/import_columbia/populate_opinions.py @@ -444,7 +444,7 @@ def make_and_save( try: docket.save() cluster.docket = docket - cluster.save(index=False) + cluster.save() for citation in found_citations: citation.cluster = cluster citation.save() diff --git a/cl/search/models.py b/cl/search/models.py index 029b9b016b..11e01e8a1c 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -2902,15 +2902,11 @@ def save( async def asave( self, update_fields=None, - index=True, - force_commit=False, *args, **kwargs, ): return await sync_to_async(self.save)( update_fields=update_fields, - index=index, - force_commit=force_commit, *args, **kwargs, ) From ebcbf6c9cba7337d22c4afcc85acb610010afe82 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 09:59:12 -0400 Subject: [PATCH 52/67] refactor(scraper): Remove index argument from save method --- cl/scrapers/tasks.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py index b53fb9a419..d94ca48713 100644 --- a/cl/scrapers/tasks.py +++ b/cl/scrapers/tasks.py @@ -320,9 +320,7 @@ async def extract_recap_pdf_base( rd.ocr_status = RECAPDocument.OCR_NEEDED rd.plain_text, _ = anonymize(content) - # Do not do indexing here. 
Creates race condition in celery. await rd.asave( - index=False, do_extraction=False, update_fields=["ocr_status", "plain_text"], ) From d569ed8ac5e2f2843a6ec7bcf7e4de12f213da0c Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 10:00:05 -0400 Subject: [PATCH 53/67] docs(alerts): Improve docstring for remove_stale_rt_items --- cl/alerts/management/commands/cl_send_alerts.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cl/alerts/management/commands/cl_send_alerts.py b/cl/alerts/management/commands/cl_send_alerts.py index 9a69b70cba..720bb638b3 100644 --- a/cl/alerts/management/commands/cl_send_alerts.py +++ b/cl/alerts/management/commands/cl_send_alerts.py @@ -295,8 +295,6 @@ def clean_rt_queue(self): def remove_stale_rt_items(self, age=2): """Remove anything old from the RTQ. - This helps avoid issues with solr hitting the maxboolean clause errors. - :param age: How many days old should items be before we start deleting them? """ From 09a18de20fd8d1a8e5423b531049ffd98103335b Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 10:08:45 -0400 Subject: [PATCH 54/67] fix(citations): Adjust find_citations command to match signature change --- cl/citations/management/commands/find_citations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cl/citations/management/commands/find_citations.py b/cl/citations/management/commands/find_citations.py index ec23e3e913..37bb191b58 100644 --- a/cl/citations/management/commands/find_citations.py +++ b/cl/citations/management/commands/find_citations.py @@ -157,7 +157,7 @@ def update_documents(self, opinion_pks: Iterable, queue_name: str) -> None: chunk.append(opinion_pk) if processed_count % chunk_size == 0 or last_item: find_citations_and_parentheticals_for_opinion_by_pks.apply_async( - args=(chunk, True), + args=(chunk,), queue=queue_name, ) chunk = [] From 2838d78b52715bb9e8480693d647b448b1971ba7 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 
10:11:13 -0400 Subject: [PATCH 55/67] tests(search): Remove test class for update index command --- cl/search/tests/tests.py | 179 +-------------------------------------- 1 file changed, 3 insertions(+), 176 deletions(-) diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index 6fc929a671..8eb99b38f2 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -33,13 +33,7 @@ from cl.lib.elasticsearch_utils import simplify_estimated_count from cl.lib.redis_utils import get_redis_interface from cl.lib.storage import clobbering_get_name -from cl.lib.test_helpers import ( - AudioTestCase, - CourtTestCase, - EmptySolrTestCase, - PeopleTestCase, - SolrTestCase, -) +from cl.lib.test_helpers import AudioTestCase, CourtTestCase, PeopleTestCase from cl.lib.utils import ( cleanup_main_query, get_child_court_ids_for_parents, @@ -92,11 +86,7 @@ SearchQuery, sort_cites, ) -from cl.search.tasks import ( - add_docket_to_solr_by_rds, - get_es_doc_id_and_parent_id, - index_dockets_in_bulk, -) +from cl.search.tasks import get_es_doc_id_and_parent_id, index_dockets_in_bulk from cl.search.types import EventTable from cl.tests.base import SELENIUM_TIMEOUT, BaseSeleniumTest from cl.tests.cases import ESIndexTestCase, TestCase @@ -104,110 +94,6 @@ from cl.users.factories import UserProfileWithParentsFactory -class UpdateIndexCommandTest(SolrTestCase): - args = [ - "--type", - "search.Opinion", - "--noinput", - ] - - def _get_result_count(self, results): - return results.result.numFound - - def test_updating_all_opinions(self) -> None: - """If we have items in the DB, can we add/delete them to/from Solr? - - This tests is rather long because we need to test adding and deleting, - and it's hard to setup/dismantle the indexes before/after every test. - """ - - # First, we add everything to Solr. - args = list(self.args) # Make a copy of the list. 
- args.extend( - [ - "--solr-url", - f"{settings.SOLR_HOST}/solr/{self.core_name_opinion}", - "--update", - "--everything", - "--do-commit", - ] - ) - call_command("cl_update_index", *args) - results = self.si_opinion.query("*").execute() - actual_count = self._get_result_count(results) - self.assertEqual( - actual_count, - self.expected_num_results_opinion, - msg="Did not get expected number of results.\n" - "\tGot:\t%s\n\tExpected:\t %s" - % ( - actual_count, - self.expected_num_results_opinion, - ), - ) - - # Check a simple citation query - results = self.si_opinion.query(cites=self.opinion_3.pk).execute() - actual_count = self._get_result_count(results) - expected_citation_count = 2 - self.assertEqual( - actual_count, - expected_citation_count, - msg="Did not get the expected number of citation counts.\n" - "\tGot:\t %s\n\tExpected:\t%s" - % (actual_count, expected_citation_count), - ) - - # Next, we delete everything from Solr - args = list(self.args) # Make a copy of the list. - args.extend( - [ - "--solr-url", - f"{settings.SOLR_HOST}/solr/{self.core_name_opinion}", - "--delete", - "--everything", - "--do-commit", - ] - ) - call_command("cl_update_index", *args) - results = self.si_opinion.query("*").execute() - actual_count = self._get_result_count(results) - expected_citation_count = 0 - self.assertEqual( - actual_count, - expected_citation_count, - msg="Did not get the expected number of counts in empty index.\n" - "\tGot:\t %s\n\tExpected:\t%s" - % (actual_count, expected_citation_count), - ) - - # Add things back, but do it by ID - args = list(self.args) # Make a copy of the list. 
- args.extend( - [ - "--solr-url", - f"{settings.SOLR_HOST}/solr/{self.core_name_opinion}", - "--update", - "--items", - f"{self.opinion_1.pk}", - f"{self.opinion_2.pk}", - f"{self.opinion_3.pk}", - "--do-commit", - ] - ) - call_command("cl_update_index", *args) - results = self.si_opinion.query("*").execute() - actual_count = self._get_result_count(results) - expected_citation_count = 3 - self.assertEqual( - actual_count, - expected_citation_count, - msg="Did not get the expected number of citation counts.\n" - "\tGot:\t %s\n\tExpected:\t%s" - % (actual_count, expected_citation_count), - ) - - class ModelTest(TestCase): fixtures = ["test_court.json"] @@ -250,7 +136,7 @@ def test_save_old_opinion(self, mock) -> None: cf = ContentFile(io.BytesIO(b"blah").read()) self.o.file_with_date = date(1899, 1, 1) self.o.local_path.save("file_name.pdf", cf, save=False) - self.o.save(index=False) + self.o.save() except ValueError: raise ValueError( "Unable to save a case older than 1900. Did you " @@ -500,65 +386,6 @@ def test_main_document_without_attachment_number(self): self.assertIsNotNone(document.id) -class IndexingTest(EmptySolrTestCase): - """Are things indexed properly?""" - - fixtures = ["test_court.json"] - - def test_issue_729_url_coalescing(self) -> None: - """Are URL's coalesced properly?""" - # Save a docket to the backend using coalescing - - test_dir = ( - Path(settings.INSTALL_ROOT) - / "cl" - / "assets" - / "media" - / "test" - / "search" - ) - self.att_filename = "fake_document.html" - fake_path = os.path.join(test_dir, self.att_filename) - - d = Docket.objects.create( - source=Docket.RECAP, - docket_number="asdf", - pacer_case_id="asdf", - court_id="test", - ) - de = DocketEntry.objects.create(docket=d, entry_number=1) - rd1 = RECAPDocument.objects.create( - docket_entry=de, - document_type=RECAPDocument.PACER_DOCUMENT, - document_number="1", - pacer_doc_id="1", - filepath_local=fake_path, - ) - rd2 = RECAPDocument.objects.create( - docket_entry=de, - 
document_type=RECAPDocument.ATTACHMENT, - document_number="1", - attachment_number=1, - pacer_doc_id="2", - filepath_local=fake_path, - ) - # Do the absolute URLs differ when pulled from the DB? - self.assertNotEqual(rd1.get_absolute_url(), rd2.get_absolute_url()) - - add_docket_to_solr_by_rds([rd1.pk, rd2.pk], force_commit=True) - - # Do the absolute URLs differ when pulled from Solr? - r1 = self.si_recap.get(rd1.pk) - r2 = self.si_recap.get(rd2.pk) - self.assertNotEqual( - r1.result.docs[0]["absolute_url"], - r2.result.docs[0]["absolute_url"], - ) - Docket.objects.all().delete() - DocketEntry.objects.all().delete() - RECAPDocument.objects.all().delete() - - class ESCommonSearchTest(ESIndexTestCase, TestCase): @classmethod def setUpTestData(cls): From 9a365dd29872db527aef12853c13e9f53521ae79 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 10:12:33 -0400 Subject: [PATCH 56/67] test(citations): Remove test class for parallel citation logic --- cl/citations/tests.py | 86 +------------------------------------------ 1 file changed, 1 insertion(+), 85 deletions(-) diff --git a/cl/citations/tests.py b/cl/citations/tests.py index df8387d972..313692b880 100644 --- a/cl/citations/tests.py +++ b/cl/citations/tests.py @@ -40,10 +40,6 @@ get_parenthetical_tokens, get_representative_parenthetical, ) -from cl.citations.management.commands.add_parallel_citations import ( - identify_parallel_citations, - make_edge_list, -) from cl.citations.match_citations import ( NO_MATCH_RESOURCE, do_resolve_citations, @@ -54,12 +50,7 @@ find_citations_and_parentheticals_for_opinion_by_pks, store_recap_citations, ) -from cl.lib.test_helpers import ( - CourtTestCase, - IndexedSolrTestCase, - PeopleTestCase, - SearchTestCase, -) +from cl.lib.test_helpers import CourtTestCase, PeopleTestCase, SearchTestCase from cl.search.factories import ( CitationWithParentsFactory, CourtFactory, @@ -1046,8 +1037,6 @@ def test_index_by_doc_id(self) -> None: args = [ "--doc-id", 
f"{self.opinion_id2}", - "--index", - "concurrently", ] self.call_command_and_test_it(args) @@ -1056,8 +1045,6 @@ def test_index_by_doc_ids(self) -> None: "--doc-id", f"{self.opinion_id3}", f"{self.opinion_id2}", - "--index", - "concurrently", ] self.call_command_and_test_it(args) @@ -1065,8 +1052,6 @@ def test_index_by_start_only(self) -> None: args = [ "--start-id", f"{min(self.opinion_id2, self.opinion_id3)}", - "--index", - "concurrently", ] self.call_command_and_test_it(args) @@ -1076,8 +1061,6 @@ def test_index_by_start_and_end(self) -> None: f"{min(self.opinion_id2, self.opinion_id3)}", "--end-id", f"{max(self.opinion_id2, self.opinion_id3)}", - "--index", - "concurrently", ] self.call_command_and_test_it(args) @@ -1085,77 +1068,10 @@ def test_filed_after(self) -> None: args = [ "--filed-after", f"{OpinionCluster.objects.get(pk=self.citation2.cluster_id).date_filed - timedelta(days=1)}", - "--index", - "concurrently", ] self.call_command_and_test_it(args) -class ParallelCitationTest(SimpleTestCase): - databases = "__all__" - - def test_identifying_parallel_citations(self) -> None: - """Given a string, can we identify parallel citations""" - tests = ( - # A pair consisting of a test string and the number of parallel - # citations that should be identifiable in that string. - # Simple case - ("1 U.S. 1 (22 U.S. 33)", 1, 2), - # Too far apart - ("1 U.S. 1 too many words 22 U.S. 33", 0, 0), - # Three citations - # ("1 U.S. 1, (44 U.S. 33, 99 U.S. 100)", 1, 3), - # Parallel citation after a valid citation too early on - ("1 U.S. 1 too many words, then 22 U.S. 
33, 13 WL 33223", 1, 2), - ) - for q, citation_group_count, expected_num_parallel_citations in tests: - with self.subTest( - f"Testing parallel citation identification for: {q}...", - q=q, - citation_group_count=citation_group_count, - expected_num_parallel_citations=expected_num_parallel_citations, - ): - citations = get_citations(q, tokenizer=HYPERSCAN_TOKENIZER) - citation_groups = identify_parallel_citations(citations) - computed_num_citation_groups = len(citation_groups) - self.assertEqual( - computed_num_citation_groups, - citation_group_count, - msg="Did not have correct number of citation groups. Got %s, " - "not %s." - % (computed_num_citation_groups, citation_group_count), - ) - if not citation_groups: - # Add an empty list to make testing easier. - citation_groups = [[]] - computed_num_parallel_citation = len(list(citation_groups)[0]) - self.assertEqual( - computed_num_parallel_citation, - expected_num_parallel_citations, - msg="Did not identify correct number of parallel citations in " - "the group. 
Got %s, not %s" - % ( - computed_num_parallel_citation, - expected_num_parallel_citations, - ), - ) - - def test_making_edge_list(self) -> None: - """Can we make network-friendly edge lists?""" - tests = [ - ([1, 2], [(1, 2)]), - ([1, 2, 3], [(1, 2), (2, 3)]), - ([1, 2, 3, 4], [(1, 2), (2, 3), (3, 4)]), - ] - for q, a in tests: - with self.subTest( - f"Testing network-friendly edge creation for: {q}...", - q=q, - a=a, - ): - self.assertEqual(make_edge_list(q), a) - - class FilterParentheticalTest(SimpleTestCase): def test_is_not_descriptive(self): fixtures = [ From 293727a2e49ed8246c1f85d64b2b1bec998798a6 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 10:13:44 -0400 Subject: [PATCH 57/67] test(lib): Remove base test classes for SOLR --- cl/lib/test_helpers.py | 102 ----------------------------------------- 1 file changed, 102 deletions(-) diff --git a/cl/lib/test_helpers.py b/cl/lib/test_helpers.py index c795f99d9e..04068a4601 100644 --- a/cl/lib/test_helpers.py +++ b/cl/lib/test_helpers.py @@ -3,15 +3,11 @@ from functools import wraps from typing import Sized, cast -import scorched -from django.conf import settings from django.contrib.auth.hashers import make_password from django.core.files.uploadedfile import SimpleUploadedFile from django.test.testcases import SerializeMixin -from django.test.utils import override_settings from django.utils import timezone from lxml import etree -from requests import Session from cl.audio.factories import AudioFactory from cl.audio.models import Audio @@ -44,13 +40,10 @@ ) from cl.search.models import ( Citation, - Court, Docket, - Opinion, OpinionsCitedByRECAPDocument, RECAPDocument, ) -from cl.search.tasks import add_items_to_solr from cl.tests.cases import SimpleTestCase, TestCase from cl.users.factories import UserProfileWithParentsFactory @@ -1289,101 +1282,6 @@ def setUpTestData(cls) -> None: super().setUpTestData() # type: ignore -@override_settings( - 
SOLR_OPINION_URL=settings.SOLR_OPINION_TEST_URL, - SOLR_AUDIO_URL=settings.SOLR_AUDIO_TEST_URL, - SOLR_PEOPLE_URL=settings.SOLR_PEOPLE_TEST_URL, - SOLR_RECAP_URL=settings.SOLR_RECAP_TEST_URL, - SOLR_URLS=settings.SOLR_TEST_URLS, - ELASTICSEARCH_DISABLED=True, -) -class EmptySolrTestCase(SerializeLockFileTestMixin, TestCase): - """Sets up an empty Solr index for tests that need to set up data manually. - - Other Solr test classes subclass this one, adding additional content or - features. - """ - - def setUp(self) -> None: - # Set up testing cores in Solr and swap them in - self.core_name_opinion = settings.SOLR_OPINION_TEST_CORE_NAME - self.core_name_audio = settings.SOLR_AUDIO_TEST_CORE_NAME - self.core_name_people = settings.SOLR_PEOPLE_TEST_CORE_NAME - self.core_name_recap = settings.SOLR_RECAP_TEST_CORE_NAME - - self.session = Session() - - self.si_opinion = scorched.SolrInterface( - settings.SOLR_OPINION_URL, http_connection=self.session, mode="rw" - ) - self.si_audio = scorched.SolrInterface( - settings.SOLR_AUDIO_URL, http_connection=self.session, mode="rw" - ) - self.si_people = scorched.SolrInterface( - settings.SOLR_PEOPLE_URL, http_connection=self.session, mode="rw" - ) - self.si_recap = scorched.SolrInterface( - settings.SOLR_RECAP_URL, http_connection=self.session, mode="rw" - ) - self.all_sis = [ - self.si_opinion, - self.si_audio, - self.si_people, - self.si_recap, - ] - - def tearDown(self) -> None: - try: - for si in self.all_sis: - si.delete_all() - si.commit() - finally: - self.session.close() - - -class SolrTestCase( - CourtTestCase, - PeopleTestCase, - SearchTestCase, - SimpleUserDataMixin, - EmptySolrTestCase, -): - """A standard Solr test case with content included in the database, but not - yet indexed into the database. 
- """ - - @classmethod - def setUpTestData(cls): - super().setUpTestData() - - def setUp(self) -> None: - # Set up some handy variables - super().setUp() - - self.court = Court.objects.get(pk="test") - self.expected_num_results_opinion = 6 - self.expected_num_results_audio = 2 - - -class IndexedSolrTestCase(SolrTestCase): - """Similar to the SolrTestCase, but the data is indexed in Solr""" - - def setUp(self) -> None: - super().setUp() - obj_types = { - "audio.Audio": Audio, - "search.Opinion": Opinion, - "people_db.Person": Person, - } - for obj_name, obj_type in obj_types.items(): - if obj_name == "people_db.Person": - items = obj_type.objects.filter(is_alias_of=None) - ids = [item.pk for item in items if item.is_judge] - else: - ids = obj_type.objects.all().values_list("pk", flat=True) - add_items_to_solr(ids, obj_name, force_commit=True) - - class SitemapTest(TestCase): sitemap_url: str expected_item_count: int From 88fd880665c8506c13b6a655ab6e638d805ad2ed Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 10:14:52 -0400 Subject: [PATCH 58/67] tests(recap): Remove old SOLR mocks --- cl/recap/tests.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/cl/recap/tests.py b/cl/recap/tests.py index fb3ddce827..0865e9b212 100644 --- a/cl/recap/tests.py +++ b/cl/recap/tests.py @@ -1385,8 +1385,7 @@ def test_debug_does_not_create_docket(self, add_atty_mock): self.assertEqual(DocketEntry.objects.count(), 0) self.assertEqual(RECAPDocument.objects.count(), 0) - @mock.patch("cl.recap.tasks.add_items_to_solr") - def test_debug_does_not_create_recap_documents(self, mock): + def test_debug_does_not_create_recap_documents(self): """If debug is passed, do we avoid creating recap documents?""" d = Docket.objects.create( source=0, court_id="scotus", pacer_case_id="asdf" @@ -1409,7 +1408,6 @@ def test_debug_does_not_create_recap_documents(self, mock): self.assertEqual(Docket.objects.count(), 1) 
self.assertEqual(DocketEntry.objects.count(), 1) self.assertEqual(RECAPDocument.objects.count(), 1) - mock.assert_not_called() class RecapPdfTaskTest(TestCase): @@ -2865,7 +2863,6 @@ def test_clean_up_docket_judge_fields_command( self.assertEqual(d_a.assigned_to_id, self.judge.pk) -@mock.patch("cl.recap.tasks.add_items_to_solr") class RecapDocketAttachmentTaskTest(TestCase): @classmethod def setUpTestData(cls): @@ -2893,7 +2890,7 @@ def tearDown(self) -> None: Docket.objects.all().delete() RECAPDocument.objects.all().delete() - def test_attachments_get_created(self, mock): + def test_attachments_get_created(self): """Do attachments get created if we have a RECAPDocument to match on?""" async_to_sync(process_recap_docket)(self.pq.pk) @@ -2913,7 +2910,6 @@ def test_attachments_get_created(self, mock): ) def test_main_document_doesnt_match_attachment_zero_on_creation( self, - mock_solr, mock_webhook_post, ): """Confirm that attachment 0 is properly set as the Main document if @@ -2973,7 +2969,6 @@ def test_main_document_doesnt_match_attachment_zero_on_creation( ) def test_main_document_doesnt_match_attachment_zero_existing( self, - mock_solr, mock_webhook_post, ): """Confirm that attachment 0 is properly set as the Main document if @@ -3058,7 +3053,6 @@ def test_main_document_doesnt_match_attachment_zero_existing( ) def test_main_rd_lookup_fallback_for_attachment_merging( self, - mock_solr, mock_webhook_post, ): """Confirm that attachment data can be properly merged when the current @@ -3285,7 +3279,6 @@ def test_criminal_data_gets_created(self) -> None: ) -@mock.patch("cl.recap.tasks.add_items_to_solr") class RecapAttachmentPageTaskTest(TestCase): def setUp(self) -> None: user = User.objects.get(username="recap") @@ -3319,7 +3312,7 @@ def tearDown(self) -> None: document_type=RECAPDocument.ATTACHMENT, ).delete() - def test_attachments_get_created(self, mock): + def test_attachments_get_created(self): """Do attachments get created if we have a RECAPDocument to match 
on?""" async_to_sync(process_recap_attachment)(self.pq.pk) @@ -3336,7 +3329,7 @@ def test_attachments_get_created(self, mock): self.assertEqual(self.pq.docket_id, self.d.pk) self.assertEqual(self.pq.docket_entry_id, self.de.pk) - def test_no_rd_match(self, mock): + def test_no_rd_match(self): """If there's no RECAPDocument to match on, do we fail gracefully?""" RECAPDocument.objects.all().delete() pq_status, msg, items = async_to_sync(process_recap_attachment)( From 05d233d77dbe0508e917a2fe4bb21e09055b9d2e Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 10:16:06 -0400 Subject: [PATCH 59/67] docs(alerts): Tweaks comment in send alert test --- cl/alerts/tests/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cl/alerts/tests/tests.py b/cl/alerts/tests/tests.py index 5d2f1acb5f..57817100a8 100644 --- a/cl/alerts/tests/tests.py +++ b/cl/alerts/tests/tests.py @@ -1164,7 +1164,7 @@ def test_send_search_alert_webhooks_rates(self): ): # Monthly alerts cannot be run on the 29th, 30th or 31st. 
with time_machine.travel(self.mock_date, tick=False): - # Send Solr Alerts (Except OA) + # Send Alerts (Except OA) call_command("cl_send_alerts", rate=rate) # Send ES Alerts (Only OA for now) call_command("cl_send_scheduled_alerts", rate=rate) From 1bd09b4e83c9f1bf2c5cdc9473fa741a3a732c4a Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 11:55:48 -0400 Subject: [PATCH 60/67] feat(corpus_importer): Removes import_columbia command --- .../import_columbia/populate_opinions.py | 588 ------------------ .../management/commands/import_columbia.py | 295 --------- 2 files changed, 883 deletions(-) delete mode 100644 cl/corpus_importer/import_columbia/populate_opinions.py delete mode 100644 cl/corpus_importer/management/commands/import_columbia.py diff --git a/cl/corpus_importer/import_columbia/populate_opinions.py b/cl/corpus_importer/import_columbia/populate_opinions.py deleted file mode 100644 index 86721b3e77..0000000000 --- a/cl/corpus_importer/import_columbia/populate_opinions.py +++ /dev/null @@ -1,588 +0,0 @@ -import calendar -import re -import string -from collections import OrderedDict -from datetime import date - -from asgiref.sync import async_to_sync -from django.conf import settings -from eyecite.find import get_citations -from eyecite.tokenizers import HyperscanTokenizer -from eyecite.utils import clean_text - -from cl.lib.scorched_utils import ExtraSolrInterface -from cl.lib.solr_core_admin import get_term_frequency -from cl.search.models import SOURCES, Docket, Opinion, OpinionCluster - -from ...people_db.lookup_utils import ( - lookup_judge_by_last_name, - lookup_judges_by_last_name_list, -) -from .convert_columbia_html import convert_columbia_html - -HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan") - -# only make a solr connection once -SOLR_CONN = ExtraSolrInterface(settings.SOLR_OPINION_URL, mode="r") - - -# used to identify dates -# the order of these dates matters, as if there are multiple matches in an -# opinion 
for one type of date tag, the date associated to the --last-- matched -# tag will be the ones used for that type of date -FILED_TAGS = [ - "filed", - "opinion filed", - "date", - "order filed", - "delivered and filed", - "letter filed", - "dated", - "release date", - "filing date", - "filed date", - "date submitted", - "as of", - "opinions filed", - "filed on", - "decision filed", -] -DECIDED_TAGS = ["decided", "date decided", "decided on", "decided date"] -ARGUED_TAGS = [ - "argued", - "submitted", - "submitted on briefs", - "on briefs", - "heard", - "considered on briefs", - "argued and submitted", - "opinion", - "opinions delivered", - "opinion delivered", - "assigned on briefs", - "opinion issued", - "delivered", - "rendered", - "considered on briefs on", - "opinion delivered and filed", - "orally argued", - "rendered on", - "oral argument", - "submitted on record and briefs", -] -REARGUE_DENIED_TAGS = [ - "reargument denied", - "rehearing denied", - "further rehearing denied", - "as modified on denial of rehearing", - "order denying rehearing", - "petition for rehearing filed", - "motion for rehearing filed", - "rehearing denied to bar commission", - "reconsideration denied", - "denied", - "review denied", - "motion for rehearing and/or transfer to supreme court denied", - "motion for reargument denied", - "petition and crosspetition for review denied", - "opinion modified and as modified rehearing denied", - "motion for rehearing andor transfer to supreme court denied", - "petition for rehearing denied", - "leave to appeal denied", - "rehearings denied", - "motion for rehearing denied", - "second rehearing denied", - "petition for review denied", - "appeal dismissed", - "rehearing en banc denied", - "rehearing and rehearing en banc denied", - "order denying petition for rehearing", - "all petitions for review denied", - "petition for allowance of appeal denied", - "opinion modified and rehearing denied", - "as amended on denial of rehearing", - "reh denied", 
-] -REARGUE_TAGS = ["reargued", "reheard", "upon rehearing", "on rehearing"] -CERT_GRANTED_TAGS = [ - "certiorari granted", - "petition and crosspetition for writ of certiorari granted", -] -CERT_DENIED_TAGS = [ - "certiorari denied", - "certiorari quashed", - "certiorari denied by supreme court", - "petition for certiorari denied by supreme court", -] -UNKNOWN_TAGS = [ - "petition for review allowed", - "affirmed", - "reversed and remanded", - "rehearing overruled", - "review granted", - "decision released", - "transfer denied", - "released for publication", - "application to transfer denied", - "amended", - "reversed", - "opinion on petition to rehear", - "suggestion of error overruled", - "cv", - "case stored in record room", - "met to file petition for review disposed granted", - "rehearing granted", - "opinion released", - "permission to appeal denied by supreme court", - "rehearing pending", - "application for transfer denied", - "effective date", - "modified", - "opinion modified", - "transfer granted", - "discretionary review denied", - "application for leave to file second petition for rehearing denied", - "final", - "date of judgment entry on appeal", - "petition for review pending", - "writ denied", - "rehearing filed", - "as extended", - "officially released", - "appendix filed", - "spring sessions", - "summer sessions", - "fall sessions", - "winter sessions", - "discretionary review denied by supreme court", - "dissenting opinion", - "en banc reconsideration denied", - "answer returned", - "refiled", - "revised", - "modified upon denial of rehearing", - "session mailed", - "reversed and remanded with instructions", - "writ granted", - "date of judgment entry", - "preliminary ruling rendered", - "amended on", - "dissenting opinion filed", - "concurring opinion filed", - "memorandum dated", - "mandamus denied on mandate", - "updated", - "date of judgment entered", - "released and journalized", - "submitted on", - "case assigned", - "opinion circulated 
for comment", - "submitted on rehearing", - "united states supreme court dismissed appeal", - "answered", - "reconsideration granted in part and as amended", - "as amended on denial of rehearing", - "reassigned", - "as amended", - "as corrected", - "writ allowed", - "released", - "application for leave to appeal filed", - "affirmed on appeal reversed and remanded", - "as corrected", - "withdrawn substituted and refiled", - "answered", - "released", - "as modified and ordered published", - "remanded", - "concurring opinion added", - "decision and journal entry dated", - "memorandum filed", - "as modified", -] - -# used to check if a docket number appears in what should be a citation string -# the order matters, as these are stripped from a docket string in order -DOCKET_JUNK = [ - "c.a. no. kc", - "c.a. no. pm", - "c.a. no.", - "i.c. no.", - "case no.", - "no.", -] - -# known abbreviations that indicate if a citation isn't actually a citation -BAD_CITES = ["Iowa App.", "R.I.Super.", "Ma.Super.", "Minn.App.", "NCIC"] - -# used to figure out if a "citation text" is really a citation -TRIVIAL_CITE_WORDS = ( - [n.lower() for n in calendar.month_name] - + [n.lower()[:3] for n in calendar.month_name] - + ["no"] -) - -# used to map the parsed opinion types to their tags in the populated opinion -# objects -OPINION_TYPE_MAPPING = { - "opinion": Opinion.LEAD, - "dissent": Opinion.DISSENT, - "concurrence": Opinion.CONCUR_IN_PART, -} - - -def make_and_save( - item, skipdupes=False, min_dates=None, start_dates=None, testing=True -): - """Associates case data from `parse_opinions` with objects. Saves these - objects. 
- - min_date: if not none, will skip cases after min_date - """ - date_filed = date_argued = date_reargued = date_reargument_denied = ( - date_cert_granted - ) = date_cert_denied = None - unknown_date = None - for date_cluster in item["dates"]: - for date_info in date_cluster: - # check for any dates that clearly aren't dates - if date_info[1].year < 1600 or date_info[1].year > 2020: - continue - # check for untagged dates that will be assigned to date_filed - if date_info[0] is None: - date_filed = date_info[1] - continue - # try to figure out what type of date it is based on its tag string - if date_info[0] in FILED_TAGS: - date_filed = date_info[1] - elif date_info[0] in DECIDED_TAGS: - if not date_filed: - date_filed = date_info[1] - elif date_info[0] in ARGUED_TAGS: - date_argued = date_info[1] - elif date_info[0] in REARGUE_TAGS: - date_reargued = date_info[1] - elif date_info[0] in REARGUE_DENIED_TAGS: - date_reargument_denied = date_info[1] - elif date_info[0] in CERT_GRANTED_TAGS: - date_cert_granted = date_info[1] - elif date_info[0] in CERT_DENIED_TAGS: - date_cert_denied = date_info[1] - else: - unknown_date = date_info[1] - if date_info[0] not in UNKNOWN_TAGS: - print( - "\nFound unknown date tag '%s' with date '%s'.\n" - % date_info - ) - - # the main date (used for date_filed in OpinionCluster) and panel dates - # (used for finding judges) are ordered in terms of which type of dates - # best reflect them - main_date = ( - date_filed - or date_argued - or date_reargued - or date_reargument_denied - or unknown_date - ) - panel_date = ( - date_argued - or date_reargued - or date_reargument_denied - or date_filed - or unknown_date - ) - - if main_date is None: - raise Exception(f"Failed to get a date for {item['file']}") - - # special rule for Kentucky - if item["court_id"] == "kycourtapp" and main_date <= date(1975, 12, 31): - item["court_id"] = "kycourtapphigh" - - if min_dates is not None: - if min_dates.get(item["court_id"]) is not None: - if 
main_date >= min_dates[item["court_id"]]: - print( - main_date, - "after", - min_dates[item["court_id"]], - " -- skipping.", - ) - return - if start_dates is not None: - if start_dates.get(item["court_id"]) is not None: - if main_date <= start_dates[item["court_id"]]: - print( - main_date, - "before court founding:", - start_dates[item["court_id"]], - " -- skipping.", - ) - return - - docket = Docket( - source=Docket.COLUMBIA, - date_argued=date_argued, - date_reargued=date_reargued, - date_cert_granted=date_cert_granted, - date_cert_denied=date_cert_denied, - date_reargument_denied=date_reargument_denied, - court_id=item["court_id"], - case_name_short=item["case_name_short"] or "", - case_name=item["case_name"] or "", - case_name_full=item["case_name_full"] or "", - docket_number=item["docket"] or "", - ) - - # get citation objects in a list for addition to the cluster - found_citations = [] - for c in item["citations"]: - found = get_citations( - clean_text(c, ["html", "inline_whitespace"]), - tokenizer=HYPERSCAN_TOKENIZER, - ) - if not found: - # if the docket number --is-- citation string, we're likely dealing - # with a somewhat common triplet of (docket number, date, - # jurisdiction), which isn't a citation at all (so there's no - # problem) - if item["docket"]: - docket_no = item["docket"].lower() - if "claim no." 
in docket_no: - docket_no = docket_no.split("claim no.")[0] - for junk in DOCKET_JUNK: - docket_no = docket_no.replace(junk, "") - docket_no = docket_no.strip(".").strip() - if docket_no and docket_no in c.lower(): - continue - - # there are a trivial number of letters (except for - # months and a few trivial words) in the citation, - # then it's not a citation at all - non_trivial = c.lower() - for trivial in TRIVIAL_CITE_WORDS: - non_trivial = non_trivial.replace(trivial, "") - num_letters = sum( - non_trivial.count(letter) for letter in string.lowercase - ) - if num_letters < 3: - continue - - # if there is a string that's known to indicate - # a bad citation, then it's not a citation - if any(bad in c for bad in BAD_CITES): - continue - # otherwise, this is a problem - raise Exception( - "Failed to get a citation from the string '%s' in " - "court '%s' with docket '%s'." - % (c, item["court_id"], item["docket"]) - ) - else: - found_citations.extend(found.to_model()) - - cluster = OpinionCluster( - judges=item.get("judges", "") or "", - precedential_status=( - "Unpublished" if item["unpublished"] else "Published" - ), - date_filed=main_date, - case_name_short=item["case_name_short"] or "", - case_name=item["case_name"] or "", - case_name_full=item["case_name_full"] or "", - source=SOURCES.COLUMBIA_ARCHIVE, - attorneys=item["attorneys"] or "", - posture=item["posture"] or "", - ) - panel = async_to_sync(lookup_judges_by_last_name_list)( - item["panel"], item["court_id"], panel_date - ) - - opinions = [] - for i, opinion_info in enumerate(item["opinions"]): - if opinion_info["author"] is None: - author = None - else: - author = async_to_sync(lookup_judge_by_last_name)( - opinion_info["author"], item["court_id"], panel_date - ) - - converted_text = convert_columbia_html(opinion_info["opinion"]) - opinion_type = OPINION_TYPE_MAPPING[opinion_info["type"]] - if opinion_type == Opinion.LEAD and i > 0: - opinion_type = Opinion.ADDENDUM - - opinion = Opinion( - 
author=author, - per_curiam=opinion_info["per_curiam"], - type=opinion_type, - # type=OPINION_TYPE_MAPPING[opinion_info['type']], - html_columbia=converted_text, - sha1=opinion_info["sha1"], - # This is surely not updated for the new S3 world. If you're - # reading this, you'll need to update this code. - local_path=opinion_info["local_path"], - ) - joined_by = async_to_sync(lookup_judges_by_last_name_list)( - item["joining"], item["court_id"], panel_date - ) - opinions.append((opinion, joined_by)) - - if min_dates is None: - # check to see if this is a duplicate - dups = find_dups(docket, cluster) - if dups: - if skipdupes: - print("Duplicate. skipping.") - else: - raise Exception(f"Found {len(dups)} duplicate(s).") - - # save all the objects - if not testing: - try: - docket.save() - cluster.docket = docket - cluster.save() - for citation in found_citations: - citation.cluster = cluster - citation.save() - for member in panel: - cluster.panel.add(member) - for opinion, joined_by in opinions: - opinion.cluster = cluster - opinion.save() - for joiner in joined_by: - opinion.joined_by.add(joiner) - if settings.DEBUG: - domain = "http://127.0.0.1:8000" - else: - domain = "https://www.courtlistener.com" - print(f"Created item at: {domain}{cluster.get_absolute_url()}") - except: - # if anything goes wrong, try to delete everything - try: - docket.delete() - except: - pass - raise - - -def find_dups(docket, cluster): - """Finds the duplicate cases associated to a collection of objects. - - :param docket: A `Docket` instance. - :param cluster: An `OpinionCluster` instance. 
- """ - if not cluster.citations.exists(): - # if there aren't any citations, assume - # for now that there's no duplicate - return [] - - params = { - "fq": [ - f"court_id:{docket.court_id}", - "citation:(%s)" - % " OR ".join(f'"{c}"~5' for c in cluster.citations.all() if c), - ], - "rows": 100, - "caller": "corpus_importer.import_columbia.populate_opinions", - } - results = SOLR_CONN.query().add_extra(**params).execute() - if len(results) == 1: - # found the duplicate - return results - elif len(results) > 1: - # narrow down the cases that match citations - remaining = [] - base_words = get_case_name_words(docket.case_name) - for r in results: - # if the important words in case names don't match up, these aren't - # duplicates - if not r.get("caseName"): - continue - if get_case_name_words(r["caseName"]) == base_words: - remaining.append(r) - if remaining: - # we successfully narrowed down the results - return remaining - # failed to narrow down results, so we just return the cases that match - # citations - return results - return [] - - -def get_case_name_words(case_name): - """Gets all the important words in a case name. Returns them as a set.""" - case_name = case_name.lower() - filtered_words = [] - all_words = case_name.split() - if " v. " in case_name: - v_index = all_words.index("v.") - # The first word of the defendant and the last word in the plaintiff - # that's not a bad word. 
- plaintiff_a = get_good_words(all_words[:v_index]) - defendant_a = get_good_words(all_words[v_index + 1 :]) - if plaintiff_a: - filtered_words.append(plaintiff_a[-1]) - if defendant_a: - # append the first good word that's not already in the array - try: - filtered_words.append( - [ - word - for word in defendant_a - if word not in filtered_words - ][0] - ) - except IndexError: - # When no good words left in defendant_a - pass - elif ( - "in re " in case_name - or "matter of " in case_name - or "ex parte" in case_name - ): - try: - subject = re.search( - "(?:(?:in re)|(?:matter of)|(?:ex parte)) (.*)", case_name - ).group(1) - except TypeError: - subject = "" - good_words = get_good_words(subject.split()) - if good_words: - filtered_words.append(good_words[0]) - else: - filtered_words = get_good_words(all_words) - return set(filtered_words) - - -def get_good_words(word_list, stop_words_size=500): - """Cleans out stop words, abbreviations, etc. from a list of words""" - stopwords = StopWords().stop_words - good_words = [] - for word in word_list: - # Clean things up - word = re.sub(r"'s", "", word) - word = word.strip('*,();"') - - # Boolean conditions - stop = word in stopwords[:stop_words_size] - bad_stuff = re.search("[0-9./()!:&']", word) - too_short = len(word) <= 1 - is_acronym = word.isupper() and len(word) <= 3 - if any([stop, bad_stuff, too_short, is_acronym]): - continue - else: - good_words.append(word) - # Eliminate dups, but keep order. - return list(OrderedDict.fromkeys(good_words)) - - -class StopWords: - """A very simple object that can hold stopwords, but that is only - initialized once. 
- """ - - stop_words = get_term_frequency(result_type="list") diff --git a/cl/corpus_importer/management/commands/import_columbia.py b/cl/corpus_importer/management/commands/import_columbia.py deleted file mode 100644 index 7338086dd9..0000000000 --- a/cl/corpus_importer/management/commands/import_columbia.py +++ /dev/null @@ -1,295 +0,0 @@ -import fnmatch -import os -import traceback -from glob import glob -from random import shuffle - -from cl.corpus_importer.import_columbia.parse_opinions import parse_file -from cl.corpus_importer.import_columbia.populate_opinions import make_and_save -from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.import_lib import ( - get_courtdates, - get_min_dates, - get_min_nocite, - get_path_list, -) - - -class Command(VerboseCommand): - help = ( - "Parses the xml files in the specified directory into opinion " - "objects that are saved." - ) - - def add_arguments(self, parser): - parser.add_argument( - "dir", - nargs="+", - type=str, - help="The directory that will be recursively searched for xml " - "files.", - ) - parser.add_argument( - "--limit", - type=int, - default=None, - help="Limit on how many files to run through. By default will run " - "through all (or if `--random`, forever).", - ) - parser.add_argument( - "--random", - action="store_true", - default=False, - help="If set, will run through the directories and files in random " - "order.", - ) - parser.add_argument( - "--status", - type=int, - default=100, - help="How often a status update will be given. 
By default, every " - "100 files.", - ) - parser.add_argument( - "--newcases", - action="store_true", - default=False, - help="If set, will skip court-years that already have data.", - ) - parser.add_argument( - "--skipdupes", - action="store_true", - default=False, - help="If set, will skip duplicates.", - ) - parser.add_argument( - "--skipnewcases", - action="store_true", - default=False, - help="If set, will skip cases from initial columbia import.", - ) - parser.add_argument( - "--avoid_nocites", - action="store_true", - default=False, - help="If set, will not import dates after the earliest case without a citation.", - ) - parser.add_argument( - "--courtdates", - action="store_true", - default=False, - help="If set, will throw exception for cases before court was founded.", - ) - parser.add_argument( - "--startfolder", - type=str, - default=None, - help="The folder (state name) to start on.", - ) - parser.add_argument( - "--startfile", - type=str, - default=None, - help="The file name to start on (if resuming).", - ) - parser.add_argument( - "--debug", - action="store_true", - default=False, - help="Don't change the data.", - ) - - def handle(self, *args, **options): - super().handle(*args, **options) - do_many( - options["dir"][0], - options["limit"], - options["random"], - options["status"], - options["newcases"], - options["skipdupes"], - options["skipnewcases"], - options["avoid_nocites"], - options["courtdates"], - options["startfolder"], - options["startfile"], - options["debug"], - ) - - -def do_many( - dir_path, - limit, - random_order, - status_interval, - newcases, - skipdupes, - skip_newcases, - avoid_nocites, - courtdates, - startfolder, - startfile, - debug, -): - """Runs through a directory of the form /data/[state]/[sub]/.../[folders]/[.xml documents]. - Parses each .xml document, instantiates the associated model object, and - saves the object. Prints/logs status updates and tracebacks instead of - raising exceptions. 
- - :param dir_path: The directory. - :param limit: A limit on how many files to run through. If None, will run - through all (or if random order, forever). - :param random_order: If true, will run through the directories and files in - random order. - :param status_interval: How often a status update will be given. - :param newcases: If true, skip court-years that already have data. - :param skipdupes: If true, skip duplicates. - :param skip_newcases: If true, skip cases imported under newcases. - :param avoid_nocites: If true, skip cases from dates after any case with no cite. - :param courtdates: If true, skip cases with dates before court established. - :param startfolder: If not None, start on startfolder - :param startfile: If not None, start on this file (for resuming) - """ - if limit: - total = limit - elif not random_order: - logger.info("Getting an initial file count...") - total = 0 - for _, _, file_names in os.walk(dir_path): - total += len(fnmatch.filter(file_names, "*.xml")) - else: - total = None - # go through the files, yielding parsed files and printing status updates as - # we go - folders = glob(f"{dir_path}/*") - folders.sort() - count = 0 - - # get earliest dates for each court - if newcases: - logger.info("Only new cases: getting earliest dates by court.") - min_dates = get_min_dates() - else: - min_dates = None - - if avoid_nocites: - if newcases: - raise Exception( - "Cannot use both avoid_nocites and newcases options." - ) - logger.info( - "Avoiding no cites: getting earliest dates by court with " - "no citation." 
- ) - min_dates = get_min_nocite() - - if courtdates: - start_dates = get_courtdates() - else: - start_dates = None - - # check if skipping first columbias cases - - if skip_newcases: - skiplist = get_path_list() - else: - skiplist = set() - - # start/resume functionality - if startfolder is not None: - skipfolder = True - else: - skipfolder = False - if startfile is not None: - skipfile = True - else: - skipfile = False - - for folder in folders: - if skipfolder: - if startfolder is not None: - checkfolder = folder.split("/")[-1] - if checkfolder == startfolder: - skipfolder = False - else: - continue - logger.debug(folder) - - for path in file_generator(folder, random_order, limit): - if skipfile: - if startfile is not None: - checkfile = path.split("/")[-1] - if checkfile == startfile: - skipfile = False - else: - continue - - if path in skiplist: - continue - - # skip cases in 'misc*' folders -- they are relatively different - # than the other cases, so we'll deal with them later - if "miscellaneous_court_opinions" in path: - continue - - logger.debug(path) - - # try to parse/save the case and show any exceptions with full - # tracebacks - try: - parsed = parse_file(path) - make_and_save(parsed, skipdupes, min_dates, start_dates, debug) - except Exception as e: - logger.info(path) - # show simple exception summaries for known problems - known = [ - "mismatched tag", - "Failed to get a citation", - "Failed to find a court ID", - 'null value in column "date_filed"', - "duplicate(s)", - ] - if any(k in str(e) for k in known): - logger.info(f"Known exception in file '{path}':") - logger.info(str(e)) - else: - logger.info(f"Unknown exception in file '{path}':") - logger.info(traceback.format_exc()) - # status update - count += 1 - if count % status_interval == 0: - if total: - logger.info(f"Finished {count} out of {total} files.") - else: - logger.info(f"Finished {count} files.") - - -def file_generator(dir_path, random_order=False, limit=None): - """Generates full 
file paths to all xml files in `dir_path`. - - :param dir_path: The path to get files from. - :param random_order: If True, will generate file names randomly (possibly - with repeats) and will never stop generating file names. - :param limit: If not None, will limit the number of files generated to this - integer. - """ - count = 0 - if not random_order: - for root, dir_names, file_names in os.walk(dir_path): - file_names.sort() - for file_name in fnmatch.filter(file_names, "*.xml"): - yield os.path.join(root, file_name).replace("\\", "/") - count += 1 - if count == limit: - return - else: - for root, dir_names, file_names in os.walk(dir_path): - shuffle(dir_names) - names = fnmatch.filter(file_names, "*.xml") - if names: - shuffle(names) - yield os.path.join(root, names[0]).replace("\\", "/") - break - count += 1 - if count == limit: - return From b80f0762ea73f010af19ae4ce9a86decc943a693 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 13:27:42 -0400 Subject: [PATCH 61/67] Refactor(search): Removes unused imports --- cl/search/tasks.py | 10 ++-------- cl/search/tests/tests.py | 2 -- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/cl/search/tasks.py b/cl/search/tasks.py index bffc72e119..602ac95700 100644 --- a/cl/search/tasks.py +++ b/cl/search/tasks.py @@ -1,6 +1,5 @@ import logging -import socket -from datetime import date, timedelta +from datetime import date from importlib import import_module from random import randint from typing import Any, Generator @@ -11,7 +10,6 @@ from django.conf import settings from django.core.exceptions import ObjectDoesNotExist from django.db.models import Prefetch, QuerySet -from django.utils.timezone import now from elasticsearch.exceptions import ( ApiError, ConflictError, @@ -27,7 +25,6 @@ streaming_bulk, ) from elasticsearch_dsl import Document, Q, UpdateByQuery, connections -from requests import Session from cl.alerts.tasks import ( percolator_response_processing, @@ -36,10 +33,7 @@ from 
cl.audio.models import Audio from cl.celery_init import app from cl.lib.elasticsearch_utils import build_daterange_query -from cl.lib.search_index_utils import ( - InvalidDocumentError, - get_parties_from_case_name, -) +from cl.lib.search_index_utils import get_parties_from_case_name from cl.people_db.models import Person, Position from cl.search.documents import ( ES_CHILD_ID, diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index 8eb99b38f2..8b9e6d3a2e 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -1,9 +1,7 @@ import datetime import io -import os from datetime import date from http import HTTPStatus -from pathlib import Path from unittest import mock from urllib.parse import parse_qs From 76e37f1f6fe58ae3a3ac7da1e534f4ef83fca90e Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 13:27:58 -0400 Subject: [PATCH 62/67] refactor(audio): Removes unused imports --- cl/audio/models.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cl/audio/models.py b/cl/audio/models.py index 7ec48d8a5b..8eb86393ba 100644 --- a/cl/audio/models.py +++ b/cl/audio/models.py @@ -1,5 +1,3 @@ -from typing import Dict, List, Union - import pghistory from django.db import models from django.urls import reverse From f0b2af7904d3891034a55f602311a35ad0472f12 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 13:28:14 -0400 Subject: [PATCH 63/67] refactor(people_db): Removes unused imports --- cl/people_db/admin.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cl/people_db/admin.py b/cl/people_db/admin.py index 411a4cb8f9..cea7f57609 100644 --- a/cl/people_db/admin.py +++ b/cl/people_db/admin.py @@ -1,8 +1,5 @@ from admin_cursor_paginator import CursorPaginatorAdmin from django.contrib import admin -from django.db.models import QuerySet -from django.forms import ModelForm -from django.http import HttpRequest from cl.lib.admin import AdminTweaksMixin, NotesInline From 053307a0073816f51917ca5224d7d4f038716157 
Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 13:28:24 -0400 Subject: [PATCH 64/67] refactor(citations): Removes unused imports --- cl/citations/match_citations.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cl/citations/match_citations.py b/cl/citations/match_citations.py index 84570e405d..11f9a3fe0a 100644 --- a/cl/citations/match_citations.py +++ b/cl/citations/match_citations.py @@ -1,6 +1,4 @@ #!/usr/bin/env python - -from datetime import datetime from typing import Dict, Iterable, List, Optional, no_type_check from elasticsearch_dsl.response import Hit From aacd061a12a86dad7638de377d8fac8d4dbca129 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 21:25:56 -0400 Subject: [PATCH 65/67] feat(search): Removes SOLR templates to index records --- cl/search/templates/indexes/audio_text.txt | 29 ---------- cl/search/templates/indexes/dockets_text.txt | 58 -------------------- cl/search/templates/indexes/opinion_text.txt | 51 ----------------- cl/search/templates/indexes/person_text.txt | 36 ------------ 4 files changed, 174 deletions(-) delete mode 100644 cl/search/templates/indexes/audio_text.txt delete mode 100644 cl/search/templates/indexes/dockets_text.txt delete mode 100644 cl/search/templates/indexes/opinion_text.txt delete mode 100644 cl/search/templates/indexes/person_text.txt diff --git a/cl/search/templates/indexes/audio_text.txt b/cl/search/templates/indexes/audio_text.txt deleted file mode 100644 index c76c172459..0000000000 --- a/cl/search/templates/indexes/audio_text.txt +++ /dev/null @@ -1,29 +0,0 @@ -{# Audio file #} -{% if item.case_name_full %} - {{ item.case_name_full }} -{% elif item.case_name %} - {{ item.case_name }} -{% else %} - {{ item.case_name_short }} -{% endif %} - -{# Docket #} -{# Need dates so queries for the date are sure to be returned (see #271) #} -{{ item.docket.date_argued|date:"j F Y" }} -{{ item.docket.date_reargued|date:"j F Y" }} -{{ item.docket.date_reargument_denied|date:"j 
F Y" }} -{{ item.docket.docket_number }} - -{# Transcript #} -{% if item.stt_status == 1 %} - {{ item.transcript }} -{% endif %} - -{# Court #} -{{ item.docket.court.full_name }} -{{ item.docket.court.citation_string }} -{{ item.docket.court.pk }} - -{# Remainder of Audio File #} -{{ item.sha1 }} -{{ item.judges }} diff --git a/cl/search/templates/indexes/dockets_text.txt b/cl/search/templates/indexes/dockets_text.txt deleted file mode 100644 index dc11384c19..0000000000 --- a/cl/search/templates/indexes/dockets_text.txt +++ /dev/null @@ -1,58 +0,0 @@ -{# DocketEntry #} -{% with entry=item.docket_entry %} - {{ entry.description|safe }} - {{ entry.date_filed|date:"j F Y" }} -{% endwith %} - - -{# RECAPDocument #} -{{ item.get_document_type_display }} -{{ item.plain_text }} - - -{# Docket #} -{% with docket=item.docket_entry.docket %} - {% if docket.case_name_full %} - {{ docket.case_name_full }} - {% elif docket.case_name %} - {{ docket.case_name }} - {% else %} - {{ docket.case_name_short }} - {% endif %} - {{ docket.date_argued|date:"j F Y" }} - {{ docket.date_filed|date:"j F Y" }} - {{ docket.date_terminated|date:"j F Y" }} - {{ docket.docket_number }} - {{ docket.nature_of_suit }} - {{ docket.jury_demand }} -{% endwith %} - - -{# Court #} -{% with court=item.docket_entry.docket.court %} - {{ court.full_name }} - {{ court.citation_string }} - {{ court.pk }} -{% endwith %} - - -{# Judges #} -{% with assigned_to=item.docket_entry.docket.assigned_to %} - {% if assigned_to %} - {{ assigned_to.name_full }} - {% endif %} -{% endwith %} -{% with referred_to=item.docket_entry.docket.referred_to %} - {% if referred_to %} - {{ referred_to.name_full }} - {% endif %} -{% endwith %} - - -{# bankruptcy info: Skip the dates, but add the chapter and trustee #} -{% with bankr_info=item.docket_entry.docket.bankruptcy_information %} - {% if bankr_info.chapter %} - Chapter: {{ bankr_info.chapter }} - {% endif %} - {{ bankr_info.trustee_str }} -{% endwith %} diff --git 
a/cl/search/templates/indexes/opinion_text.txt b/cl/search/templates/indexes/opinion_text.txt deleted file mode 100644 index 73f4636574..0000000000 --- a/cl/search/templates/indexes/opinion_text.txt +++ /dev/null @@ -1,51 +0,0 @@ -{# The body of the item (columbia > lawbox > html > plaintext) #} -{% load text_filters %} -{% if item.html_columbia %} - {{ item.html_columbia|striptags|html_decode }} -{% elif item.html_lawbox %} - {{ item.html_lawbox|striptags|html_decode }} -{% elif item.xml_harvard %} - {{ item.xml_harvard|striptags|html_decode }} -{% elif item.html %} - {{ item.html|striptags|html_decode }} -{% else %} - {{ item.plain_text }} -{% endif %} - -{# Docket #} -{# Need dates so queries for the date are sure to be returned (see #271) #} -{{ item.cluster.docket.date_argued|date:"j F Y" }} -{{ item.cluster.docket.date_reargued|date:"j F Y" }} -{{ item.cluster.docket.date_reargument_denied|date:"j F Y" }} -{{ item.cluster.docket.docket_number }} - -{# Court #} -{{ item.cluster.docket.court.full_name }} -{{ item.cluster.docket.court.pk }} -{{ item.cluster.docket.court.citation_string }} - -{# Cluster #} -{% if item.cluster.case_name_full %} - {{ item.cluster.case_name_full }} -{% elif item.cluster.case_name %} - {{ item.cluster.case_name }} -{% else %} - {{ item.cluster.case_name_short }} -{% endif %} -{% for judge in item.cluster.panel.all %} - {{ judge.name_full }} -{% endfor %} -{{ item.cluster.judges }} -{{ item.cluster.date_filed|date:"j F Y" }} -{{ citation_string }} -{{ item.cluster.procedural_history }} -{{ item.cluster.attorneys }} -{{ item.cluster.nature_of_suit }} -{{ item.cluster.posture }} -{{ item.cluster.syllabus }} -{{ item.cluster.precedential_status }} - -{# Opinion #} -{{ item.sha1 }} - -{# HTML fields would go here, but they must be first, since they're displayed when no query #} diff --git a/cl/search/templates/indexes/person_text.txt b/cl/search/templates/indexes/person_text.txt deleted file mode 100644 index 7051dbb8f4..0000000000 --- 
a/cl/search/templates/indexes/person_text.txt +++ /dev/null @@ -1,36 +0,0 @@ -{# Person #} -{{ item.name_full }} -{% for alias in item.aliases.all %} - {{ alias.name_full }} -{% endfor %} - -{{ item.dob_city }} -{{ item.get_dob_state_display }} -{% for p in item.positions.all %} - {{ p.get_position_type_display }} - {{ p.get_nomination_process_display }} - {{ p.get_judicial_committee_action_display }} - {{ p.get_how_selected_display }} - {{ p.get_termination_reason_display }} - {{ p.court.full_name }} - {{ p.court.citation_string }} - {{ p.court.pk }} - {{ p.organization_name }} - {{ p.job_title }} -{% endfor %} - -{% for pa in item.political_affiliations.all %} - {{ pa.get_political_party_display }} -{% endfor %} - -{% for e in item.educations.all %} - {{ e.school.name }} -{% endfor %} - -{% for aba in item.aba_ratings.all %} - {{ aba.get_rating_display }} -{% endfor %} - -{{ item.fjc_id }} -{{ item.get_gender_display }} -{{ item.religion }} From 9e0e645636777dd01f919cd0030a41e8cf5f3743 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 21:29:00 -0400 Subject: [PATCH 66/67] docs(citations): Refines comment in store_opinion_citations_and_update_parentheticals --- cl/citations/tasks.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cl/citations/tasks.py b/cl/citations/tasks.py index dfd0d273c2..8ac2984752 100644 --- a/cl/citations/tasks.py +++ b/cl/citations/tasks.py @@ -206,8 +206,7 @@ def store_opinion_citations_and_update_parentheticals( ) # Finally, commit these changes to the database in a single - # transcation block. Trigger a single update as well, if - # required. + # transaction block.
with transaction.atomic(): opinion_clusters_to_update = OpinionCluster.objects.filter( sub_opinions__pk__in=opinion_ids_to_update From 5530be006fbe0dc86061f6de5befc58e0c783b9a Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 10 Dec 2024 22:01:09 -0400 Subject: [PATCH 67/67] feat(stats): Adds ES health check --- cl/stats/utils.py | 33 +++++++++++++++++++++++++++++++-- cl/stats/views.py | 5 ++++- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/cl/stats/utils.py b/cl/stats/utils.py index 41dc88e0cf..b663f665a9 100644 --- a/cl/stats/utils.py +++ b/cl/stats/utils.py @@ -1,11 +1,15 @@ from collections import OrderedDict import redis -import requests -from django.conf import settings from django.db import OperationalError, connections from django.db.models import F from django.utils.timezone import now +from elasticsearch.exceptions import ( + ConnectionError, + ConnectionTimeout, + RequestError, +) +from elasticsearch_dsl import connections as es_connections from cl.lib.db_tools import fetchall_as_dict from cl.lib.redis_utils import get_redis_interface @@ -79,6 +83,31 @@ def check_redis() -> bool: return True +def check_elasticsearch() -> bool: + """ + Checks the health of the connected Elasticsearch cluster. + + It retrieves the cluster health information and returns: + + * True: if the cluster health status is "green" (healthy). + * False: if the cluster health is not "green" or an error occurs + during connection or health retrieval.
+ """ + try: + es = es_connections.get_connection() + cluster_health = es.cluster.health() + except ( + ConnectionError, + ConnectionTimeout, + RequestError, + ): + return False + + if cluster_health["status"] == "green": + return True + return False + + def check_postgresql() -> bool: """Just check if we can connect to postgresql""" try: diff --git a/cl/stats/views.py b/cl/stats/views.py index 3bdd8e344c..6e274d894e 100644 --- a/cl/stats/views.py +++ b/cl/stats/views.py @@ -6,6 +6,7 @@ from cl.celery_init import fail_task from cl.lib.redis_utils import get_redis_interface from cl.stats.utils import ( + check_elasticsearch, check_postgresql, check_redis, get_replication_statuses, @@ -16,15 +17,17 @@ def health_check(request: HttpRequest) -> JsonResponse: """Check if we can connect to various services.""" is_redis_up = check_redis() is_postgresql_up = check_postgresql() + is_elastic_up = check_elasticsearch() status = HTTPStatus.OK - if not all([is_redis_up, is_postgresql_up]): + if not all([is_redis_up, is_postgresql_up, is_elastic_up]): status = HTTPStatus.INTERNAL_SERVER_ERROR return JsonResponse( { "is_postgresql_up": is_postgresql_up, "is_redis_up": is_redis_up, + "is_elastic_up": is_elastic_up, }, status=status, )