Merge pull request #4703 from freelawproject/recap-document-validation

feat(RECAPDocument): Validate attachment number against document type
freelawproject · Nov 19, 2024 · 99c184a · 99c184a
2 parents 8c86927 + af417c9
commit 99c184a
Show file tree

Hide file tree

Showing 6 changed files with 111 additions and 12 deletions.
diff --git a/cl/api/templates/recap-api-docs-vlatest.html b/cl/api/templates/recap-api-docs-vlatest.html
@@ -289,7 +289,9 @@ <h4 id="recap-dockets">Uploading Dockets, History Reports, and Claims Registries
   <h4 id="recap-pdf">Uploading PDFs</h4>
   <p>To upload PDFs, include the <code>pacer_doc_id</code> and <code>document_number</code> fields. For documents originating from courts outside the new Appellate Case Management System (ACMS), the fourth digit of the <code>pacer_doc_id</code> must always be normalized to a zero before uploading (see below).
   </p>
-  <p>If you are uploading an attachment, you must also provide the <code>attachment_number</code> field. Because some cases share documents, the <code>pacer_case_id</code> field should also be provided, though it's not a required field if it's unknown.
+  <p>If you are uploading an attachment, you must also provide the <code>attachment_number</code> field. Note that if you are not uploading an attachment, no <code>attachment_number</code> should be provided, otherwise the document will be marked as an attachment.
+  </p>
+  <p>Because some cases share documents, the <code>pacer_case_id</code> field should also be provided, though it's not a required field if it's unknown.
   </p>
   <p><code>pacer_doc_id</code> is the number you see in URLs when purchasing documents on PACER and in the HTML when clicking document numbers on docket pages. For example, in the URL <code>ecf.flp.uscourts.gov/doc1/035021404350</code>, the <code>pacer_doc_id</code> is <code>035021404350</code>.
   </p>

diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py
@@ -257,6 +257,10 @@ async def process_recap_pdf(pk):
 
     logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq} ")
     try:
+        # Attempt to get RECAPDocument instance.
+        # It is possible for this instance to not have a document yet,
+        # so the document_type field will have a default value,
+        # which is why we do not use it to retrieve the RECAPDocument.
         if pq.pacer_case_id:
             rd = await RECAPDocument.objects.aget(
                 docket_entry__docket__pacer_case_id=pq.pacer_case_id,
@@ -267,6 +271,8 @@ async def process_recap_pdf(pk):
             # work anyway.
             rd = await RECAPDocument.objects.aget(pacer_doc_id=pq.pacer_doc_id)
     except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned):
+        # Try again but this time using Docket and Docket Entry to get
+        # the RECAPDocument instance. If not found, we create a new one.
         retries = 5
         while True:
             try:
@@ -301,7 +307,7 @@ async def process_recap_pdf(pk):
                 break
 
         # Got the Docket, attempt to get/create the DocketEntry, and then
-        # create the RECAPDocument
+        # get/create the RECAPDocument
         retries = 5
         while True:
             try:
@@ -335,24 +341,39 @@ async def process_recap_pdf(pk):
                         attachment_number=pq.attachment_number,
                         document_type=document_type,
                     )
-                except (
-                    RECAPDocument.DoesNotExist,
-                    RECAPDocument.MultipleObjectsReturned,
-                ):
+                except RECAPDocument.DoesNotExist:
                     # Unable to find it. Make a new item.
                     rd = RECAPDocument(
                         docket_entry=de,
                         pacer_doc_id=pq.pacer_doc_id,
                         document_type=document_type,
                     )
+                except RECAPDocument.MultipleObjectsReturned:
+                    # Multiple RECAPDocuments exist for this docket entry,
+                    # which is unexpected. Ideally, we should not create a new
+                    # RECAPDocument when multiples exist. However, since this
+                    # behavior has been in place for years, we're retaining it
+                    # for now. We've added Sentry logging to capture these
+                    # cases for future debugging.
+                    logger.error(
+                        "Multiple RECAPDocuments returned when processing pdf upload"
+                    )
+                    rd = RECAPDocument(
+                        docket_entry=de,
+                        pacer_doc_id=pq.pacer_doc_id,
+                        document_type=document_type,
+                    )
                 break
 
     # document_number field is a CharField in RECAPDocument and a
     # BigIntegerField in ProcessingQueue. To prevent the ES signal
     # processor fields tracker from detecting it as a value change, it should
     # be converted to a string.
     rd.document_number = str(pq.document_number)
+    # We update attachment_number and document_type in case the
+    # RECAPDocument didn't have the actual document yet.
     rd.attachment_number = pq.attachment_number
+    rd.document_type = document_type
 
     # Do the file, finally.
     try:
@@ -408,7 +429,7 @@ async def process_recap_pdf(pk):
         try:
             await rd.asave()
         except (IntegrityError, ValidationError):
-            msg = "Duplicate key on unique_together constraint"
+            msg = "Failed to save RECAPDocument (unique_together constraint or doc type issue)"
             await mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
             rd.filepath_local.delete(save=False)
             return None

diff --git a/cl/search/models.py b/cl/search/models.py
@@ -1,3 +1,4 @@
+import logging
 import re
 from datetime import datetime
 from typing import Any, Dict, List, Tuple, TypeVar
@@ -49,6 +50,8 @@
 
 HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan")
 
+logger = logging.getLogger(__name__)
+
 
 class PRECEDENTIAL_STATUS:
     PUBLISHED = "Published"
@@ -1576,11 +1579,7 @@ def save(
         *args,
         **kwargs,
     ):
-        if self.document_type == self.ATTACHMENT:
-            if self.attachment_number is None:
-                raise ValidationError(
-                    "attachment_number cannot be null for an attachment."
-                )
+        self.clean()
 
         if self.pacer_doc_id is None:
             # Juriscraper returns these as null values. Instead we want blanks.
@@ -1660,6 +1659,26 @@ async def asave(
             **kwargs,
         )
 
+    def clean(self):
+        """
+        Validate that:
+        - Attachments must have an attachment_number.
+        - Main PACER documents must not have an attachment_number.
+        """
+        super().clean()
+        is_attachment = self.document_type == self.ATTACHMENT
+        has_attachment_number = self.attachment_number is not None
+        missing_attachment_number = is_attachment and not has_attachment_number
+        wrongly_added_att_num = not is_attachment and has_attachment_number
+        if missing_attachment_number or wrongly_added_att_num:
+            msg = (
+                "attachment_number cannot be null for an attachment."
+                if missing_attachment_number
+                else "attachment_number must be null for a main PACER document."
+            )
+            logger.error(msg)
+            raise ValidationError({"attachment_number": msg})
+
     def delete(self, *args, **kwargs):
         """
         Note that this doesn't get called when an entire queryset

diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py
@@ -447,6 +447,60 @@ def test_cannot_create_duplicate(self) -> None:
                 )
 
 
+class RECAPDocumentValidationTest(TestCase):
+    @classmethod
+    def setUpTestData(cls):
+        cls.docket_entry = DocketEntryWithParentsFactory()
+
+    def test_attachment_with_attachment_number(self):
+        """Attachments with attachment_number should not raise ValidationError."""
+        document = RECAPDocument.objects.create(
+            docket_entry=self.docket_entry,
+            document_type=RECAPDocument.ATTACHMENT,
+            attachment_number=1,
+        )
+        self.assertIsNotNone(document.id)
+
+    def test_attachment_without_attachment_number(self):
+        """Attachments without attachment_number should raise ValidationError."""
+        with self.assertRaises(ValidationError) as cm:
+            RECAPDocument.objects.create(
+                docket_entry=self.docket_entry,
+                document_type=RECAPDocument.ATTACHMENT,
+                attachment_number=None,
+            )
+        # Assert that the error message is as expected
+        self.assertIn("attachment_number", cm.exception.message_dict)
+        self.assertEqual(
+            cm.exception.message_dict["attachment_number"],
+            ["attachment_number cannot be null for an attachment."],
+        )
+
+    def test_main_document_with_attachment_number(self):
+        """Main PACER documents with attachment_number should raise ValidationError."""
+        with self.assertRaises(ValidationError) as cm:
+            RECAPDocument.objects.create(
+                docket_entry=self.docket_entry,
+                document_type=RECAPDocument.PACER_DOCUMENT,
+                attachment_number=1,
+            )
+        # Assert that the error message is as expected
+        self.assertIn("attachment_number", cm.exception.message_dict)
+        self.assertEqual(
+            cm.exception.message_dict["attachment_number"],
+            ["attachment_number must be null for a main PACER document."],
+        )
+
+    def test_main_document_without_attachment_number(self):
+        """Main PACER documents without attachment_number should not raise ValidationError."""
+        document = RECAPDocument.objects.create(
+            docket_entry=self.docket_entry,
+            document_type=RECAPDocument.PACER_DOCUMENT,
+            attachment_number=None,
+        )
+        self.assertIsNotNone(document.id)
+
+
 class IndexingTest(EmptySolrTestCase):
     """Are things indexed properly?"""
 
@@ -2402,6 +2456,7 @@ def setUpTestData(cls):
             docket_entry=cls.de,
             document_number="1",
             attachment_number=2,
+            document_type=RECAPDocument.ATTACHMENT,
         )
         cls.de_1 = DocketEntryWithParentsFactory(
             docket=DocketFactory(

diff --git a/cl/search/tests/tests_es_recap.py b/cl/search/tests/tests_es_recap.py
@@ -5459,6 +5459,7 @@ def setUp(self):
             docket_entry=self.de,
             document_number="1",
             attachment_number=2,
+            document_type=RECAPDocument.ATTACHMENT,
         )
         self.de_1 = DocketEntryWithParentsFactory(
             docket=DocketFactory(

diff --git a/cl/settings/third_party/sentry.py b/cl/settings/third_party/sentry.py
@@ -52,4 +52,5 @@ def fingerprint_sentry_error(event: dict, hint: dict) -> dict:
         ],
         ignore_errors=[KeyboardInterrupt],
         before_send=fingerprint_sentry_error,
+        attach_stacktrace=True,
     )