Skip to content

Commit

Permalink
Merge pull request #4703 from freelawproject/recap-document-validation
Browse files Browse the repository at this point in the history
feat(RECAPDocument): Validate attachment number against document type
  • Loading branch information
mlissner authored Nov 19, 2024
2 parents 8c86927 + af417c9 commit 99c184a
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 12 deletions.
4 changes: 3 additions & 1 deletion cl/api/templates/recap-api-docs-vlatest.html
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,9 @@ <h4 id="recap-dockets">Uploading Dockets, History Reports, and Claims Registries
<h4 id="recap-pdf">Uploading PDFs</h4>
<p>To upload PDFs, include the <code>pacer_doc_id</code> and <code>document_number</code> fields. For documents originating from courts outside the new Appellate Case Management System (ACMS), the fourth digit of the <code>pacer_doc_id</code> must always be normalized to a zero before uploading (see below).
</p>
<p>If you are uploading an attachment, you must also provide the <code>attachment_number</code> field. Because some cases share documents, the <code>pacer_case_id</code> field should also be provided, though it's not a required field if it's unknown.
<p>If you are uploading an attachment, you must also provide the <code>attachment_number</code> field. Note that if you are not uploading an attachment, no <code>attachment_number</code> should be provided, otherwise the document will be marked as an attachment.
</p>
<p>Because some cases share documents, the <code>pacer_case_id</code> field should also be provided, though it's not a required field if it's unknown.
</p>
<p><code>pacer_doc_id</code> is the number you see in URLs when purchasing documents on PACER and in the HTML when clicking document numbers on docket pages. For example, in the URL <code>ecf.flp.uscourts.gov/doc1/035021404350</code>, the <code>pacer_doc_id</code> is <code>035021404350</code>.
</p>
Expand Down
33 changes: 27 additions & 6 deletions cl/recap/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,10 @@ async def process_recap_pdf(pk):

logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq} ")
try:
# Attempt to get RECAPDocument instance.
# It is possible for this instance to not have a document yet,
# so the document_type field will have a default value,
# which is why we do not use it to retrieve the RECAPDocument.
if pq.pacer_case_id:
rd = await RECAPDocument.objects.aget(
docket_entry__docket__pacer_case_id=pq.pacer_case_id,
Expand All @@ -267,6 +271,8 @@ async def process_recap_pdf(pk):
# work anyway.
rd = await RECAPDocument.objects.aget(pacer_doc_id=pq.pacer_doc_id)
except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned):
# Try again but this time using Docket and Docket Entry to get
# the RECAPDocument instance. If not found, we create a new one.
retries = 5
while True:
try:
Expand Down Expand Up @@ -301,7 +307,7 @@ async def process_recap_pdf(pk):
break

# Got the Docket, attempt to get/create the DocketEntry, and then
# create the RECAPDocument
# get/create the RECAPDocument
retries = 5
while True:
try:
Expand Down Expand Up @@ -335,24 +341,39 @@ async def process_recap_pdf(pk):
attachment_number=pq.attachment_number,
document_type=document_type,
)
except (
RECAPDocument.DoesNotExist,
RECAPDocument.MultipleObjectsReturned,
):
except RECAPDocument.DoesNotExist:
# Unable to find it. Make a new item.
rd = RECAPDocument(
docket_entry=de,
pacer_doc_id=pq.pacer_doc_id,
document_type=document_type,
)
except RECAPDocument.MultipleObjectsReturned:
# Multiple RECAPDocuments exist for this docket entry,
# which is unexpected. Ideally, we should not create a new
# RECAPDocument when multiples exist. However, since this
# behavior has been in place for years, we're retaining it
# for now. We've added Sentry logging to capture these
# cases for future debugging.
logger.error(
"Multiple RECAPDocuments returned when processing pdf upload"
)
rd = RECAPDocument(
docket_entry=de,
pacer_doc_id=pq.pacer_doc_id,
document_type=document_type,
)
break

# document_number field is a CharField in RECAPDocument and a
# BigIntegerField in ProcessingQueue. To prevent the ES signal
# processor fields tracker from detecting it as a value change, it should
# be converted to a string.
rd.document_number = str(pq.document_number)
# We update attachment_number and document_type in case the
# RECAPDocument didn't have the actual document yet.
rd.attachment_number = pq.attachment_number
rd.document_type = document_type

# Do the file, finally.
try:
Expand Down Expand Up @@ -408,7 +429,7 @@ async def process_recap_pdf(pk):
try:
await rd.asave()
except (IntegrityError, ValidationError):
msg = "Duplicate key on unique_together constraint"
msg = "Failed to save RECAPDocument (unique_together constraint or doc type issue)"
await mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
rd.filepath_local.delete(save=False)
return None
Expand Down
29 changes: 24 additions & 5 deletions cl/search/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import re
from datetime import datetime
from typing import Any, Dict, List, Tuple, TypeVar
Expand Down Expand Up @@ -49,6 +50,8 @@

HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan")

logger = logging.getLogger(__name__)


class PRECEDENTIAL_STATUS:
PUBLISHED = "Published"
Expand Down Expand Up @@ -1576,11 +1579,7 @@ def save(
*args,
**kwargs,
):
if self.document_type == self.ATTACHMENT:
if self.attachment_number is None:
raise ValidationError(
"attachment_number cannot be null for an attachment."
)
self.clean()

if self.pacer_doc_id is None:
# Juriscraper returns these as null values. Instead we want blanks.
Expand Down Expand Up @@ -1660,6 +1659,26 @@ async def asave(
**kwargs,
)

def clean(self):
"""
Validate that:
- Attachments must have an attachment_number.
- Main PACER documents must not have an attachment_number.
"""
super().clean()
is_attachment = self.document_type == self.ATTACHMENT
has_attachment_number = self.attachment_number is not None
missing_attachment_number = is_attachment and not has_attachment_number
wrongly_added_att_num = not is_attachment and has_attachment_number
if missing_attachment_number or wrongly_added_att_num:
msg = (
"attachment_number cannot be null for an attachment."
if missing_attachment_number
else "attachment_number must be null for a main PACER document."
)
logger.error(msg)
raise ValidationError({"attachment_number": msg})

def delete(self, *args, **kwargs):
"""
Note that this doesn't get called when an entire queryset
Expand Down
55 changes: 55 additions & 0 deletions cl/search/tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,60 @@ def test_cannot_create_duplicate(self) -> None:
)


class RECAPDocumentValidationTest(TestCase):
@classmethod
def setUpTestData(cls):
cls.docket_entry = DocketEntryWithParentsFactory()

def test_attachment_with_attachment_number(self):
"""Attachments with attachment_number should not raise ValidationError."""
document = RECAPDocument.objects.create(
docket_entry=self.docket_entry,
document_type=RECAPDocument.ATTACHMENT,
attachment_number=1,
)
self.assertIsNotNone(document.id)

def test_attachment_without_attachment_number(self):
"""Attachments without attachment_number should raise ValidationError."""
with self.assertRaises(ValidationError) as cm:
RECAPDocument.objects.create(
docket_entry=self.docket_entry,
document_type=RECAPDocument.ATTACHMENT,
attachment_number=None,
)
# Assert that the error message is as expected
self.assertIn("attachment_number", cm.exception.message_dict)
self.assertEqual(
cm.exception.message_dict["attachment_number"],
["attachment_number cannot be null for an attachment."],
)

def test_main_document_with_attachment_number(self):
"""Main PACER documents with attachment_number should raise ValidationError."""
with self.assertRaises(ValidationError) as cm:
RECAPDocument.objects.create(
docket_entry=self.docket_entry,
document_type=RECAPDocument.PACER_DOCUMENT,
attachment_number=1,
)
# Assert that the error message is as expected
self.assertIn("attachment_number", cm.exception.message_dict)
self.assertEqual(
cm.exception.message_dict["attachment_number"],
["attachment_number must be null for a main PACER document."],
)

def test_main_document_without_attachment_number(self):
"""Main PACER documents without attachment_number should not raise ValidationError."""
document = RECAPDocument.objects.create(
docket_entry=self.docket_entry,
document_type=RECAPDocument.PACER_DOCUMENT,
attachment_number=None,
)
self.assertIsNotNone(document.id)


class IndexingTest(EmptySolrTestCase):
"""Are things indexed properly?"""

Expand Down Expand Up @@ -2402,6 +2456,7 @@ def setUpTestData(cls):
docket_entry=cls.de,
document_number="1",
attachment_number=2,
document_type=RECAPDocument.ATTACHMENT,
)
cls.de_1 = DocketEntryWithParentsFactory(
docket=DocketFactory(
Expand Down
1 change: 1 addition & 0 deletions cl/search/tests/tests_es_recap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5459,6 +5459,7 @@ def setUp(self):
docket_entry=self.de,
document_number="1",
attachment_number=2,
document_type=RECAPDocument.ATTACHMENT,
)
self.de_1 = DocketEntryWithParentsFactory(
docket=DocketFactory(
Expand Down
1 change: 1 addition & 0 deletions cl/settings/third_party/sentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,5 @@ def fingerprint_sentry_error(event: dict, hint: dict) -> dict:
],
ignore_errors=[KeyboardInterrupt],
before_send=fingerprint_sentry_error,
attach_stacktrace=True,
)

0 comments on commit 99c184a

Please sign in to comment.