Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(RECAPDocument): Validate attachment number against document type #4703

Merged
merged 14 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cl/api/templates/recap-api-docs-vlatest.html
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,9 @@ <h4 id="recap-dockets">Uploading Dockets, History Reports, and Claims Registries
<h4 id="recap-pdf">Uploading PDFs</h4>
<p>To upload PDFs, include the <code>pacer_doc_id</code> and <code>document_number</code> fields. For documents originating from courts outside the new Appellate Case Management System (ACMS), the fourth digit of the <code>pacer_doc_id</code> must always be normalized to a zero before uploading (see below).
</p>
<p>If you are uploading an attachment, you must also provide the <code>attachment_number</code> field. Because some cases share documents, the <code>pacer_case_id</code> field should also be provided, though it's not a required field if it's unknown.
<p>If you are uploading an attachment, you must also provide the <code>attachment_number</code> field. Note that if you are not uploading an attachment, no <code>attachment_number</code> should be provided, otherwise the document will be marked as an attachment.
</p>
<p>Because some cases share documents, the <code>pacer_case_id</code> field should also be provided, though it's not a required field if it's unknown.
</p>
<p><code>pacer_doc_id</code> is the number you see in URLs when purchasing documents on PACER and in the HTML when clicking document numbers on docket pages. For example, in the URL <code>ecf.flp.uscourts.gov/doc1/035021404350</code>, the <code>pacer_doc_id</code> is <code>035021404350</code>.
</p>
Expand Down
33 changes: 27 additions & 6 deletions cl/recap/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,10 @@ async def process_recap_pdf(pk):

logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq} ")
try:
# Attempt to get RECAPDocument instance.
# It is possible for this instance to not have a document yet,
# so the document_type field will have a default value,
# which is why we do not use it to retrieve the RECAPDocument.
if pq.pacer_case_id:
rd = await RECAPDocument.objects.aget(
docket_entry__docket__pacer_case_id=pq.pacer_case_id,
Expand All @@ -267,6 +271,8 @@ async def process_recap_pdf(pk):
# work anyway.
rd = await RECAPDocument.objects.aget(pacer_doc_id=pq.pacer_doc_id)
except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned):
# Try again but this time using Docket and Docket Entry to get
# the RECAPDocument instance. If not found, we create a new one.
retries = 5
while True:
try:
Expand Down Expand Up @@ -301,7 +307,7 @@ async def process_recap_pdf(pk):
break

# Got the Docket, attempt to get/create the DocketEntry, and then
# create the RECAPDocument
# get/create the RECAPDocument
retries = 5
while True:
try:
Expand Down Expand Up @@ -335,24 +341,39 @@ async def process_recap_pdf(pk):
attachment_number=pq.attachment_number,
document_type=document_type,
)
except (
RECAPDocument.DoesNotExist,
RECAPDocument.MultipleObjectsReturned,
):
except RECAPDocument.DoesNotExist:
# Unable to find it. Make a new item.
rd = RECAPDocument(
docket_entry=de,
pacer_doc_id=pq.pacer_doc_id,
document_type=document_type,
)
except RECAPDocument.MultipleObjectsReturned:
# Multiple RECAPDocuments exist for this docket entry,
# which is unexpected. Ideally, we should not create a new
# RECAPDocument when multiples exist. However, since this
# behavior has been in place for years, we're retaining it
# for now. We've added Sentry logging to capture these
# cases for future debugging.
logger.error(
"Multiple RECAPDocuments returned when processing pdf upload"
)
rd = RECAPDocument(
docket_entry=de,
pacer_doc_id=pq.pacer_doc_id,
document_type=document_type,
)
break

# document_number field is a CharField in RECAPDocument and a
# BigIntegerField in ProcessingQueue. To prevent the ES signal
# processor fields tracker from detecting it as a value change, it should
# be converted to a string.
rd.document_number = str(pq.document_number)
# We update attachment_number and document_type in case the
# RECAPDocument didn't have the actual document yet.
rd.attachment_number = pq.attachment_number
rd.document_type = document_type

# Do the file, finally.
try:
Expand Down Expand Up @@ -408,7 +429,7 @@ async def process_recap_pdf(pk):
try:
await rd.asave()
except (IntegrityError, ValidationError):
msg = "Duplicate key on unique_together constraint"
msg = "Failed to save RECAPDocument (unique_together constraint or doc type issue)"
await mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
rd.filepath_local.delete(save=False)
return None
Expand Down
29 changes: 24 additions & 5 deletions cl/search/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import re
from datetime import datetime
from typing import Any, Dict, List, Tuple, TypeVar
Expand Down Expand Up @@ -49,6 +50,8 @@

HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan")

logger = logging.getLogger(__name__)


class PRECEDENTIAL_STATUS:
PUBLISHED = "Published"
Expand Down Expand Up @@ -1576,11 +1579,7 @@ def save(
*args,
**kwargs,
):
if self.document_type == self.ATTACHMENT:
if self.attachment_number is None:
raise ValidationError(
"attachment_number cannot be null for an attachment."
)
self.clean()
ERosendo marked this conversation as resolved.
Show resolved Hide resolved

if self.pacer_doc_id is None:
# Juriscraper returns these as null values. Instead we want blanks.
Expand Down Expand Up @@ -1660,6 +1659,26 @@ async def asave(
**kwargs,
)

def clean(self):
"""
Validate that:
- Attachments must have an attachment_number.
- Main PACER documents must not have an attachment_number.
"""
super().clean()
is_attachment = self.document_type == self.ATTACHMENT
has_attachment_number = self.attachment_number is not None
missing_attachment_number = is_attachment and not has_attachment_number
wrongly_added_att_num = not is_attachment and has_attachment_number
if missing_attachment_number or wrongly_added_att_num:
msg = (
"attachment_number cannot be null for an attachment."
if missing_attachment_number
else "attachment_number must be null for a main PACER document."
)
logger.error(msg)
raise ValidationError({"attachment_number": msg})

def delete(self, *args, **kwargs):
"""
Note that this doesn't get called when an entire queryset
Expand Down
55 changes: 55 additions & 0 deletions cl/search/tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,60 @@ def test_cannot_create_duplicate(self) -> None:
)


class RECAPDocumentValidationTest(TestCase):
@classmethod
def setUpTestData(cls):
cls.docket_entry = DocketEntryWithParentsFactory()

def test_attachment_with_attachment_number(self):
"""Attachments with attachment_number should not raise ValidationError."""
document = RECAPDocument.objects.create(
docket_entry=self.docket_entry,
document_type=RECAPDocument.ATTACHMENT,
attachment_number=1,
)
self.assertIsNotNone(document.id)

def test_attachment_without_attachment_number(self):
"""Attachments without attachment_number should raise ValidationError."""
with self.assertRaises(ValidationError) as cm:
RECAPDocument.objects.create(
docket_entry=self.docket_entry,
document_type=RECAPDocument.ATTACHMENT,
attachment_number=None,
)
# Assert that the error message is as expected
self.assertIn("attachment_number", cm.exception.message_dict)
self.assertEqual(
cm.exception.message_dict["attachment_number"],
["attachment_number cannot be null for an attachment."],
)

def test_main_document_with_attachment_number(self):
"""Main PACER documents with attachment_number should raise ValidationError."""
with self.assertRaises(ValidationError) as cm:
RECAPDocument.objects.create(
docket_entry=self.docket_entry,
document_type=RECAPDocument.PACER_DOCUMENT,
attachment_number=1,
)
# Assert that the error message is as expected
self.assertIn("attachment_number", cm.exception.message_dict)
self.assertEqual(
cm.exception.message_dict["attachment_number"],
["attachment_number must be null for a main PACER document."],
)

def test_main_document_without_attachment_number(self):
"""Main PACER documents without attachment_number should not raise ValidationError."""
document = RECAPDocument.objects.create(
docket_entry=self.docket_entry,
document_type=RECAPDocument.PACER_DOCUMENT,
attachment_number=None,
)
self.assertIsNotNone(document.id)


class IndexingTest(EmptySolrTestCase):
"""Are things indexed properly?"""

Expand Down Expand Up @@ -2402,6 +2456,7 @@ def setUpTestData(cls):
docket_entry=cls.de,
document_number="1",
attachment_number=2,
document_type=RECAPDocument.ATTACHMENT,
)
cls.de_1 = DocketEntryWithParentsFactory(
docket=DocketFactory(
Expand Down
1 change: 1 addition & 0 deletions cl/search/tests/tests_es_recap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5459,6 +5459,7 @@ def setUp(self):
docket_entry=self.de,
document_number="1",
attachment_number=2,
document_type=RECAPDocument.ATTACHMENT,
)
self.de_1 = DocketEntryWithParentsFactory(
docket=DocketFactory(
Expand Down
1 change: 1 addition & 0 deletions cl/settings/third_party/sentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,5 @@ def fingerprint_sentry_error(event: dict, hint: dict) -> dict:
],
ignore_errors=[KeyboardInterrupt],
before_send=fingerprint_sentry_error,
attach_stacktrace=True,
)