diff --git a/cl/citations/annotate_citations.py b/cl/citations/annotate_citations.py index 89056f2b34..ea7d965f9e 100644 --- a/cl/citations/annotate_citations.py +++ b/cl/citations/annotate_citations.py @@ -16,6 +16,8 @@ def get_and_clean_opinion_text(document: Opinion | RECAPDocument) -> None: :param document: The Opinion or RECAPDocument whose text should be parsed """ + + # We prefer CAP data (xml_harvard) first. for attr in [ "xml_harvard", "html_anon_2020", diff --git a/cl/citations/tests.py b/cl/citations/tests.py index f4cb51d575..59b1e7a986 100644 --- a/cl/citations/tests.py +++ b/cl/citations/tests.py @@ -290,7 +290,7 @@ def test_make_html_from_harvard_xml(self) -> None: s=s, expected_html=expected_html, ): - opinion = Opinion(html=s) + opinion = Opinion(xml_harvard=s) get_and_clean_opinion_text(opinion) citations = get_citations( opinion.cleaned_text, tokenizer=HYPERSCAN_TOKENIZER