From f75fd746d7f4895e765efba49c699626cb4c0255 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 8 Nov 2021 14:34:10 -0800 Subject: [PATCH 1/6] add regression tests for non-breaking whitespace processCitationList HTTP 5xx --- .../grobid/core/test/TestCitationParser.java | 23 ++++++++++- .../service/tests/GrobidRestServiceTest.java | 39 +++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/grobid-core/src/test/java/org/grobid/core/test/TestCitationParser.java b/grobid-core/src/test/java/org/grobid/core/test/TestCitationParser.java index 59411093fe..e85a0cbf18 100755 --- a/grobid-core/src/test/java/org/grobid/core/test/TestCitationParser.java +++ b/grobid-core/src/test/java/org/grobid/core/test/TestCitationParser.java @@ -7,9 +7,11 @@ import org.junit.*; import java.util.List; +import java.util.Arrays; import static org.hamcrest.CoreMatchers.is; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; @@ -102,4 +104,23 @@ public void testCitationParser5_withoutConsolidation() throws Exception { assertNotNull(resCitation.getFullAuthors()); } -} \ No newline at end of file + + @Test + public void testCitationParser6_withoutConsolidation() throws Exception { + // test handling of empty or whitespace-only citation strings + + String empty_citation = ""; + BiblioItem resEmptyCitation = engine.processRawReference(empty_citation, 0); + assertNull(resEmptyCitation); + + // these are non-breaking whitespace unicode characters + //String nbsp_citation = " "; + String nbsp_citation = "\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0"; + BiblioItem resNBSPCitation = engine.processRawReference(nbsp_citation, 0); + assertNull(resEmptyCitation); + + List whitespace_citations = Arrays.asList("", "\t", " ", "\u00a0\u00a0\u00a0\u00a0\u00a0"); + List resCitationList = engine.processRawReferences(whitespace_citations, 0); + assertThat(resCitationList.size(), is(0)); + } +} diff --git a/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java b/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java index 99454a2dff..50257195c9 100755 --- a/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java +++ b/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java @@ -351,6 +351,45 @@ public void processStatelessReferencesDocumentReturnsValidBibTeXForKolbAndKopp() "\n", response.readEntity(String.class)); } + @Test + public void processCitationEmptyString() { + Form form = new Form(); + form.param(GrobidRestService.CITATION, " "); + form.param(GrobidRestService.INCLUDE_RAW_CITATIONS, "1"); + Response response = getClient().target(baseUrl()).path(GrobidPaths.PATH_CITATION) + .request() + .post(Entity.entity(form, MediaType.APPLICATION_FORM_URLENCODED_TYPE)); + // just checking that HTTP status is 204 (empty) + assertEquals(Response.Status.NO_CONTENT.getStatusCode(), response.getStatus()); + } + + @Test + public void processCitationWhitespaceString() { + Form form = new Form(); + form.param(GrobidRestService.CITATION, "\t \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0"); + form.param(GrobidRestService.INCLUDE_RAW_CITATIONS, "1"); + Response response = getClient().target(baseUrl()).path(GrobidPaths.PATH_CITATION) + .request() + .post(Entity.entity(form, MediaType.APPLICATION_FORM_URLENCODED_TYPE)); + // just checking that HTTP status is 204 (empty) + assertEquals(Response.Status.NO_CONTENT.getStatusCode(), response.getStatus()); + } + + @Test + public void processCitationListWhitespaceString() { + Form form = new Form(); + form.param(GrobidRestService.CITATION, ""); + form.param(GrobidRestService.CITATION, " "); + form.param(GrobidRestService.CITATION, "\t"); + form.param(GrobidRestService.CITATION, "\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0"); + form.param(GrobidRestService.INCLUDE_RAW_CITATIONS, "1"); + Response response = getClient().target(baseUrl()).path(GrobidPaths.PATH_CITATION_LIST) + .request() + .post(Entity.entity(form, MediaType.APPLICATION_FORM_URLENCODED_TYPE)); + // just checking that HTTP status is 200 (success) + assertEquals(Response.Status.OK.getStatusCode(), response.getStatus()); + } + private String getStrResponse(File pdf, String method) { assertTrue("Cannot run the test, because the sample file '" + pdf + "' does not exists.", pdf.exists()); From fd593e92abea47fdaacb4c5eac0fc7e0c71b9fba Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 8 Nov 2021 15:08:16 -0800 Subject: [PATCH 2/6] fix NullExceptionError on whitespace-only raw reference lists --- grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java index ff9c71b8b2..4a5503e27b 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java +++ b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java @@ -1781,7 +1781,7 @@ public static List cleanAbstractLayoutTokens(List toke public static void cleanTitles(BiblioItem bibl) { if (bibl.getTitle() != null) { String localTitle = TextUtilities.cleanField(bibl.getTitle(), false); - if (localTitle.endsWith(" y")) { + if (localTitle != null && localTitle.endsWith(" y")) { // some markers at the end of the title are extracted from the pdf as " y" at the end of the title // e.g. Computations in finite-dimensional Lie algebras y localTitle = localTitle.substring(0, localTitle.length() - 2); From 31b0669135d7ed7dfc9a329a50e212634965833a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 8 Nov 2021 15:43:43 -0800 Subject: [PATCH 3/6] verify that citation list responses are the same size and number of raw references supplied --- .../main/java/org/grobid/core/engines/CitationParser.java | 4 ++++ .../org/grobid/service/process/GrobidRestProcessString.java | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java b/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java index 9f297ca729..4544343db0 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java @@ -161,6 +161,10 @@ public List processingLayoutTokenMultiple(List> to if (allRes == null || allRes.length() == 0) return null; String[] resBlocks = allRes.split("\n\n"); + if (resBlocks.length != tokenList.size()) { + LOGGER.error("didn't get the same number of citations as raw reference strings"); + throw new GrobidException("didn't get the same number of citations as raw reference strings"); + } int i = 0; for (List tokens : tokenList) { if (CollectionUtils.isEmpty(tokens)) diff --git a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java index dbd78cf5b8..1cf29bae40 100644 --- a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java +++ b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java @@ -317,6 +317,11 @@ public Response processCitationList(List citations, GrobidAnalysisConfig if (biblioItems == null || biblioItems.size() == 0) { response = Response.status(Status.NO_CONTENT).build(); + } else if (biblioItems.size() != citations.size()) { + LOGGER.error("Not all citation strings parsed"); + response = Response.status(Status.INTERNAL_SERVER_ERROR).build(); + } else if (biblioItems.size() == 0) { + response = Response.status(Status.NO_CONTENT).build(); } else if (expectedResponseType == ExpectedResponseType.BIBTEX) { StringBuilder responseContent = new StringBuilder(); int n = 0; From 31341a3df83dc9b5ebeaa25b521cbb04be45b7af Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 8 Nov 2021 15:44:30 -0800 Subject: [PATCH 4/6] handle all-empty citation list case --- grobid-core/src/main/java/org/grobid/core/engines/Engine.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java index 1ca8824731..1021163b69 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java @@ -166,7 +166,7 @@ public List processRawReferences(List references, int consol return finalResults; List results = parsers.getCitationParser().processingStringMultiple(references, 0); - if (results.size() == 0) + if (results == null || results.size() == 0) return finalResults; // consolidation in a second stage to take advantage of parallel calls From d4052740f4ad66d57b5947117f7cd3ba50af4870 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 8 Nov 2021 15:45:49 -0800 Subject: [PATCH 5/6] insert empty biblioItems in citation list responses --- .../org/grobid/service/process/GrobidRestProcessString.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java index 1cf29bae40..91de729585 100644 --- a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java +++ b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java @@ -344,6 +344,10 @@ public Response processCitationList(List citations, GrobidAnalysisConfig "\n\t\t\n\t\t\t
\n\t\t\t\t\n"); int n = 0; for(BiblioItem biblioItem : biblioItems) { + if (biblioItem == null) { + // insert an empty BiblioItem in reponse + biblioItem = new BiblioItem(); + } responseContent.append(biblioItem.toTEI(n, config)); responseContent.append("\n"); n++; From a8ba04e64f6e96116ef31ebf8e4ae3ae5f1ad8aa Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 8 Nov 2021 15:46:28 -0800 Subject: [PATCH 6/6] additional test for partially whitespace citation lists --- .../java/org/grobid/core/test/TestCitationParser.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/grobid-core/src/test/java/org/grobid/core/test/TestCitationParser.java b/grobid-core/src/test/java/org/grobid/core/test/TestCitationParser.java index e85a0cbf18..4c39be813c 100755 --- a/grobid-core/src/test/java/org/grobid/core/test/TestCitationParser.java +++ b/grobid-core/src/test/java/org/grobid/core/test/TestCitationParser.java @@ -114,13 +114,16 @@ public void testCitationParser6_withoutConsolidation() throws Exception { assertNull(resEmptyCitation); // these are non-breaking whitespace unicode characters - //String nbsp_citation = " "; String nbsp_citation = "\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0"; BiblioItem resNBSPCitation = engine.processRawReference(nbsp_citation, 0); assertNull(resEmptyCitation); - List whitespace_citations = Arrays.asList("", "\t", " ", "\u00a0\u00a0\u00a0\u00a0\u00a0"); - List resCitationList = engine.processRawReferences(whitespace_citations, 0); - assertThat(resCitationList.size(), is(0)); + List all_whitespace_citations = Arrays.asList("", "\t", " ", "\u00a0\u00a0\u00a0\u00a0\u00a0"); + List resAllWhitespaceCitationList = engine.processRawReferences(all_whitespace_citations, 0); + assertThat(resAllWhitespaceCitationList.size(), is(0)); + + List partial_whitespace_citations = Arrays.asList("", "\t", " ", "\u00a0\u00a0\u00a0\u00a0\u00a0", "blah"); + List resPartialCitationList = engine.processRawReferences(partial_whitespace_citations, 0); + assertThat(resPartialCitationList.size(), is(1)); } }