Patch: handling of empty / whitespace-only citation strings

Text above the first "diff --git" line is commentary and is skipped by patch tools.

Changes by file:
  BiblioItem.cleanTitles       null-check the cleaned title before testing its " y" suffix.
  CitationParser               log and throw GrobidException when the number of result
                               blocks differs from the number of token lists.
  Engine.processRawReferences  treat a null parser result like an empty one.
  GrobidRestProcessString      answer 500 when the number of BiblioItems differs from the
                               number of citations; substitute an empty BiblioItem for a
                               null entry when writing TEI.
  Tests                        core and REST tests for empty, tab, space and NBSP input.

The "index" lines keep the blob ids recorded with the original change; they are
informational only.

diff --git a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
index ff9c71b8b2..4a5503e27b 100755
--- a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
+++ b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
@@ -1781,7 +1781,7 @@ public static List<LayoutToken> cleanAbstractLayoutTokens(List<LayoutToken> toke
     public static void cleanTitles(BiblioItem bibl) {
         if (bibl.getTitle() != null) {
             String localTitle = TextUtilities.cleanField(bibl.getTitle(), false);
-            if (localTitle.endsWith(" y")) {
+            if (localTitle != null && localTitle.endsWith(" y")) {
                 // some markers at the end of the title are extracted from the pdf as " y" at the end of the title
                 // e.g. Computations in finite-dimensional Lie algebras y
                 localTitle = localTitle.substring(0, localTitle.length() - 2);
diff --git a/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java b/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java
index 9f297ca729..4544343db0 100755
--- a/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java
+++ b/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java
@@ -161,6 +161,10 @@ public List<BiblioItem> processingLayoutTokenMultiple(List<List<LayoutToken>> to
         if (allRes == null || allRes.length() == 0)
             return null;
         String[] resBlocks = allRes.split("\n\n");
+        if (resBlocks.length != tokenList.size()) {
+            LOGGER.error("didn't get the same number of citations as raw reference strings");
+            throw new GrobidException("didn't get the same number of citations as raw reference strings");
+        }
         int i = 0;
         for (List<LayoutToken> tokens : tokenList) {
             if (CollectionUtils.isEmpty(tokens))
diff --git a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java
index 1ca8824731..1021163b69 100755
--- a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java
+++ b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java
@@ -166,7 +166,7 @@ public List<BiblioItem> processRawReferences(List<String> references, int consol
             return finalResults;
 
         List<BiblioItem> results = parsers.getCitationParser().processingStringMultiple(references, 0);
-        if (results.size() == 0)
+        if (results == null || results.size() == 0)
             return finalResults;
 
         // consolidation in a second stage to take advantage of parallel calls
diff --git a/grobid-core/src/test/java/org/grobid/core/test/TestCitationParser.java b/grobid-core/src/test/java/org/grobid/core/test/TestCitationParser.java
index 59411093fe..4c39be813c 100755
--- a/grobid-core/src/test/java/org/grobid/core/test/TestCitationParser.java
+++ b/grobid-core/src/test/java/org/grobid/core/test/TestCitationParser.java
@@ -7,9 +7,11 @@ import org.junit.*;
 
 
 import java.util.List;
+import java.util.Arrays;
 
 import static org.hamcrest.CoreMatchers.is;
 import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertThat;
 import static org.junit.Assert.assertTrue;
 
@@ -102,4 +104,26 @@ public void testCitationParser5_withoutConsolidation() throws Exception {
         assertNotNull(resCitation.getFullAuthors());
 
     }
-}
\ No newline at end of file
+
+    @Test
+    public void testCitationParser6_withoutConsolidation() throws Exception {
+        // test handling of empty or whitespace-only citation strings
+
+        String empty_citation = "";
+        BiblioItem resEmptyCitation = engine.processRawReference(empty_citation, 0);
+        assertNull(resEmptyCitation);
+
+        // these are non-breaking whitespace unicode characters
+        String nbsp_citation = "\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0";
+        BiblioItem resNBSPCitation = engine.processRawReference(nbsp_citation, 0);
+        assertNull(resNBSPCitation);
+
+        List<String> all_whitespace_citations = Arrays.asList("", "\t", " ", "\u00a0\u00a0\u00a0\u00a0\u00a0");
+        List<BiblioItem> resAllWhitespaceCitationList = engine.processRawReferences(all_whitespace_citations, 0);
+        assertThat(resAllWhitespaceCitationList.size(), is(0));
+
+        List<String> partial_whitespace_citations = Arrays.asList("", "\t", " ", "\u00a0\u00a0\u00a0\u00a0\u00a0", "blah");
+        List<BiblioItem> resPartialCitationList = engine.processRawReferences(partial_whitespace_citations, 0);
+        assertThat(resPartialCitationList.size(), is(1));
+    }
+}
diff --git a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java
index dbd78cf5b8..91de729585 100644
--- a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java
+++ b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java
@@ -317,6 +317,9 @@ public Response processCitationList(List<String> citations, GrobidAnalysisConfig
 
             if (biblioItems == null || biblioItems.size() == 0) {
                 response = Response.status(Status.NO_CONTENT).build();
+            } else if (biblioItems.size() != citations.size()) {
+                LOGGER.error("Not all citation strings parsed");
+                response = Response.status(Status.INTERNAL_SERVER_ERROR).build();
             } else if (expectedResponseType == ExpectedResponseType.BIBTEX) {
                 StringBuilder responseContent = new StringBuilder();
                 int n = 0;
@@ -339,6 +342,10 @@ public Response processCitationList(List<String> citations, GrobidAnalysisConfig
                     "<body/>\n\t\t<back>\n\t\t\t<div>\n\t\t\t\t<listBibl>\n");
                 int n = 0;
                 for(BiblioItem biblioItem : biblioItems) {
+                    if (biblioItem == null) {
+                        // insert an empty BiblioItem in response
+                        biblioItem = new BiblioItem();
+                    }
                     responseContent.append(biblioItem.toTEI(n, config));
                     responseContent.append("\n");
                     n++;
diff --git a/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java b/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java
index 99454a2dff..50257195c9 100755
--- a/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java
+++ b/grobid-service/src/test/java/org/grobid/service/tests/GrobidRestServiceTest.java
@@ -351,6 +351,45 @@ public void processStatelessReferencesDocumentReturnsValidBibTeXForKolbAndKopp()
                 "\n", response.readEntity(String.class));
     }
 
+    @Test
+    public void processCitationEmptyString() {
+        Form form = new Form();
+        form.param(GrobidRestService.CITATION, " ");
+        form.param(GrobidRestService.INCLUDE_RAW_CITATIONS, "1");
+        Response response = getClient().target(baseUrl()).path(GrobidPaths.PATH_CITATION)
+            .request()
+            .post(Entity.entity(form, MediaType.APPLICATION_FORM_URLENCODED_TYPE));
+        // just checking that HTTP status is 204 (empty)
+        assertEquals(Response.Status.NO_CONTENT.getStatusCode(), response.getStatus());
+    }
+
+    @Test
+    public void processCitationWhitespaceString() {
+        Form form = new Form();
+        form.param(GrobidRestService.CITATION, "\t \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0");
+        form.param(GrobidRestService.INCLUDE_RAW_CITATIONS, "1");
+        Response response = getClient().target(baseUrl()).path(GrobidPaths.PATH_CITATION)
+            .request()
+            .post(Entity.entity(form, MediaType.APPLICATION_FORM_URLENCODED_TYPE));
+        // just checking that HTTP status is 204 (empty)
+        assertEquals(Response.Status.NO_CONTENT.getStatusCode(), response.getStatus());
+    }
+
+    @Test
+    public void processCitationListWhitespaceString() {
+        Form form = new Form();
+        form.param(GrobidRestService.CITATION, "");
+        form.param(GrobidRestService.CITATION, " ");
+        form.param(GrobidRestService.CITATION, "\t");
+        form.param(GrobidRestService.CITATION, "\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0");
+        form.param(GrobidRestService.INCLUDE_RAW_CITATIONS, "1");
+        Response response = getClient().target(baseUrl()).path(GrobidPaths.PATH_CITATION_LIST)
+            .request()
+            .post(Entity.entity(form, MediaType.APPLICATION_FORM_URLENCODED_TYPE));
+        // just checking that HTTP status is 200 (success)
+        assertEquals(Response.Status.OK.getStatusCode(), response.getStatus());
+    }
+
     private String getStrResponse(File pdf, String method) {
         assertTrue("Cannot run the test, because the sample file '" + pdf
                 + "' does not exists.", pdf.exists());