Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: handle whitespace-only citations in processCitationList / processRawReferences #851

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1781,7 +1781,7 @@ public static List<LayoutToken> cleanAbstractLayoutTokens(List<LayoutToken> toke
public static void cleanTitles(BiblioItem bibl) {
if (bibl.getTitle() != null) {
String localTitle = TextUtilities.cleanField(bibl.getTitle(), false);
if (localTitle.endsWith(" y")) {
if (localTitle != null && localTitle.endsWith(" y")) {
// some markers at the end of the title are extracted from the pdf as " y" at the end of the title
// e.g. <title level="a" type="main">Computations in finite-dimensional Lie algebras y</title>
localTitle = localTitle.substring(0, localTitle.length() - 2);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,10 @@ public List<BiblioItem> processingLayoutTokenMultiple(List<List<LayoutToken>> to
if (allRes == null || allRes.length() == 0)
return null;
String[] resBlocks = allRes.split("\n\n");
if (resBlocks.length != tokenList.size()) {
LOGGER.error("didn't get the same number of citations as raw reference strings");
throw new GrobidException("didn't get the same number of citations as raw reference strings");
}
int i = 0;
for (List<LayoutToken> tokens : tokenList) {
if (CollectionUtils.isEmpty(tokens))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ public List<BiblioItem> processRawReferences(List<String> references, int consol
return finalResults;

List<BiblioItem> results = parsers.getCitationParser().processingStringMultiple(references, 0);
if (results.size() == 0)
if (results == null || results.size() == 0)
return finalResults;

// consolidation in a second stage to take advantage of parallel calls
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
import org.junit.*;

import java.util.List;
import java.util.Arrays;

import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;

Expand Down Expand Up @@ -102,4 +104,26 @@ public void testCitationParser5_withoutConsolidation() throws Exception {
assertNotNull(resCitation.getFullAuthors());

}
}

@Test
public void testCitationParser6_withoutConsolidation() throws Exception {
    // Checks how the engine handles empty or whitespace-only citation strings,
    // both one at a time (processRawReference) and in a batch (processRawReferences).
    // The second argument 0 is the consolidation setting used by the other
    // "withoutConsolidation" tests.

    // a single empty string yields no parsed item
    String emptyCitation = "";
    BiblioItem resEmptyCitation = engine.processRawReference(emptyCitation, 0);
    assertNull(resEmptyCitation);

    // these are non-breaking whitespace unicode characters
    String nbspCitation = "\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0";
    BiblioItem resNBSPCitation = engine.processRawReference(nbspCitation, 0);
    // assert on the NBSP result itself; re-checking the empty-string result
    // here would leave the NBSP case untested
    assertNull(resNBSPCitation);

    // a batch made only of blank entries produces an empty result list
    List<String> allWhitespaceCitations = Arrays.asList("", "\t", " ", "\u00a0\u00a0\u00a0\u00a0\u00a0");
    List<BiblioItem> resAllWhitespaceCitationList = engine.processRawReferences(allWhitespaceCitations, 0);
    assertThat(resAllWhitespaceCitationList.size(), is(0));

    // a batch with a single non-blank entry produces exactly one result
    List<String> partialWhitespaceCitations = Arrays.asList("", "\t", " ", "\u00a0\u00a0\u00a0\u00a0\u00a0", "blah");
    List<BiblioItem> resPartialCitationList = engine.processRawReferences(partialWhitespaceCitations, 0);
    assertThat(resPartialCitationList.size(), is(1));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,11 @@ public Response processCitationList(List<String> citations, GrobidAnalysisConfig

if (biblioItems == null || biblioItems.size() == 0) {
response = Response.status(Status.NO_CONTENT).build();
} else if (biblioItems.size() != citations.size()) {
LOGGER.error("Not all citation strings parsed");
response = Response.status(Status.INTERNAL_SERVER_ERROR).build();
} else if (biblioItems.size() == 0) {
response = Response.status(Status.NO_CONTENT).build();
} else if (expectedResponseType == ExpectedResponseType.BIBTEX) {
StringBuilder responseContent = new StringBuilder();
int n = 0;
Expand All @@ -339,6 +344,10 @@ public Response processCitationList(List<String> citations, GrobidAnalysisConfig
"<body/>\n\t\t<back>\n\t\t\t<div>\n\t\t\t\t<listBibl>\n");
int n = 0;
for(BiblioItem biblioItem : biblioItems) {
if (biblioItem == null) {
// insert an empty BiblioItem in the response
biblioItem = new BiblioItem();
}
responseContent.append(biblioItem.toTEI(n, config));
responseContent.append("\n");
n++;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,45 @@ public void processStatelessReferencesDocumentReturnsValidBibTeXForKolbAndKopp()
"\n", response.readEntity(String.class));
}

@Test
public void processCitationEmptyString() {
    // Submit a citation consisting of a single space, with raw citations requested.
    Form payload = new Form()
        .param(GrobidRestService.CITATION, " ")
        .param(GrobidRestService.INCLUDE_RAW_CITATIONS, "1");
    Entity<Form> requestBody = Entity.entity(payload, MediaType.APPLICATION_FORM_URLENCODED_TYPE);

    Response response = getClient()
        .target(baseUrl())
        .path(GrobidPaths.PATH_CITATION)
        .request()
        .post(requestBody);

    // Only the HTTP status is verified: 204 (no content) is expected.
    int expectedStatus = Response.Status.NO_CONTENT.getStatusCode();
    assertEquals(expectedStatus, response.getStatus());
}

@Test
public void processCitationWhitespaceString() {
    // Submit a citation made of a tab, a space and non-breaking spaces,
    // with raw citations requested.
    Form payload = new Form()
        .param(GrobidRestService.CITATION, "\t \u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0")
        .param(GrobidRestService.INCLUDE_RAW_CITATIONS, "1");
    Entity<Form> requestBody = Entity.entity(payload, MediaType.APPLICATION_FORM_URLENCODED_TYPE);

    Response response = getClient()
        .target(baseUrl())
        .path(GrobidPaths.PATH_CITATION)
        .request()
        .post(requestBody);

    // Only the HTTP status is verified: 204 (no content) is expected.
    int expectedStatus = Response.Status.NO_CONTENT.getStatusCode();
    assertEquals(expectedStatus, response.getStatus());
}

@Test
public void processCitationListWhitespaceString() {
    // Submit four citation values (empty, space, tab, non-breaking spaces)
    // to the citation-list endpoint, with raw citations requested.
    Form payload = new Form()
        .param(GrobidRestService.CITATION, "")
        .param(GrobidRestService.CITATION, " ")
        .param(GrobidRestService.CITATION, "\t")
        .param(GrobidRestService.CITATION, "\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0")
        .param(GrobidRestService.INCLUDE_RAW_CITATIONS, "1");
    Entity<Form> requestBody = Entity.entity(payload, MediaType.APPLICATION_FORM_URLENCODED_TYPE);

    Response response = getClient()
        .target(baseUrl())
        .path(GrobidPaths.PATH_CITATION_LIST)
        .request()
        .post(requestBody);

    // Only the HTTP status is verified: 200 (success) is expected.
    int expectedStatus = Response.Status.OK.getStatusCode();
    assertEquals(expectedStatus, response.getStatus());
}

private String getStrResponse(File pdf, String method) {
assertTrue("Cannot run the test, because the sample file '" + pdf + "' does not exists.", pdf.exists());

Expand Down