From 26204e884485f51a4f7e5e7808464dd9431acf9c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 12 Nov 2021 17:00:01 -0800 Subject: [PATCH 1/3] expand scope of arxiv identifier matcher The simple part of this is allowing 'arxiv:' in addition to 'arXiv:'. The more complex second part is to conservatively match "old" (pre-2008) style identifiers which do not have a prefix. The conservative matching is because there is less confidence that a string is actually an arxiv identifier without the prefix. Explicit collection prefixes are included (for those that existed pre-2008), internal whitespace is not allowed, and the identifier must be separated from other alphabetic strings. --- .../org/grobid/core/utilities/TextUtilities.java | 6 +++++- .../core/lexicon/LexiconIntegrationTest.java | 14 +++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java index 4ab3117782..fd08517438 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java @@ -58,8 +58,12 @@ public class TextUtilities { // a regular expression for arXiv identifiers // see https://arxiv.org/help/arxiv_identifier and https://arxiv.org/help/arxiv_identifier_for_services + // three pattern types are allowed, here are examples of each + // "new style" with prefix: 'arXiv:0706.0002v3', 'arxiv: 0706.0002' + // "old style" with prefix: 'arXiv : hep-th/9901001v2', 'arxiv:hep-th/ 9901001' + // "old style" without prefix (strict): 'hep-th/9901001v2', 'math/9901001' static public final Pattern arXivPattern = Pattern - .compile("(arXiv\\s?(\\.org)?\\s?\\:\\s?\\d{4}\\s?\\.\\s?\\d{4,5}(v\\d+)?)|(arXiv\\s?(\\.org)?\\s?\\:\\s?[ a-zA-Z\\-\\.]*\\s?/\\s?\\d{7}(v\\d+)?)"); + 
.compile("(ar[xX]iv\\s?(\\.org)?\\s?\\:\\s?\\d{4}\\s?\\.\\s?\\d{4,5}(v\\d+)?)|(ar[xX]iv\\s?(\\.org)?\\s?\\:\\s?[ a-zA-Z\\-\\.]{3,16}\\s?/\\s?\\d{7}(v\\d+)?)|([^a-zA-Z](math|hep|astro|cond|gr|nucl|quant|stat|physics|cs|nlin|q\\-bio|q\\-fin)[a-zA-Z\\-\\.]*/\\d{7}(v\\d+)?)"); // regular expression for PubMed identifiers, last group gives the PMID digits static public final Pattern pmidPattern = Pattern.compile("((PMID)|(Pub(\\s)?Med(\\s)?(ID)?))(\\s)?(\\:)?(\\s)*(\\d{1,8})"); diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java index 231cca1b60..1d5ac901bd 100755 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconIntegrationTest.java @@ -345,6 +345,18 @@ public void testInArXivPatternLayoutToken2() { assertThat(positions.get(0).end, is(15)); } + @Test + public void testInArXivPatternLayoutToken3() { + String piece = "K.R. Dienes, C. Kolda and J. 
March-Russell, hep-ph/9610479."; + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(piece); + String text = LayoutTokensUtil.toText(tokens); + List positions = target.tokenPositionsArXivPattern(tokens, text); + + assertThat(positions, hasSize(1)); + assertThat(positions.get(0).start, is(22)); + assertThat(positions.get(0).end, is(27)); + } + @Test public void testInIdentifierPatternLayoutToken() { String piece = "ATLAS collaboration, Measurements of the Nuclear Modification Factor for Jets in Pb+Pb Collisionsat √ "+ @@ -396,4 +408,4 @@ public void testInEmailPatternLayoutToken() { assertThat(positions.get(1).start, is(27)); assertThat(positions.get(1).end, is(33)); } -} \ No newline at end of file +} From 8f03c4929f478cf2315b4e571a3b5c67c50c5e41 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 12 Nov 2021 17:28:12 -0800 Subject: [PATCH 2/3] citation training data: annotate which are arxiv identifiers --- .../citation/corpus/16-1708.00362.training.references.tei.xml | 2 +- .../citation/corpus/1708.00649.training.references.tei.xml | 4 ++-- .../citation/corpus/1708.00867.training.references.tei.xml | 4 ++-- .../citation/corpus/1708.00957.training.references.tei.xml | 4 ++-- .../citation/corpus/1708.01150.training.references.tei.xml | 2 +- .../citation/corpus/27-1708.04230.training.references.tei.xml | 2 +- .../43-2007_31_1181-1184_853674.training.references.tei.xml | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/grobid-trainer/resources/dataset/citation/corpus/16-1708.00362.training.references.tei.xml b/grobid-trainer/resources/dataset/citation/corpus/16-1708.00362.training.references.tei.xml index 068c158814..b18f660445 100644 --- a/grobid-trainer/resources/dataset/citation/corpus/16-1708.00362.training.references.tei.xml +++ b/grobid-trainer/resources/dataset/citation/corpus/16-1708.00362.training.references.tei.xml @@ -25,7 +25,7 @@ P. Silvi, E. Rico, T. Calarco, S. 
Montangero, Lattice gauge tensor networks, New Journal of Physics 16 (10) (2014)103015 .URL http: //stacks.iop.org/1367-2630/16/i=10/a=103015 B. Buyens, K. Van Acoleyen, J. Haegeman, F. Verstraete, Matrix product states for Hamiltonian lattice gauge theories,ArXiv e -printsarXiv:1411.0020. E. Rico, T. Pichler, M. Dalmonte, P. Zoller, S. Montangero, Tensor networks for lattice gauge theories and atomic quantum simulat Phys.Rev .Lett .112 ( 2014)201601 .doi :10.1103/PhysRevLett.112.201601.URL https: //link.aps.org/doi/10.1103/PhysRevLett.112.201601 - H. Saito, M. C. Bañuls, K. Cichy, J. I. Cirac, K. Jansen, The temperature dependence of the chiral condensate inthe Schwinger model with Matrix Product States ,PoS LATTICE2014 ( 2014)302 .arXiv :1412.0596. + H. Saito, M. C. Bañuls, K. Cichy, J. I. Cirac, K. Jansen, The temperature dependence of the chiral condensate inthe Schwinger model with Matrix Product States ,PoS LATTICE2014 ( 2014)302 .arXiv :1412.0596. S. Kühn, E. Zohar, J. I. Cirac, M. C. Bañuls, Non-abelian string breaking phenomena with matrix product states, Journalof High Energy Physics 2015 ( 7)( 2015)130 .doi :10.1007/JHEP07(2015)130.URL https: //doi.org/10.1007/JHEP07(2015)130 M. C. Bañuls, K. Cichy, J. I. Cirac, K. Jansen, H. Saito, Thermal evolution of the schwinger model with matrix product operators, Phys.Rev .D 92 ( 2015)034519 .doi :10.1103/PhysRevD.92.034519.URL https: //link.aps.org/doi/10.1103/PhysRevD.92.034519 M. C. Bañuls, K. Cichy, K. Jansen, H. 
Saito, Chiral condensate in the schwinger model with matrix product operators, Phys.Rev .D 93 ( 2016)094512 .doi :10.1103/PhysRevD.93.094512.URL https: //link.aps.org/doi/10.1103/PhysRevD.93.094512 diff --git a/grobid-trainer/resources/dataset/citation/corpus/1708.00649.training.references.tei.xml b/grobid-trainer/resources/dataset/citation/corpus/1708.00649.training.references.tei.xml index 978f3a1a21..8595ba0886 100644 --- a/grobid-trainer/resources/dataset/citation/corpus/1708.00649.training.references.tei.xml +++ b/grobid-trainer/resources/dataset/citation/corpus/1708.00649.training.references.tei.xml @@ -29,7 +29,7 @@ T.Korzec,U .Wolff ,Nucl .Phys .B 871( 2013) 145 [ arXiv:1212 . 2875[hep-lat] ]. P.N.Meisinger ,M.C. Ogilvie, arXiv:1306. 1495[ hep-lat]. C.Gattringer,A. Schmidt, Phys. Rev. D86 (2012 )094506 [arXiv : 1208.6472 [ hep-lat]]. - Y.D.Mercado,C .Gattringer,A .Schmidt ,Phys .Rev .Lett .111 (2013) 14, 141601[ arXiv :1307. 6120[ hep -lat]]; + Y.D.Mercado,C .Gattringer,A .Schmidt ,Phys .Rev .Lett .111 (2013) 14, 141601[ arXiv :1307. 6120[ hep -lat]]; Comput .Phys.Commun.184( 2013) 1535[ arXiv: 1211. 3436[hep - lat]]. A. Schmidt,P.deForcrand, C. Gattringer, PoSLattice 2014 (2015 )209 [arXiv : 1501. 06472 [hep- lat ]]. C. Gattringer, T. Kloiber, M. Müller-Preussker, Phys. Rev. D 92 (2015) no.11, 114508 [arXiv:1508.00681[ hep-lat]]. @@ -66,7 +66,7 @@ H. Fukaya , T. Onogi, Phys. Rev. D 68 (2003) 074503 [hep-lat/0305004]; Phys. Rev. D 70 (2004)054508 [ hep-lat/0403024]. S. Dürr, Phys. Rev. D 85 (2012) 114503 [arXiv:1203.2560 [hep-lat]]; - PoS LAT 2005 (2006)021 [ hep-lat/0509026]. + PoS LAT 2005 (2006)021 [ hep-lat/0509026]. S .Dürr ,C .Hölbling ,Phys .Rev .D 69 ( 2004)034503 [ hep-lat/0311002]; Phys. Rev. D69 ( 2004 )034503[ hep -lat/0311002]. C. Gattringer, O. Orasch, work in preparation. 
diff --git a/grobid-trainer/resources/dataset/citation/corpus/1708.00867.training.references.tei.xml b/grobid-trainer/resources/dataset/citation/corpus/1708.00867.training.references.tei.xml index 55945ad8db..ea1787a2dd 100644 --- a/grobid-trainer/resources/dataset/citation/corpus/1708.00867.training.references.tei.xml +++ b/grobid-trainer/resources/dataset/citation/corpus/1708.00867.training.references.tei.xml @@ -12,7 +12,7 @@ A. Abdesselam et al., Eur. Phys. J. C71, 1661 (2011), arXiv:1012.5412[ hep-ph]. A. Altheimer et al., J. Phys. G39, 063001 (2012), arXiv:1201.0008[ hep-ph]. A. Altheimer et al., Eur. Phys. J. C74, 2792 (2014), arXiv:1311.2708[ hep-ex]. - D. Adams et al., Eur. Phys. J. C75, 409 (2015), arXiv:1504.00679[ hep-ph]. + D. Adams et al., Eur. Phys. J. C75, 409 (2015), arXiv:1504.00679[ hep-ph]. J. Gallicchio and M. D. Schwartz, Phys. Rev. Lett. 107, 172001( 2011),arXiv :1106.3076[ hep-ph]. J. Gallicchio and M. D. Schwartz, JHEP 04, 090 (2013), arXiv:1211.7038[ hep-ph]. A. J. Larkoski, G. P. Salam, and J. Thaler, JHEP 06, 108( 2013),arXiv :1305.0007[ hep-ph]. @@ -37,7 +37,7 @@ M. D. Schwartz, Phys. Rev. D77, 014026 (2008), arXiv:0709.2709[ hep-ph]. G. P. Korchemsky and A. V. Radyushkin, Nucl. Phys. B 283,342 ( 1987). I. Moult, I. W. Stewart, F. J. Tackmann, andW .J. Waalewijn, Phys. Rev. D93, 094003( 2016 ),arXiv: 1508.02397[hep- ph]. - C. Balzereit, T. Mannel, and W. Kilian, Phys. Rev. D58, 114029( 1998),arXiv :hep-ph/9805297. + C. Balzereit, T. Mannel, and W. Kilian, Phys. Rev. D58, 114029( 1998),arXiv :hep-ph/9805297. M. Neubert, Eur. Phys. J. C40, 165 (2005), arXiv:hep-ph/0408179. S. Fleming, A. H. Hoang, S. Mantry, and I. W. Stewart, Phys.Rev .D77 ,114003 ( 2008),arXiv :0711.2079[ hep-ph]. G. Kramer and B. Lampe, Z. Phys. C34, 497 (1987), [Erratum:Z .Phys .C42,504(1989)]. 
diff --git a/grobid-trainer/resources/dataset/citation/corpus/1708.00957.training.references.tei.xml b/grobid-trainer/resources/dataset/citation/corpus/1708.00957.training.references.tei.xml index 33b5668f8e..3f848e9180 100644 --- a/grobid-trainer/resources/dataset/citation/corpus/1708.00957.training.references.tei.xml +++ b/grobid-trainer/resources/dataset/citation/corpus/1708.00957.training.references.tei.xml @@ -22,8 +22,8 @@ F. Sanches and S. Weinberg, "A Holographic Entanglement Entropy Conjecture for GeneralSpacetimes ,"Phys .Rev .D 94 ,084034 ( 2016),arXiv :1603.05250[ hep-th] A. C. Wall, "Maximin Surfaces, and the Strong Subadditivity of the Covariant Holo-graphicEntanglement Entropy ",arXiv :1211.3494. R. M. Wald, General Relativity. The University of Chicago Press, Chicago, 1984. - V. E. Hubeny and M. Rangamani, "Causal Holographic Information," JHEP 1206 (2012)114 ,arXiv :1204.1698 - B. Freivogel and B. Mosk, "Properties of Causal Holographic Information", arXiv:1304.7229. + V. E. Hubeny and M. Rangamani, "Causal Holographic Information," JHEP 1206 (2012)114 ,arXiv :1204.1698 + B. Freivogel and B. Mosk, "Properties of Causal Holographic Information", arXiv:1304.7229. W. R. Kelly and A. C. Wall, "Coarse-grained entropy and causal holographic in-formationin AdS /CFT,"JHEP 1403 ,118 ( 2014)doi :10.1007/JHEP03(2014)118[ arXiv:1309.3610[hep -th]]. N. Engelhardt and A. C. Wall, "No Simple Dual to the Causal Holographic Informa-tion?,"JHEP 1704 ,134 ( 2017)doi :10.1007/JHEP04(2017)134[ arXiv:1702.01748[ hep-th]]. C. Fefferman and C. R. Graham, "Conformal invariants," Elie Cartan et les Mathema-tiquesd 'aujourd'hui,Asterisque ,p .95 ,1985 . 
diff --git a/grobid-trainer/resources/dataset/citation/corpus/1708.01150.training.references.tei.xml b/grobid-trainer/resources/dataset/citation/corpus/1708.01150.training.references.tei.xml index 81bed4c725..d22c9bf6e5 100644 --- a/grobid-trainer/resources/dataset/citation/corpus/1708.01150.training.references.tei.xml +++ b/grobid-trainer/resources/dataset/citation/corpus/1708.01150.training.references.tei.xml @@ -42,7 +42,7 @@ A. D. Linde, Chaotic Inflation, Phys.Lett., B129, (1983). P. J. Steinhardt, The inflation debate: Is the theory at heart of modern cos-mologydeeply flawed ?,Scientific American ,April ( 2011);A .Ijjas ,P .J .Stein -hardt,A. Loeb, Inflationaryschism after Planck2013 , Phys. Lett.B736( 2014 ).arXiv: 1402.6980 M. Gasperini, Elements of string cosmology, Cambridge University Press (2007). - I. Antoniadis, E. Gava, K. S. Narain Moduli Corrections to Gravitational Couplingsfrom String loops ,Phys .Lett.B283:209-212( 1992).arXiv :hep-th/9203071 + I. Antoniadis, E. Gava, K. S. Narain Moduli Corrections to Gravitational Couplingsfrom String loops ,Phys .Lett.B283:209-212( 1992).arXiv :hep-th/9203071 B. Zwiebach, Curvature squared terms and string theories, Phys. Lett. B 156 , 315( 1985). C.G. Callan, D. Friedan, E.J. Martinec and M.J. Perry,Strings in Background Fields,Nucl .Phys .B262 ( 1985);E .S.Fradkin and A .A.Tseytlin ,Effective field theory fromquantized strings , Phys. Lett. B158( 1985 );D. J.Grossand J . H.Sloan, Thequarticeffective action for the heterotic string , Nucl .Phys .B291 (1987 ) 41. R. P. Woodard, Avoiding Dark Energy with 1/R Modifications of Gravity, Lect.NotesPhys .,720 ( 2007). 
diff --git a/grobid-trainer/resources/dataset/citation/corpus/27-1708.04230.training.references.tei.xml b/grobid-trainer/resources/dataset/citation/corpus/27-1708.04230.training.references.tei.xml index 2764aeb30c..3465346062 100644 --- a/grobid-trainer/resources/dataset/citation/corpus/27-1708.04230.training.references.tei.xml +++ b/grobid-trainer/resources/dataset/citation/corpus/27-1708.04230.training.references.tei.xml @@ -56,7 +56,7 @@ C.-K. Chiu, J. C. Y. Teo, A. P. Schnyder, and S. Ryu, "Classificationof topological quantum matter with sym -metries,"Rev. Mod. Phys. ,vol. 88, p. 035005, Aug2016 . X.-Y. Dong and C.-X. Liu, "Classification of topologi-calcrystalline insulators based on representation theory ,"Phys .Rev. B, vol. 93, p. 045429, Jan2016 . H. Watanabe and L. Fu, "Topological crystalline mag-nets:Symmetry -protectedtopological phases of fermions ,"Phys .Rev. B, vol. 95, p. 081107, Feb2017 . - B. Bradlyn, L. Elcoro, J. Cano, M. G. Vergniory, Z. Wang, C.Felser ,M .I .Aroyo ,and B .A .Bernevig ," Topologicalquantum chemistry, "arXiv: 1703.02050,2017. + B. Bradlyn, L. Elcoro, J. Cano, M. G. Vergniory, Z. Wang, C.Felser ,M .I .Aroyo ,and B .A .Bernevig ," Topologicalquantum chemistry, "arXiv: 1703.02050,2017. K. Shiozaki, M. Sato, and K. Gomi, "Topological crys-tallinematerials :General formulation ,module structure ,and wallpapergroups , "Phys. Rev. B, vol. 95, p. 235425, Jun2017 . R. Yu, X. L. Qi, A. Bernevig, Z. Fang, and X. Dai, "Equiv-alentexpression of z 2 topological invariant for band insu -latorsusingthe non - abelianberryconnection , "Phys. Rev. B, vol.84 ,no .7 ,p .075119 ,2011 . L. Fidkowski, T. S. Jackson, and I. Klich, "Model charac-terizationof gapless edge modes of topological insulators using intermediatebrillouin - zonefunctions, "Phys. Rev. Lett. ,vol.107 ,p .036601 ,Jul 2011. 
diff --git a/grobid-trainer/resources/dataset/citation/corpus/43-2007_31_1181-1184_853674.training.references.tei.xml b/grobid-trainer/resources/dataset/citation/corpus/43-2007_31_1181-1184_853674.training.references.tei.xml index abb611418b..650b3df0f3 100644 --- a/grobid-trainer/resources/dataset/citation/corpus/43-2007_31_1181-1184_853674.training.references.tei.xml +++ b/grobid-trainer/resources/dataset/citation/corpus/43-2007_31_1181-1184_853674.training.references.tei.xml @@ -10,7 +10,7 @@ LIANG Z T, WANG X N. Phys. Rev. Lett., 2005, 94:102301 .arXiv :nucl-th/0410079;Erratum ,2006 ,96 :039901 (E) - LIANG Z T, WANG X N. Phys. Lett., 2005, B629: 20. arXiv:nucl-th/0411101 + LIANG Z T, WANG X N. Phys. Lett., 2005, B629: 20. arXiv:nucl-th/0411101 Selyuzhenkov I(STAR Collaboration). AIP Conf. Proc., 2006,870 :712 .arXiv :nucl-ex/0608034; J .Phys .,2006 ,G32 :S557. arXiv: nucl-ex/0605035 WANG X N. Phys. Lett., 2000, B485: 157 From fb04a85028ae50b914da77e9af9c169f0f0bb81a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 12 Nov 2021 17:36:35 -0800 Subject: [PATCH 3/3] add handful of citation training examples with old-style arxiv identifiers These old-style arxiv identifiers have no prefix ("arxiv:"), but are unambiguously arxiv.org identifiers. --- .../resources/dataset/citation/corpus/arxiv_old_style.xml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 grobid-trainer/resources/dataset/citation/corpus/arxiv_old_style.xml diff --git a/grobid-trainer/resources/dataset/citation/corpus/arxiv_old_style.xml b/grobid-trainer/resources/dataset/citation/corpus/arxiv_old_style.xml new file mode 100644 index 0000000000..8a35d0e529 --- /dev/null +++ b/grobid-trainer/resources/dataset/citation/corpus/arxiv_old_style.xml @@ -0,0 +1,6 @@ + + + B.A. Dobrescu, hep-ph/9510424. + K.R. Dienes, C. Kolda and J. March-Russell, hep-ph/9610479. + S.P. Martin, hep-ph/9608224. +