Skip to content

Commit

Permalink
Merge pull request #440 from MJedr/arxiv-validation
Browse files Browse the repository at this point in the history
arxiv: enable doi format in references
  • Loading branch information
MJedr authored Feb 2, 2023
2 parents e9d64cb + edb809b commit 32d9168
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 8 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build-and-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:

jobs:
Test:
runs-on: ubuntu-latest
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: [2.7, 3.6]
Expand Down Expand Up @@ -39,7 +39,7 @@ jobs:
- name: Install python dependencies
run: |
python -m pip install --upgrade pip
pip install setuptools wheel
pip install setuptools wheel "urllib3==1.25.11"
pip install -e .[tests,docs]
- name: Show python dependencies
Expand Down
39 changes: 33 additions & 6 deletions inspire_schemas/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,16 +87,37 @@

# Matches new style arXiv ID, with an old-style class specification
# (Malformed, but appears in APS records)
RE_ARXIV_POST_2007 = r"((?P<category>(?:[a-z-]+)(?:\.[a-z]{2})?)/)?(?P<identifier>\d{4}\.\d{4,5})(v\d+)?\s*(\[(?:[a-z\-\.]+)\])?$"
RE_ARXIV_POST_2007_CLASS = re.compile(
r"(arxiv:)?((?P<category>(?:[a-z-]+)(?:\.[a-z]{2})?)/)?(?P<identifier>\d{4}\.\d{4,5})(v\d+)?\s*(\[(?:[a-z\-\.]+)\])?$",
r"(arxiv:)?{}".format(RE_ARXIV_POST_2007),
flags=re.I
)

RE_ARXIV_PRE_2007 = r"(?P<category>(?P<extraidentifier>[a-z-]+)(?:\.[a-z]{2})?)/(?P<identifier>\d{4}\d+)(v\d+)?\s*(\[(?:[a-z\-\.]+)\])?$"
RE_ARXIV_PRE_2007_CLASS = re.compile(
r"(arxiv:)?(?P<category>(?P<extraidentifier>[a-z-]+)(?:\.[a-z]{2})?)/(?P<identifier>\d{4}\d+)(v\d+)?\s*(\[(?:[a-z\-\.]+)\])?$",
r"(arxiv:)?{}".format(RE_ARXIV_PRE_2007),
flags=re.I
)

# arXiv identifiers written as a DOI (``10.48550/arXiv.<id>``) or as an
# arxiv.org ``abs``/``pdf`` URL.  The literal dots of the DOI prefix and of
# the host name are escaped: an unescaped ``.`` matches *any* character, which
# would let look-alike strings such as ``10X48550/arXivX2212.07286`` through.
RE_ARXIV_DOI_POST_2007_CLASS = re.compile(
    r"10\.48550/arXiv\.{}".format(RE_ARXIV_POST_2007),
    flags=re.I
)
RE_ARXIV_DOI_PRE_2007_CLASS = re.compile(
    r"10\.48550/arXiv\.{}".format(RE_ARXIV_PRE_2007),
    flags=re.I
)
RE_ARXIV_URL_PRE_2007_CLASS = re.compile(
    r"https?://arXiv\.org/(abs|pdf)/{}.*".format(RE_ARXIV_PRE_2007),
    flags=re.I
)
RE_ARXIV_URL_POST_2007_CLASS = re.compile(
    r"https?://arXiv\.org/(abs|pdf)/{}.*".format(RE_ARXIV_POST_2007),
    flags=re.I
)

# Accepted spellings of an old-style (pre-2007) arXiv identifier: the plain
# form (optionally prefixed with ``arxiv:``), the DOI form and the URL form.
ARXIV_PATTERNS_PRE_2007 = [
RE_ARXIV_PRE_2007_CLASS,
RE_ARXIV_DOI_PRE_2007_CLASS,
RE_ARXIV_URL_PRE_2007_CLASS,
]

# Same three spellings for new-style (post-2007) arXiv identifiers.
ARXIV_PATTERNS_POST_2007 = [
RE_ARXIV_POST_2007_CLASS,
RE_ARXIV_DOI_POST_2007_CLASS,
RE_ARXIV_URL_POST_2007_CLASS,
]

# All patterns, pre-2007 ones first; they are tried in this order by
# ``_get_first_regex_match``, which stops at the first pattern that matches.
ARXIV_PATTERNS = ARXIV_PATTERNS_PRE_2007 + ARXIV_PATTERNS_POST_2007

JOURNALS_IGNORED_IN_OLD_TO_NEW = [
'econf',
]
Expand Down Expand Up @@ -1064,6 +1085,13 @@ def normalize_isbn(isbn):
return isbn


def _get_first_regex_match(regex_list, obj_to_match):
    """Return the first successful match of ``obj_to_match`` in ``regex_list``.

    Args:
        regex_list: iterable of compiled regular expressions, tried in order.
        obj_to_match (str): string to test; every pattern is applied with
            ``match``, i.e. anchored at the beginning of the string.

    Returns:
        The match object of the first pattern that matches, or ``None`` when
        no pattern matches (including when ``regex_list`` is empty).
    """
    for regex in regex_list:
        match = regex.match(obj_to_match)
        if match:
            return match
    # Make the "nothing matched" result explicit for callers that test it.
    return None


def is_arxiv(obj):
"""Return ``True`` if ``obj`` contains an arXiv identifier.
Expand All @@ -1074,8 +1102,7 @@ def is_arxiv(obj):
if not arxiv_test:
return False

matched_arxiv = (RE_ARXIV_PRE_2007_CLASS.match(arxiv_test[0]) or
RE_ARXIV_POST_2007_CLASS.match(arxiv_test[0]))
matched_arxiv = _get_first_regex_match(ARXIV_PATTERNS, arxiv_test[0])

if not matched_arxiv:
return False
Expand All @@ -1093,11 +1120,11 @@ def normalize_arxiv(obj):
"""Return a normalized arXiv identifier from ``obj``."""
obj = obj.split()[0]

matched_arxiv_pre = RE_ARXIV_PRE_2007_CLASS.match(obj)
matched_arxiv_pre = _get_first_regex_match(ARXIV_PATTERNS_PRE_2007, obj)
if matched_arxiv_pre:
return ('/'.join(matched_arxiv_pre.group("extraidentifier", "identifier"))).lower()

matched_arxiv_post = RE_ARXIV_POST_2007_CLASS.match(obj)
matched_arxiv_post = _get_first_regex_match(ARXIV_PATTERNS_POST_2007, obj)
if matched_arxiv_post:
return matched_arxiv_post.group("identifier")

Expand Down
52 changes: 52 additions & 0 deletions tests/unit/test_reference_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1399,3 +1399,55 @@ def test_reference_builder_is_not_adding_doi_when_already_present():
rb.add_uid('10.1088/1009-0630/7/4/022')

assert rb.obj['reference']['dois'] == ['10.1088/1009-0630/7/4/022']


def test_reference_builder_adds_arxiv_in_doi_format_pre_2007():
    """An old-style arXiv id in ``10.48550/arXiv.<id>`` form becomes an eprint."""
    builder = ReferenceBuilder()

    builder.add_uid('10.48550/arXiv.hep-th/050502')

    # Only the arXiv eprint must be recorded on the reference.
    assert builder.obj == {'reference': {'arxiv_eprint': 'hep-th/050502'}}


def test_reference_builder_adds_arxiv_in_doi_format_post_2007():
    """A new-style arXiv id in ``10.48550/arXiv.<id>`` form becomes an eprint."""
    builder = ReferenceBuilder()

    builder.add_uid('10.48550/arXiv.2212.07286')

    # Only the arXiv eprint must be recorded on the reference.
    assert builder.obj == {'reference': {'arxiv_eprint': '2212.07286'}}


def test_reference_builder_adds_arxiv_url_pre_2007():
    """An old-style arXiv id given as an ``abs`` URL becomes an eprint."""
    builder = ReferenceBuilder()

    builder.add_uid('https://arXiv.org/abs/hep-th/050502')

    # Only the arXiv eprint must be recorded on the reference.
    assert builder.obj == {'reference': {'arxiv_eprint': 'hep-th/050502'}}


def test_reference_builder_adds_arxiv_url_post_2007():
    """A new-style arXiv id given as a ``pdf`` URL becomes an eprint.

    Named ``..._post_2007`` because the input is a new-style identifier; a
    second function called ``..._pre_2007`` would rebind that module-level
    name and keep the earlier test of the same name from being collected.
    """
    rb = ReferenceBuilder()
    rb.add_uid('https://arXiv.org/pdf/2212.07286')

    expected = {
        'reference': {
            'arxiv_eprint': '2212.07286'
        },
    }

    assert rb.obj == expected

0 comments on commit 32d9168

Please sign in to comment.