From edb809b25deed07f224883b7d9708321e32e8b96 Mon Sep 17 00:00:00 2001
From: MJedr
Date: Tue, 31 Jan 2023 16:25:33 +0100
Subject: [PATCH] arxiv: enable doi format in references

ref: cern-sis/issues-inspire#218
---
Review notes (this section is ignored when the patch is applied):
  * The named regex groups had been stripped from this patch, leaving
    "(?P(" which is not valid regex syntax. They are restored here as
    <extraidentifier>, <category> and <identifier>. "extraidentifier" and
    "identifier" are the names normalize_arxiv() reads below; "category"
    is assumed -- TODO confirm against inspire_schemas/utils.py.
  * The literal dots in the DOI prefix and in the arXiv host name are now
    escaped, so they no longer match an arbitrary character.
  * The second test named test_reference_builder_adds_arxiv_url_pre_2007
    shadowed the first, so the pre-2007 URL case never ran. It is renamed
    to test_reference_builder_adds_arxiv_url_post_2007.
  * NOTE(review): the shared fragments end in "$", so the trailing ".*" in
    the URL patterns can only match the empty string (for example a
    ".pdf" suffix is still rejected). Left unchanged.
  * NOTE(review): the From: header carries no e-mail address, and the
    leading whitespace and blank context lines were reconstructed from the
    hunk line counts. If a hunk is rejected, compare it with the tree or
    retry with --ignore-whitespace.

 .github/workflows/build-and-release.yml |  4 +-
 inspire_schemas/utils.py                | 39 ++++++++++++++++---
 tests/unit/test_reference_builder.py    | 52 +++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml
index e13d6538..09ecae53 100644
--- a/.github/workflows/build-and-release.yml
+++ b/.github/workflows/build-and-release.yml
@@ -8,7 +8,7 @@ on:
 
 jobs:
   Test:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04
     strategy:
       matrix:
         python-version: [2.7, 3.6]
@@ -39,7 +39,7 @@ jobs:
       - name: Install python dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install setuptools wheel
+          pip install setuptools wheel "urllib3==1.25.11"
           pip install -e .[tests,docs]
 
       - name: Show python dependencies
diff --git a/inspire_schemas/utils.py b/inspire_schemas/utils.py
index 8759dda2..20b9f7c1 100644
--- a/inspire_schemas/utils.py
+++ b/inspire_schemas/utils.py
@@ -87,16 +87,37 @@
 
 # Matches new style arXiv ID, with an old-style class specification
 # (Malformed, but appears in APS records)
+RE_ARXIV_POST_2007 = r"((?P<category>(?:[a-z-]+)(?:\.[a-z]{2})?)/)?(?P<identifier>\d{4}\.\d{4,5})(v\d+)?\s*(\[(?:[a-z\-\.]+)\])?$"
 RE_ARXIV_POST_2007_CLASS = re.compile(
-    r"(arxiv:)?((?P<category>(?:[a-z-]+)(?:\.[a-z]{2})?)/)?(?P<identifier>\d{4}\.\d{4,5})(v\d+)?\s*(\[(?:[a-z\-\.]+)\])?$",
+    r"(arxiv:)?{}".format(RE_ARXIV_POST_2007),
     flags=re.I
 )
 
+RE_ARXIV_PRE_2007 = r"(?P<extraidentifier>(?P<category>[a-z-]+)(?:\.[a-z]{2})?)/(?P<identifier>\d{4}\d+)(v\d+)?\s*(\[(?:[a-z\-\.]+)\])?$"
 RE_ARXIV_PRE_2007_CLASS = re.compile(
-    r"(arxiv:)?(?P<extraidentifier>(?P<category>[a-z-]+)(?:\.[a-z]{2})?)/(?P<identifier>\d{4}\d+)(v\d+)?\s*(\[(?:[a-z\-\.]+)\])?$",
+    r"(arxiv:)?{}".format(RE_ARXIV_PRE_2007),
     flags=re.I
 )
 
+RE_ARXIV_DOI_POST_2007_CLASS = re.compile(r"10\.48550/arXiv\.{}".format(RE_ARXIV_POST_2007), re.I)
+RE_ARXIV_DOI_PRE_2007_CLASS = re.compile(r"10\.48550/arXiv\.{}".format(RE_ARXIV_PRE_2007), re.I)
+RE_ARXIV_URL_PRE_2007_CLASS = re.compile(r"https?://arXiv\.org/(abs|pdf)/{}.*".format(RE_ARXIV_PRE_2007), re.I)
+RE_ARXIV_URL_POST_2007_CLASS = re.compile(r"https?://arXiv\.org/(abs|pdf)/{}.*".format(RE_ARXIV_POST_2007), re.I)
+
+ARXIV_PATTERNS_PRE_2007 = [
+    RE_ARXIV_PRE_2007_CLASS,
+    RE_ARXIV_DOI_PRE_2007_CLASS,
+    RE_ARXIV_URL_PRE_2007_CLASS,
+]
+
+ARXIV_PATTERNS_POST_2007 = [
+    RE_ARXIV_POST_2007_CLASS,
+    RE_ARXIV_DOI_POST_2007_CLASS,
+    RE_ARXIV_URL_POST_2007_CLASS,
+]
+
+ARXIV_PATTERNS = ARXIV_PATTERNS_PRE_2007 + ARXIV_PATTERNS_POST_2007
+
 JOURNALS_IGNORED_IN_OLD_TO_NEW = [
     'econf',
 ]
@@ -1064,6 +1085,13 @@ def normalize_isbn(isbn):
     return isbn
 
 
+def _get_first_regex_match(regex_list, obj_to_match):
+    for regex in regex_list:
+        match = regex.match(obj_to_match)
+        if match:
+            return match
+
+
 def is_arxiv(obj):
     """Return ``True`` if ``obj`` contains an arXiv identifier.
 
@@ -1074,8 +1102,7 @@ def is_arxiv(obj):
     if not arxiv_test:
         return False
 
-    matched_arxiv = (RE_ARXIV_PRE_2007_CLASS.match(arxiv_test[0]) or
-                     RE_ARXIV_POST_2007_CLASS.match(arxiv_test[0]))
+    matched_arxiv = _get_first_regex_match(ARXIV_PATTERNS, arxiv_test[0])
     if not matched_arxiv:
         return False
 
@@ -1093,11 +1120,11 @@ def normalize_arxiv(obj):
     """Return a normalized arXiv identifier from ``obj``."""
     obj = obj.split()[0]
 
-    matched_arxiv_pre = RE_ARXIV_PRE_2007_CLASS.match(obj)
+    matched_arxiv_pre = _get_first_regex_match(ARXIV_PATTERNS_PRE_2007, obj)
     if matched_arxiv_pre:
         return ('/'.join(matched_arxiv_pre.group("extraidentifier", "identifier"))).lower()
 
-    matched_arxiv_post = RE_ARXIV_POST_2007_CLASS.match(obj)
+    matched_arxiv_post = _get_first_regex_match(ARXIV_PATTERNS_POST_2007, obj)
     if matched_arxiv_post:
         return matched_arxiv_post.group("identifier")
 
diff --git a/tests/unit/test_reference_builder.py b/tests/unit/test_reference_builder.py
index 91cca834..5b964152 100644
--- a/tests/unit/test_reference_builder.py
+++ b/tests/unit/test_reference_builder.py
@@ -1399,3 +1399,55 @@ def test_reference_builder_is_not_adding_doi_when_already_present():
     rb.add_uid('10.1088/1009-0630/7/4/022')
 
     assert rb.obj['reference']['dois'] == ['10.1088/1009-0630/7/4/022']
+
+
+def test_reference_builder_adds_arxiv_in_doi_format_pre_2007():
+    rb = ReferenceBuilder()
+    rb.add_uid('10.48550/arXiv.hep-th/050502')
+
+    expected = {
+        'reference': {
+            'arxiv_eprint': 'hep-th/050502'
+        },
+    }
+
+    assert rb.obj == expected
+
+
+def test_reference_builder_adds_arxiv_in_doi_format_post_2007():
+    rb = ReferenceBuilder()
+    rb.add_uid('10.48550/arXiv.2212.07286')
+
+    expected = {
+        'reference': {
+            'arxiv_eprint': '2212.07286'
+        },
+    }
+
+    assert rb.obj == expected
+
+
+def test_reference_builder_adds_arxiv_url_pre_2007():
+    rb = ReferenceBuilder()
+    rb.add_uid('https://arXiv.org/abs/hep-th/050502')
+
+    expected = {
+        'reference': {
+            'arxiv_eprint': 'hep-th/050502'
+        },
+    }
+
+    assert rb.obj == expected
+
+
+def test_reference_builder_adds_arxiv_url_post_2007():
+    rb = ReferenceBuilder()
+    rb.add_uid('https://arXiv.org/pdf/2212.07286')
+
+    expected = {
+        'reference': {
+            'arxiv_eprint': '2212.07286'
+        },
+    }
+
+    assert rb.obj == expected