From b8e38ad22e76c08729d3393f06655c1b65344c3f Mon Sep 17 00:00:00 2001
From: Stephan Heunis
Date: Wed, 4 Oct 2023 22:49:20 +0200
Subject: [PATCH 1/4] Updates required fields in dataset and file schemas

This removes 'name' from the list of required fields of the dataset
schema and adds 'metadata_sources' to the list of required fields of
both the dataset and file schemas
---
 datalad_catalog/catalog/schema/jsonschema_dataset.json | 2 +-
 datalad_catalog/catalog/schema/jsonschema_file.json    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/datalad_catalog/catalog/schema/jsonschema_dataset.json b/datalad_catalog/catalog/schema/jsonschema_dataset.json
index 642f90db..9adcca39 100644
--- a/datalad_catalog/catalog/schema/jsonschema_dataset.json
+++ b/datalad_catalog/catalog/schema/jsonschema_dataset.json
@@ -288,6 +288,6 @@
             "maxItems": 5
         }
     },
-    "required": [ "type", "dataset_id", "dataset_version", "name"]
+    "required": [ "type", "dataset_id", "dataset_version", "metadata_sources"]
 }

diff --git a/datalad_catalog/catalog/schema/jsonschema_file.json b/datalad_catalog/catalog/schema/jsonschema_file.json
index 18fb3ece..3b4e803b 100644
--- a/datalad_catalog/catalog/schema/jsonschema_file.json
+++ b/datalad_catalog/catalog/schema/jsonschema_file.json
@@ -63,6 +63,6 @@
             "uniqueItems": true
         }
     },
-    "required": [ "type", "dataset_id", "dataset_version", "path"]
+    "required": [ "type", "dataset_id", "dataset_version", "path", "metadata_sources"]
 }
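Note on the schema change above: a dataset record no longer needs a 'name', but
must now carry a 'metadata_sources' object. As a rough illustration (all
identifier and source values below are placeholders, not taken from the patch),
the smallest dataset record accepted by the updated schema looks like this:

    # illustrative minimal dataset record under the updated schema;
    # 'name' is intentionally absent, 'metadata_sources' is now mandatory
    minimal_dataset_record = {
        "type": "dataset",
        "dataset_id": "00000000-0000-0000-0000-000000000000",
        "dataset_version": "0000000000000000000000000000000000000000",
        "metadata_sources": {
            "key_source_map": {},
            "sources": [
                {"source_name": "example_source", "source_version": "0.1.0"}
            ],
        },
    }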
From 74bebdd9ea7a37cb660efa88d12b58dc9155561f Mon Sep 17 00:00:00 2001
From: Stephan Heunis
Date: Wed, 4 Oct 2023 22:53:29 +0200
Subject: [PATCH 2/4] Add ability to get the base structure of a valid
 metadata item

This adds the 'schema_utils' module that provides functionality for:
- getting an empty dict with the exact structure of one of the schemas
  of a catalog (specific to a catalog or to the package itself)
- getting a valid metadata item of type 'file' or 'dataset' with the
  minimum required fields, and populating these fields via arguments
---
 datalad_catalog/constants.py               |   4 +
 datalad_catalog/schema_utils.py            | 136 +++++++++++++++++++++
 datalad_catalog/tests/test_schema_utils.py | 127 +++++++++++++++++++
 datalad_catalog/utils.py                   |  13 +-
 4 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 datalad_catalog/schema_utils.py
 create mode 100644 datalad_catalog/tests/test_schema_utils.py

diff --git a/datalad_catalog/constants.py b/datalad_catalog/constants.py
index fb6c3396..72a55871 100644
--- a/datalad_catalog/constants.py
+++ b/datalad_catalog/constants.py
@@ -24,6 +24,7 @@
 DESCRIPTION = "description"
 DOI = "doi"
 DOLLARID = "$id"
+DOLLARREF = "$ref"
 DIRECTORY = "directory"
 DATASET_ID = "dataset_id"
 DATASET_PATH = "dataset_path"
@@ -44,6 +45,7 @@
 EXTRACTORS_USED = "extractors_used"
 HASPART = "hasPart"
 IDENTIFIER = "identifier"
+ITEMS = "items"
 KEY_SOURCE_MAP = "key_source_map"
 LOGO_PATH = "logo_path"
 METADATA_SOURCES = "metadata_sources"
@@ -51,10 +53,12 @@
 ORIGIN = "origin"
 PATH = "path"
 PERSONLIST = "#personList"
+PROPERTIES = "properties"
 PROPERTY_SOURCES = "property_sources"
 PUBLICATION = "publication"
 PUBLICATIONS = "publications"
 PUBLICATIONLIST = "#publicationList"
+REQUIRED = "required"
 SAMEAS = "sameAs"
 SOURCE = "source"
 SOURCES = "sources"

diff --git a/datalad_catalog/schema_utils.py b/datalad_catalog/schema_utils.py
new file mode 100644
index 00000000..0b013679
--- /dev/null
+++ b/datalad_catalog/schema_utils.py
@@ -0,0 +1,136 @@
+import datalad_catalog.constants as cnst
+from datalad_catalog.validate import get_schema_store
+from datalad_catalog.utils import get_gitconfig
+from datetime import datetime
+
+
+SCHEMA_TYPES = [
+    "dataset",
+    "file",
+    "authors",
+    "metadata_sources",
+]
+
+
+def get_schema_item(
+    catalog=None,
+    item_type: str = "dataset",
+    required_only: bool = False,
+):
+    """Returns an empty metadata item of the specified type"""
+    # only existing schema items are allowed
+    assert item_type in SCHEMA_TYPES
+    # get the full store for catalog (if provided) or package
+    store = get_schema_store(catalog)
+    # get the desired schema
+    schema = store[cnst.CATALOG_SCHEMA_IDS[item_type]]
+    return _schema_process_property(item_type, schema, store, required_only)
+
+
+def _schema_process_property(
+    item_type: str, item: dict, store: dict, required_only: bool
+):
+    # First, process $ref
+    if cnst.DOLLARREF in item:
+        ref = item[cnst.DOLLARREF]
+        if ref not in store:
+            return None
+        else:
+            item = store[ref]
+    # process type null
+    if cnst.TYPE not in item or not item[cnst.TYPE]:
+        return None
+    # grab type specifics
+    tp = item[cnst.TYPE]
+    rq = item.get(cnst.REQUIRED, [])
+    # process multiple types (prefer easiest elements)
+    if tp and isinstance(tp, list):
+        if "string" in tp:
+            tp = "string"
+        elif "number" in tp:
+            tp = "number"
+        else:
+            tp = tp[0]
+    # process type object
+    if tp == "object":
+        if cnst.PROPERTIES not in item:
+            return {}
+        new_item = {}
+        for key, value in item[cnst.PROPERTIES].items():
+            if required_only and key not in rq:
+                continue
+            # for files and datasets, set correct 'type'
+            if key == cnst.TYPE:
+                new_item[key] = item_type
+            else:
+                new_item[key] = _schema_process_property(
+                    item_type, value, store, required_only
+                )
+        return new_item
+    # type string
+    if tp == "string":
+        return ""
+    # type array
+    if tp == "array":
+        if cnst.ITEMS in item:
+            return [
+                _schema_process_property(
+                    "", item[cnst.ITEMS], store, required_only
+                )
+            ]
+        else:
+            return []
+    # type number
+    if tp == "number":
+        return 0.0
+    # if all else fails
+    return None
+
+
+def get_metadata_sources(name: str, version: str, required_only: bool = False):
+    """Create metadata_sources dict required by catalog schema"""
+    metadata_sources = get_schema_item(
+        item_type="metadata_sources",
+        required_only=required_only,
+    )
+    metadata_sources[cnst.SOURCES][0][cnst.SOURCE_NAME] = name
+    metadata_sources[cnst.SOURCES][0][cnst.SOURCE_VERSION] = version
+    if not required_only:
+        metadata_sources[cnst.SOURCES][0][
+            cnst.SOURCE_TIME
+        ] = datetime.now().timestamp()
+        metadata_sources[cnst.SOURCES][0]["agent_email"] = get_gitconfig(
+            "user.email"
+        )
+        metadata_sources[cnst.SOURCES][0]["agent_name"] = get_gitconfig(
+            "user.name"
+        )
+
+    return metadata_sources
+
+
+def get_metadata_item(
+    item_type,
+    dataset_id: str,
+    dataset_version: str,
+    source_name: str,
+    source_version: str,
+    path=None,
+    required_only: bool = True,
+):
+    assert item_type in ("dataset", "file")
+    if item_type == "file" and not path:
+        raise ValueError("Path is a required field for item type 'file'")
+    meta_item = get_schema_item(
+        item_type=item_type,
+        required_only=required_only,
+    )
+    meta_item[cnst.DATASET_ID] = dataset_id
+    meta_item[cnst.DATASET_VERSION] = dataset_version
+    if item_type == "file":
+        meta_item[cnst.PATH] = path
+    meta_item[cnst.METADATA_SOURCES] = get_metadata_sources(
+        source_name,
+        source_version,
+    )
+    return meta_item

diff --git a/datalad_catalog/tests/test_schema_utils.py b/datalad_catalog/tests/test_schema_utils.py
new file mode 100644
index 00000000..3a35e211
--- /dev/null
+++ b/datalad_catalog/tests/test_schema_utils.py
@@ -0,0 +1,127 @@
+from datalad_catalog.schema_utils import (
+    get_schema_item,
+    get_metadata_item,
+)
+
+
+ds = {
+    "type": "dataset",
+    "dataset_id": "",
+    "dataset_version": "",
+    "name": "",
+    "short_name": "",
+    "description": "",
+    "doi": "",
+    "url": "",
+    "license": {"name": "", "url": ""},
+    "authors": [
+        {
+            "givenName": "",
+            "familyName": "",
+            "name": "",
+            "email": "",
+            "honorificSuffix": "",
+            "identifiers": [{"type": "", "identifier": ""}],
+        }
+    ],
+    "access_request_contact": {
+        "givenName": "",
+        "familyName": "",
+        "name": "",
+        "email": "",
+        "honorificSuffix": "",
+        "identifiers": [{"type": "", "identifier": ""}],
+    },
+    "access_request_url": "",
+    "keywords": [""],
+    "funding": [{"name": "", "identifier": "", "description": ""}],
+    "publications": [
+        {
+            "type": "",
+            "title": "",
+            "doi": "",
+            "datePublished": "",
+            "publicationOutlet": "",
+            "authors": [
+                {
+                    "givenName": "",
+                    "familyName": "",
+                    "name": "",
+                    "email": "",
+                    "honorificSuffix": "",
+                    "identifiers": [{"type": "", "identifier": ""}],
+                }
+            ],
+        }
+    ],
+    "subdatasets": [
+        {"dataset_id": "", "dataset_version": "", "dataset_path": ""}
+    ],
+    "metadata_sources": {
+        "key_source_map": {},
+        "sources": [
+            {
+                "source_name": "",
+                "source_version": "",
+                "source_parameter": {},
+                "source_time": 0.0,
+                "agent_name": "",
+                "agent_email": "",
+            }
+        ],
+    },
+    "additional_display": [{"name": "", "content": {}, "icon": ""}],
+    "top_display": [{"name": "", "value": ""}],
+}
+
+fl = {
+    "type": "file",
+    "dataset_id": "",
+    "dataset_version": "",
+    "path": "",
+    "contentbytesize": 0.0,
+    "url": "",
+    "metadata_sources": {
+        "key_source_map": {},
+        "sources": [
+            {
+                "source_name": "",
+                "source_version": "",
+                "source_parameter": {},
+                "source_time": 0.0,
+                "agent_name": "",
+                "agent_email": "",
+            }
+        ],
+    },
+    "additional_display": [{"name": "", "content": {}}],
+}
+
+
+def test_get_dicts():
+    tp = "dataset"
+    assert get_schema_item(item_type=tp) == ds
+    tp = "file"
+    assert get_schema_item(item_type=tp) == fl
+
+
+def test_get_meta_items():
+    """"""
+    get_metadata_item(
+        item_type="file",
+        dataset_id="my_ds_id",
+        dataset_version="my_ds_version",
+        source_name="wackystuff",
+        source_version="wack.point.zero",
+        path="testy/festy/bob.txt",
+        required_only=True,
+    )
+    get_metadata_item(
+        item_type="dataset",
+        dataset_id="my_ds_id",
+        dataset_version="my_ds_version",
+        source_name="wackystuff",
+        source_version="wack.point.zero",
+        path=None,
+        required_only=True,
+    )

diff --git a/datalad_catalog/utils.py b/datalad_catalog/utils.py
index 7de3950d..0e052f0e 100644
--- a/datalad_catalog/utils.py
+++ b/datalad_catalog/utils.py
@@ -1,8 +1,9 @@
 import hashlib
 import json
 from pathlib import Path
-import sys
 import shutil
+import subprocess
+import sys
 import yaml
 
 from datalad.support.exceptions import InsufficientArgumentsError
@@ -228,3 +229,13 @@ def write_jsonline_to_file(filename, line):
     with open(filename, "a") as f:
         json.dump(line, f, cls=jsEncoder)
         f.write("\n")
+
+
+def get_gitconfig(conf_name):
+    """Return the config of local git installation"""
+    result = (
+        subprocess.run(["git", "config", conf_name], capture_output=True)
+        .stdout.decode()
+        .rstrip()
+    )
+    return result
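For orientation, a minimal usage sketch of the new 'schema_utils' helpers added
above (all argument values are made up; the calls mirror the signatures
introduced in this patch):

    from datalad_catalog.schema_utils import get_schema_item, get_metadata_item

    # empty template with the full structure of the package's dataset schema;
    # pass a catalog object instead of None to use that catalog's schema store
    template = get_schema_item(
        catalog=None, item_type="dataset", required_only=False
    )

    # minimal valid 'file' item with the required fields populated; because
    # get_metadata_sources defaults to required_only=False internally, the
    # source entry also receives a timestamp and the git user name/email
    file_item = get_metadata_item(
        item_type="file",
        dataset_id="deabeb9b-7a37-4062-a1e0-8fcef7909609",
        dataset_version="0321dbde969d2f5d6b533e35b5c5c51ac0b15758",
        source_name="example_source",
        source_version="0.1.0",
        path="some/relative/path.txt",
    )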
From 160a22f45817c2f24e72a175fad1b837c25af0ea Mon Sep 17 00:00:00 2001
From: Stephan Heunis
Date: Wed, 4 Oct 2023 23:05:58 +0200
Subject: [PATCH 3/4] Update tests after changing required schema fields in
 b8e38ad22e76c08729d3393f06655c1b65344c3f

---
 datalad_catalog/tests/test_add.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/datalad_catalog/tests/test_add.py b/datalad_catalog/tests/test_add.py
index f29fde90..e84aaa63 100644
--- a/datalad_catalog/tests/test_add.py
+++ b/datalad_catalog/tests/test_add.py
@@ -80,8 +80,8 @@ def test_add_from_file_faulty(demo_catalog, test_data):
 
 def test_add_from_stdin(monkeypatch, demo_catalog):
     """Add catalog metadata from stdin"""
-    mdata1 = '{"dataset_id": "deabeb9b-7a37-4062-a1e0-8fcef7909609", "dataset_version": "0321dbde969d2f5d6b533e35b5c5c51ac0b15758", "type": "dataset", "name": "test_name"}'
-    mdata2 = '{"dataset_id": "3344ffv5-7a37-4062-a1e0-8fcef7909609", "dataset_version": "8888dbde969d2f5d6b533e35b5c5c51ac0b15758", "type": "dataset", "name": "test_name"}'
+    mdata1 = '{"dataset_id": "deabeb9b-7a37-4062-a1e0-8fcef7909609", "dataset_version": "0321dbde969d2f5d6b533e35b5c5c51ac0b15758", "type": "dataset", "metadata_sources": {"key_source_map": {}, "sources": [{"source_name": "", "source_version": ""}]}}'
+    mdata2 = '{"dataset_id": "3344ffv5-7a37-4062-a1e0-8fcef7909609", "dataset_version": "8888dbde969d2f5d6b533e35b5c5c51ac0b15758", "type": "dataset", "metadata_sources": {"key_source_map": {}, "sources": [{"source_name": "", "source_version": ""}]}}'
     content = io.StringIO(json.dumps(mdata1) + "\n" + json.dumps(mdata2))
     monkeypatch.setattr("sys.stdin", content)
     res = catalog_add(
@@ -101,7 +101,7 @@ def test_add_from_json_str(demo_catalog, test_data):
     """Add catalog metadata from a json serialized string"""
-    mdata = '{"dataset_id": "deabeb9b-7a37-4062-a1e0-8fcef7909609", "dataset_version": "0321dbde969d2f5d6b533e35b5c5c51ac0b15758", "type": "dataset", "name": "test_name"}'
+    mdata = '{"dataset_id": "deabeb9b-7a37-4062-a1e0-8fcef7909609", "dataset_version": "0321dbde969d2f5d6b533e35b5c5c51ac0b15758", "type": "dataset", "metadata_sources": {"key_source_map": {}, "sources": [{"source_name": "", "source_version": ""}]}}'
     res = catalog_add(
         catalog=demo_catalog,
         metadata=mdata,
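The hand-written JSON strings in the updated tests spell out the new minimum:
'name' is gone and 'metadata_sources' is present. A record of the same shape
can also be produced with the helper from PATCH 2 and serialized for the
'metadata' argument of catalog_add (a sketch with placeholder values; the
generated record additionally carries source_time and git agent details):

    import json
    from datalad_catalog.schema_utils import get_metadata_item

    record = get_metadata_item(
        item_type="dataset",
        dataset_id="deabeb9b-7a37-4062-a1e0-8fcef7909609",
        dataset_version="0321dbde969d2f5d6b533e35b5c5c51ac0b15758",
        source_name="example_source",
        source_version="0.1.0",
    )
    # single JSON line, e.g. for catalog_add(metadata=...)
    mdata = json.dumps(record)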
From ba656ce3897b46524d2f82f9a30bd999d1a14a73 Mon Sep 17 00:00:00 2001
From: Stephan Heunis
Date: Wed, 4 Oct 2023 23:15:32 +0200
Subject: [PATCH 4/4] fix spelling errors

---
 datalad_catalog/translate.py     | 2 +-
 docs/source/metadata_formats.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/datalad_catalog/translate.py b/datalad_catalog/translate.py
index b7401c89..52d305ee 100644
--- a/datalad_catalog/translate.py
+++ b/datalad_catalog/translate.py
@@ -68,7 +68,7 @@ class MetaTranslate(ValidatedInterface):
     The to-be-translated-to schema version is determined from the catalog,
     if provided, otherwise from the latest supported version of the package
     installation.
-    Tranlators should be provided and exposed as a datalad entry point using the group:
+    Translators should be provided and exposed as a datalad entry point using the group:
     'datalad.metadata.translators'.
 
     Available translators will be filtered based on own matching criteria (such as

diff --git a/docs/source/metadata_formats.rst b/docs/source/metadata_formats.rst
index fb7d539f..193d91ee 100644
--- a/docs/source/metadata_formats.rst
+++ b/docs/source/metadata_formats.rst
@@ -96,7 +96,7 @@ implement custom translators.
 Before translation from a specific source will work, the extractor-specific
 translator should be provided and exposed as an entry point (via a DataLad
 extension) as part of the ``datalad.metadata.translators`` group.
 
-Then ``datalad-catalog`` will be able to find the correct traslator automatically
+Then ``datalad-catalog`` will be able to find the correct translator automatically
 based on unique properties in a MetaLad-extracted metadata object. This is done
 by applying matching criteria that is specified by the translator, and running a
 ``translate()`` method if the match was successful.
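The documentation hunk above describes the translator contract only in prose: a
translator is exposed through the 'datalad.metadata.translators' entry point
group, advertises its own matching criteria, and provides a translate() method.
Purely as a schematic sketch (the class and method names here are invented for
illustration and are not the actual datalad-catalog base-class API), such a
translator could be shaped like this:

    class ExampleExtractorTranslator:
        """Schematic translator for output of a hypothetical
        'example_extractor'; it would be registered by an extension under
        the entry point group 'datalad.metadata.translators'."""

        def match(self, source_name, source_version, **kwargs):
            # matching criteria: only claim records from 'example_extractor'
            return source_name == "example_extractor"

        def translate(self, metadata):
            # map the extracted record onto the catalog's dataset schema
            return {
                "type": "dataset",
                "dataset_id": metadata["dataset_id"],
                "dataset_version": metadata["dataset_version"],
                "metadata_sources": {
                    "key_source_map": {},
                    "sources": [
                        {
                            "source_name": "example_extractor",
                            "source_version": "0.1.0",
                        }
                    ],
                },
            }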