From b8e38ad22e76c08729d3393f06655c1b65344c3f Mon Sep 17 00:00:00 2001
From: Stephan Heunis
Date: Wed, 4 Oct 2023 22:49:20 +0200
Subject: [PATCH 1/4] Updates required fields in dataset and file schemas

This removes 'name' from the list of required fields of the dataset
schema and adds 'metadata_sources' to the list of required fields of
both the dataset and file schemas
---
 datalad_catalog/catalog/schema/jsonschema_dataset.json | 2 +-
 datalad_catalog/catalog/schema/jsonschema_file.json    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/datalad_catalog/catalog/schema/jsonschema_dataset.json b/datalad_catalog/catalog/schema/jsonschema_dataset.json
index 642f90db..9adcca39 100644
--- a/datalad_catalog/catalog/schema/jsonschema_dataset.json
+++ b/datalad_catalog/catalog/schema/jsonschema_dataset.json
@@ -288,6 +288,6 @@
             "maxItems": 5
         }
     },
-    "required": [ "type", "dataset_id", "dataset_version", "name"]
+    "required": [ "type", "dataset_id", "dataset_version", "metadata_sources"]
 }

diff --git a/datalad_catalog/catalog/schema/jsonschema_file.json b/datalad_catalog/catalog/schema/jsonschema_file.json
index 18fb3ece..3b4e803b 100644
--- a/datalad_catalog/catalog/schema/jsonschema_file.json
+++ b/datalad_catalog/catalog/schema/jsonschema_file.json
@@ -63,6 +63,6 @@
             "uniqueItems": true
         }
     },
-    "required": [ "type", "dataset_id", "dataset_version", "path"]
+    "required": [ "type", "dataset_id", "dataset_version", "path", "metadata_sources"]
 }
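Note on the schema change above: a dataset record no longer needs a 'name', but
must now carry a 'metadata_sources' object. As a rough illustration (all
identifier and source values below are placeholders, not taken from the patch),
the smallest dataset record accepted by the updated schema looks like this:

    # illustrative minimal dataset record under the updated schema;
    # 'name' is intentionally absent, 'metadata_sources' is now mandatory
    minimal_dataset_record = {
        "type": "dataset",
        "dataset_id": "00000000-0000-0000-0000-000000000000",
        "dataset_version": "0000000000000000000000000000000000000000",
        "metadata_sources": {
            "key_source_map": {},
            "sources": [
                {"source_name": "example_source", "source_version": "0.1.0"}
            ],
        },
    }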
From 74bebdd9ea7a37cb660efa88d12b58dc9155561f Mon Sep 17 00:00:00 2001
From: Stephan Heunis
Date: Wed, 4 Oct 2023 22:53:29 +0200
Subject: [PATCH 2/4] Add ability to get the base structure of a valid
 metadata item

This adds the 'schema_utils' module that provides functionality for:
- getting an empty dict with the exact structure of one of the schemas
  of a catalog (specific to a catalog or to the package itself)
- getting a valid metadata item of type 'file' or 'dataset' with the
  minimum required fields, and populating these fields via arguments
---
 datalad_catalog/constants.py               |   4 +
 datalad_catalog/schema_utils.py            | 136 +++++++++++++++++++++
 datalad_catalog/tests/test_schema_utils.py | 127 +++++++++++++++++++
 datalad_catalog/utils.py                   |  13 +-
 4 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 datalad_catalog/schema_utils.py
 create mode 100644 datalad_catalog/tests/test_schema_utils.py

diff --git a/datalad_catalog/constants.py b/datalad_catalog/constants.py
index fb6c3396..72a55871 100644
--- a/datalad_catalog/constants.py
+++ b/datalad_catalog/constants.py
@@ -24,6 +24,7 @@
 DESCRIPTION = "description"
 DOI = "doi"
 DOLLARID = "$id"
+DOLLARREF = "$ref"
 DIRECTORY = "directory"
 DATASET_ID = "dataset_id"
 DATASET_PATH = "dataset_path"
@@ -44,6 +45,7 @@
 EXTRACTORS_USED = "extractors_used"
 HASPART = "hasPart"
 IDENTIFIER = "identifier"
+ITEMS = "items"
 KEY_SOURCE_MAP = "key_source_map"
 LOGO_PATH = "logo_path"
 METADATA_SOURCES = "metadata_sources"
@@ -51,10 +53,12 @@
 ORIGIN = "origin"
 PATH = "path"
 PERSONLIST = "#personList"
+PROPERTIES = "properties"
 PROPERTY_SOURCES = "property_sources"
 PUBLICATION = "publication"
 PUBLICATIONS = "publications"
 PUBLICATIONLIST = "#publicationList"
+REQUIRED = "required"
 SAMEAS = "sameAs"
 SOURCE = "source"
 SOURCES = "sources"

diff --git a/datalad_catalog/schema_utils.py b/datalad_catalog/schema_utils.py
new file mode 100644
index 00000000..0b013679
--- /dev/null
+++ b/datalad_catalog/schema_utils.py
@@ -0,0 +1,136 @@
+import datalad_catalog.constants as cnst
+from datalad_catalog.validate import get_schema_store
+from datalad_catalog.utils import get_gitconfig
+from datetime import datetime
+
+
+SCHEMA_TYPES = [
+    "dataset",
+    "file",
+    "authors",
+    "metadata_sources",
+]
+
+
+def get_schema_item(
+    catalog=None,
+    item_type: str = "dataset",
+    required_only: bool = False,
+):
+    """Returns an empty metadata item of the specified type"""
+    # only existing schema items are allowed
+    assert item_type in SCHEMA_TYPES
+    # get the full store for catalog (if provided) or package
+    store = get_schema_store(catalog)
+    # get the desired schema
+    schema = store[cnst.CATALOG_SCHEMA_IDS[item_type]]
+    return _schema_process_property(item_type, schema, store, required_only)
+
+
+def _schema_process_property(
+    item_type: str, item: dict, store: dict, required_only: bool
+):
+    # First, process $ref
+    if cnst.DOLLARREF in item:
+        ref = item[cnst.DOLLARREF]
+        if ref not in store:
+            return None
+        else:
+            item = store[ref]
+    # process type null
+    if cnst.TYPE not in item or not item[cnst.TYPE]:
+        return None
+    # grab type specifics
+    tp = item[cnst.TYPE]
+    rq = item.get(cnst.REQUIRED, [])
+    # process multiple types (prefer easiest elements)
+    if tp and isinstance(tp, list):
+        if "string" in tp:
+            tp = "string"
+        elif "number" in tp:
+            tp = "number"
+        else:
+            tp = tp[0]
+    # process type object
+    if tp == "object":
+        if cnst.PROPERTIES not in item:
+            return {}
+        new_item = {}
+        for key, value in item[cnst.PROPERTIES].items():
+            if required_only and key not in rq:
+                continue
+            # for files and datasets, set correct 'type'
+            if key == cnst.TYPE:
+                new_item[key] = item_type
+            else:
+                new_item[key] = _schema_process_property(
+                    item_type, value, store, required_only
+                )
+        return new_item
+    # type string
+    if tp == "string":
+        return ""
+    # type array
+    if tp == "array":
+        if cnst.ITEMS in item:
+            return [
+                _schema_process_property(
+                    "", item[cnst.ITEMS], store, required_only
+                )
+            ]
+        else:
+            return []
+    # type number
+    if tp == "number":
+        return 0.0
+    # if all else fails
+    return None
+
+
+def get_metadata_sources(name: str, version: str, required_only: bool = False):
+    """Create metadata_sources dict required by catalog schema"""
+    metadata_sources = get_schema_item(
+        item_type="metadata_sources",
+        required_only=required_only,
+    )
+    metadata_sources[cnst.SOURCES][0][cnst.SOURCE_NAME] = name
+    metadata_sources[cnst.SOURCES][0][cnst.SOURCE_VERSION] = version
+    if not required_only:
+        metadata_sources[cnst.SOURCES][0][
+            cnst.SOURCE_TIME
+        ] = datetime.now().timestamp()
+        metadata_sources[cnst.SOURCES][0]["agent_email"] = get_gitconfig(
+            "user.email"
+        )
+        metadata_sources[cnst.SOURCES][0]["agent_name"] = get_gitconfig(
+            "user.name"
+        )
+
+    return metadata_sources
+
+
+def get_metadata_item(
+    item_type,
+    dataset_id: str,
+    dataset_version: str,
+    source_name: str,
+    source_version: str,
+    path=None,
+    required_only: bool = True,
+):
+    assert item_type in ("dataset", "file")
+    if item_type == "file" and not path:
+        raise ValueError("Path is a required field for item type 'file'")
+    meta_item = get_schema_item(
+        item_type=item_type,
+        required_only=required_only,
+    )
+    meta_item[cnst.DATASET_ID] = dataset_id
+    meta_item[cnst.DATASET_VERSION] = dataset_version
+    if item_type == "file":
+        meta_item[cnst.PATH] = path
+    meta_item[cnst.METADATA_SOURCES] = get_metadata_sources(
+        source_name,
+        source_version,
+    )
+    return meta_item

diff --git a/datalad_catalog/tests/test_schema_utils.py b/datalad_catalog/tests/test_schema_utils.py
new file mode 100644
index 00000000..3a35e211
--- /dev/null
+++ b/datalad_catalog/tests/test_schema_utils.py
@@ -0,0 +1,127 @@
+from datalad_catalog.schema_utils import (
+    get_schema_item,
+    get_metadata_item,
+)
+
+
+ds = {
+    "type": "dataset",
+    "dataset_id": "",
+    "dataset_version": "",
+    "name": "",
+    "short_name": "",
+    "description": "",
+    "doi": "",
+    "url": "",
+    "license": {"name": "", "url": ""},
+    "authors": [
+        {
+            "givenName": "",
+            "familyName": "",
+            "name": "",
+            "email": "",
+            "honorificSuffix": "",
+            "identifiers": [{"type": "", "identifier": ""}],
+        }
+    ],
+    "access_request_contact": {
+        "givenName": "",
+        "familyName": "",
+        "name": "",
+        "email": "",
+        "honorificSuffix": "",
+        "identifiers": [{"type": "", "identifier": ""}],
+    },
+    "access_request_url": "",
+    "keywords": [""],
+    "funding": [{"name": "", "identifier": "", "description": ""}],
+    "publications": [
+        {
+            "type": "",
+            "title": "",
+            "doi": "",
+            "datePublished": "",
+            "publicationOutlet": "",
+            "authors": [
+                {
+                    "givenName": "",
+                    "familyName": "",
+                    "name": "",
+                    "email": "",
+                    "honorificSuffix": "",
+                    "identifiers": [{"type": "", "identifier": ""}],
+                }
+            ],
+        }
+    ],
+    "subdatasets": [
+        {"dataset_id": "", "dataset_version": "", "dataset_path": ""}
+    ],
+    "metadata_sources": {
+        "key_source_map": {},
+        "sources": [
+            {
+                "source_name": "",
+                "source_version": "",
+                "source_parameter": {},
+                "source_time": 0.0,
+                "agent_name": "",
+                "agent_email": "",
+            }
+        ],
+    },
+    "additional_display": [{"name": "", "content": {}, "icon": ""}],
+    "top_display": [{"name": "", "value": ""}],
+}
+
+fl = {
+    "type": "file",
+    "dataset_id": "",
+    "dataset_version": "",
+    "path": "",
+    "contentbytesize": 0.0,
+    "url": "",
+    "metadata_sources": {
+        "key_source_map": {},
+        "sources": [
+            {
+                "source_name": "",
+                "source_version": "",
+                "source_parameter": {},
+                "source_time": 0.0,
+                "agent_name": "",
+                "agent_email": "",
+            }
+        ],
+    },
+    "additional_display": [{"name": "", "content": {}}],
+}
+
+
+def test_get_dicts():
+    tp = "dataset"
+    assert get_schema_item(item_type=tp) == ds
+    tp = "file"
+    assert get_schema_item(item_type=tp) == fl
+
+
+def test_get_meta_items():
+    """"""
+    get_metadata_item(
+        item_type="file",
+        dataset_id="my_ds_id",
+        dataset_version="my_ds_version",
+        source_name="wackystuff",
+        source_version="wack.point.zero",
+        path="testy/festy/bob.txt",
+        required_only=True,
+    )
+    get_metadata_item(
+        item_type="dataset",
+        dataset_id="my_ds_id",
+        dataset_version="my_ds_version",
+        source_name="wackystuff",
+        source_version="wack.point.zero",
+        path=None,
+        required_only=True,
+    )

diff --git a/datalad_catalog/utils.py b/datalad_catalog/utils.py
index 7de3950d..0e052f0e 100644
--- a/datalad_catalog/utils.py
+++ b/datalad_catalog/utils.py
@@ -1,8 +1,9 @@
 import hashlib
 import json
 from pathlib import Path
-import sys
 import shutil
+import subprocess
+import sys
 import yaml
 
 from datalad.support.exceptions import InsufficientArgumentsError
@@ -228,3 +229,13 @@ def write_jsonline_to_file(filename, line):
     with open(filename, "a") as f:
         json.dump(line, f, cls=jsEncoder)
         f.write("\n")
+
+
+def get_gitconfig(conf_name):
+    """Return the config of local git installation"""
+    result = (
+        subprocess.run(["git", "config", conf_name], capture_output=True)
+        .stdout.decode()
+        .rstrip()
+    )
+    return result
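For orientation, a minimal usage sketch of the new 'schema_utils' helpers added
above (all argument values are made up; the calls mirror the signatures
introduced in this patch):

    from datalad_catalog.schema_utils import get_schema_item, get_metadata_item

    # empty template with the full structure of the package's dataset schema;
    # pass a catalog object instead of None to use that catalog's schema store
    template = get_schema_item(
        catalog=None, item_type="dataset", required_only=False
    )

    # minimal valid 'file' item with the required fields populated; because
    # get_metadata_sources defaults to required_only=False internally, the
    # source entry also receives a timestamp and the git user name/email
    file_item = get_metadata_item(
        item_type="file",
        dataset_id="deabeb9b-7a37-4062-a1e0-8fcef7909609",
        dataset_version="0321dbde969d2f5d6b533e35b5c5c51ac0b15758",
        source_name="example_source",
        source_version="0.1.0",
        path="some/relative/path.txt",
    )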
From 160a22f45817c2f24e72a175fad1b837c25af0ea Mon Sep 17 00:00:00 2001
From: Stephan Heunis
Date: Wed, 4 Oct 2023 23:05:58 +0200
Subject: [PATCH 3/4] Update tests after changing required schema fields in
 b8e38ad22e76c08729d3393f06655c1b65344c3f

---
 datalad_catalog/tests/test_add.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/datalad_catalog/tests/test_add.py b/datalad_catalog/tests/test_add.py
index f29fde90..e84aaa63 100644
--- a/datalad_catalog/tests/test_add.py
+++ b/datalad_catalog/tests/test_add.py
@@ -80,8 +80,8 @@ def test_add_from_file_faulty(demo_catalog, test_data):
 
 def test_add_from_stdin(monkeypatch, demo_catalog):
     """Add catalog metadata from stdin"""
-    mdata1 = '{"dataset_id": "deabeb9b-7a37-4062-a1e0-8fcef7909609", "dataset_version": "0321dbde969d2f5d6b533e35b5c5c51ac0b15758", "type": "dataset", "name": "test_name"}'
-    mdata2 = '{"dataset_id": "3344ffv5-7a37-4062-a1e0-8fcef7909609", "dataset_version": "8888dbde969d2f5d6b533e35b5c5c51ac0b15758", "type": "dataset", "name": "test_name"}'
+    mdata1 = '{"dataset_id": "deabeb9b-7a37-4062-a1e0-8fcef7909609", "dataset_version": "0321dbde969d2f5d6b533e35b5c5c51ac0b15758", "type": "dataset", "metadata_sources": {"key_source_map": {}, "sources": [{"source_name": "", "source_version": ""}]}}'
+    mdata2 = '{"dataset_id": "3344ffv5-7a37-4062-a1e0-8fcef7909609", "dataset_version": "8888dbde969d2f5d6b533e35b5c5c51ac0b15758", "type": "dataset", "metadata_sources": {"key_source_map": {}, "sources": [{"source_name": "", "source_version": ""}]}}'
     content = io.StringIO(json.dumps(mdata1) + "\n" + json.dumps(mdata2))
     monkeypatch.setattr("sys.stdin", content)
     res = catalog_add(
@@ -101,7 +101,7 @@ def test_add_from_json_str(demo_catalog, test_data):
     """Add catalog metadata from a json serialized string"""
-    mdata = '{"dataset_id": "deabeb9b-7a37-4062-a1e0-8fcef7909609", "dataset_version": "0321dbde969d2f5d6b533e35b5c5c51ac0b15758", "type": "dataset", "name": "test_name"}'
+    mdata = '{"dataset_id": "deabeb9b-7a37-4062-a1e0-8fcef7909609", "dataset_version": "0321dbde969d2f5d6b533e35b5c5c51ac0b15758", "type": "dataset", "metadata_sources": {"key_source_map": {}, "sources": [{"source_name": "", "source_version": ""}]}}'
     res = catalog_add(
         catalog=demo_catalog,
         metadata=mdata,
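The hand-written JSON strings in the updated tests spell out the new minimum:
'name' is gone and 'metadata_sources' is present. A record of the same shape
can also be produced with the helper from PATCH 2 and serialized for the
'metadata' argument of catalog_add (a sketch with placeholder values; the
generated record additionally carries source_time and git agent details):

    import json
    from datalad_catalog.schema_utils import get_metadata_item

    record = get_metadata_item(
        item_type="dataset",
        dataset_id="deabeb9b-7a37-4062-a1e0-8fcef7909609",
        dataset_version="0321dbde969d2f5d6b533e35b5c5c51ac0b15758",
        source_name="example_source",
        source_version="0.1.0",
    )
    # single JSON line, e.g. for catalog_add(metadata=...)
    mdata = json.dumps(record)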
From ba656ce3897b46524d2f82f9a30bd999d1a14a73 Mon Sep 17 00:00:00 2001
From: Stephan Heunis
Date: Wed, 4 Oct 2023 23:15:32 +0200
Subject: [PATCH 4/4] fix spelling errors

---
 datalad_catalog/translate.py     | 2 +-
 docs/source/metadata_formats.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/datalad_catalog/translate.py b/datalad_catalog/translate.py
index b7401c89..52d305ee 100644
--- a/datalad_catalog/translate.py
+++ b/datalad_catalog/translate.py
@@ -68,7 +68,7 @@ class MetaTranslate(ValidatedInterface):
     The to-be-translated-to schema version is determined from the catalog,
     if provided, otherwise from the latest supported version of the package
     installation.
-    Tranlators should be provided and exposed as a datalad entry point using the group:
+    Translators should be provided and exposed as a datalad entry point using the group:
     'datalad.metadata.translators'.
 
     Available translators will be filtered based on own matching criteria (such as

diff --git a/docs/source/metadata_formats.rst b/docs/source/metadata_formats.rst
index fb7d539f..193d91ee 100644
--- a/docs/source/metadata_formats.rst
+++ b/docs/source/metadata_formats.rst
@@ -96,7 +96,7 @@ implement custom translators.
 Before translation from a specific source will work, the extractor-specific
 translator should be provided and exposed as an entry point (via a DataLad
 extension) as part of the ``datalad.metadata.translators`` group.
 
-Then ``datalad-catalog`` will be able to find the correct traslator automatically
+Then ``datalad-catalog`` will be able to find the correct translator automatically
 based on unique properties in a MetaLad-extracted metadata object. This is done
 by applying matching criteria that is specified by the translator, and running a
 ``translate()`` method if the match was successful.
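The documentation hunk above describes the translator contract only in prose: a
translator is exposed through the 'datalad.metadata.translators' entry point
group, advertises its own matching criteria, and provides a translate() method.
Purely as a schematic sketch (the class and method names here are invented for
illustration and are not the actual datalad-catalog base-class API), such a
translator could be shaped like this:

    class ExampleExtractorTranslator:
        """Schematic translator for output of a hypothetical
        'example_extractor'; it would be registered by an extension under
        the entry point group 'datalad.metadata.translators'."""

        def match(self, source_name, source_version, **kwargs):
            # matching criteria: only claim records from 'example_extractor'
            return source_name == "example_extractor"

        def translate(self, metadata):
            # map the extracted record onto the catalog's dataset schema
            return {
                "type": "dataset",
                "dataset_id": metadata["dataset_id"],
                "dataset_version": metadata["dataset_version"],
                "metadata_sources": {
                    "key_source_map": {},
                    "sources": [
                        {
                            "source_name": "example_extractor",
                            "source_version": "0.1.0",
                        }
                    ],
                },
            }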