Merge branch 'main' into colab

matthiasprobst · Jan 4, 2024 · 57cf28a · 57cf28a
2 parents 593e4b7 + cc7078b
commit 57cf28a
Show file tree

Hide file tree

Showing 10 changed files with 335 additions and 29 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,12 @@
 
 Log of changes in the versions
 
+## v1.2.1
+- Add codemeta namespace
+- Improved json-ld export
+- Updated qudt namespace
+- The Colab notebook is now managed on a separate branch (`colab`); the README badge link points to that branch
+
 ## v1.2.0
 - Improved assignment of IRI to attributes
 - Export of a JSON-LD file possible

diff --git a/README.md b/README.md
@@ -78,7 +78,7 @@ Current implementation highlights in the modules:
 
 A quickstart notebook can be tested by clicking on the following badge:
 
-[![Open Quickstart Notebook](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/matthiasprobst/h5RDMtoolbox/blob/main/docs/colab/quickstart.ipynb)
+[![Open Quickstart Notebook](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/matthiasprobst/h5RDMtoolbox/blob/colab/docs/colab/quickstart.ipynb)
 
 ## Documentation
 

diff --git a/codemeta.json b/codemeta.json
@@ -4,7 +4,7 @@
     "license": "https://spdx.org/licenses/MIT",
     "codeRepository": "git+https://github.com/matthiasprobst/h5RDMtoolbox.git",
     "name": "h5RDMtoolbox",
-    "version": "1.2.2",
+    "version": "1.2.1",
     "description": "Supporting a FAIR Research Data lifecycle using Python and HDF5.",
     "applicationCategory": "Engineering",
     "programmingLanguage": [

diff --git a/h5rdmtoolbox/namespace.py b/h5rdmtoolbox/namespace.py
@@ -1,6 +1,7 @@
+from .wrapper.namespaces._codemeta_namespace import CODEMETA
 from .wrapper.namespaces._m4i_namespace import M4I
 from .wrapper.namespaces._obo_namespace import OBO
 from .wrapper.namespaces._qudt_quantitykind_namespace import QUDT_QUANTITYKIND
 from .wrapper.namespaces._qudt_unit_namespace import QUDT_UNIT
 
-__all__ = ['M4I', 'OBO', 'QUDT_UNIT', 'QUDT_QUANTITYKIND']
+__all__ = ['M4I', 'OBO', 'QUDT_UNIT', 'QUDT_QUANTITYKIND', 'CODEMETA']
diff --git a/h5rdmtoolbox/wrapper/h5attr.py b/h5rdmtoolbox/wrapper/h5attr.py
@@ -1,6 +1,7 @@
 import ast
 import h5py
 import json
+import numpy as np
 import pint
 import rdflib
 from h5py._hl.base import with_phil
@@ -110,10 +111,13 @@ def _parse_return_value(_id, ret):
                     # might be a list object
                     try:
                         return ast.literal_eval(ret)
+                        # return ast.literal_eval(ret.replace(' ', ', '))
                     except (ValueError, NameError, AttributeError):
                         return ret
                 return ret
             return AttributeString(ret)
+        if isinstance(ret, np.ndarray) and ret.dtype.name == 'object':
+            return WrapperAttributeManager._parse_return_value(_id, str(ret.tolist()))
         return ret
 
     @with_phil
@@ -132,8 +136,8 @@ def create(self,
                name,
                data,
                shape=None, dtype=None,
-               predicate: Union[str, rdflib.URIRef]=None,
-               object: Union[str, rdflib.URIRef]=None):
+               predicate: Union[str, rdflib.URIRef] = None,
+               object: Union[str, rdflib.URIRef] = None):
         r = super().create(name,
                            utils.parse_object_for_attribute_setting(data),
                            shape, dtype)
@@ -286,7 +290,6 @@ def raw(self) -> "h5py.AttributeManager":
         with phil:
             return attrs.AttributeManager(self._parent)
 
-
 # class IRIAttr:
 #     """Helper class to write attributes together with an IRI
 #

diff --git a/h5rdmtoolbox/wrapper/jsonld.py b/h5rdmtoolbox/wrapper/jsonld.py
@@ -4,27 +4,56 @@
 from typing import Dict
 
 
+def _merge_entries(entries: Dict, clean: bool = True) -> Dict:
+    _entries = entries.copy()
+
+    ids = list(entries.keys())
+
+    delete_candidates = []
+
+    for _id, entry in entries.items():
+        for k, v in entry.items():
+            if clean and len(entry) == 1:
+                # remove empty entry. Note: this could be a problem if the entry is referenced elsewhere...
+                delete_candidates.append(_id)
+                continue
+            if k != '@id':
+                if isinstance(v, list):
+                    if all([i in ids for i in v]):
+                        _entries[_id][k] = [_entries.pop(i) for i in v]
+
+                elif v in ids:
+                    _entries[_id][k] = _entries.pop(v)
+    if clean:
+        for dc in delete_candidates:
+            _entries.pop(dc, None)
+    return _entries
+
+
 def dumpd(grp,
           iri_only=False,
-          file_url="file://./",
-          recursive: bool = True) -> Dict:
+          file_url="",
+          recursive: bool = True,
+          compact: bool = False) -> Dict:
     """Dump a group or a dataset to to dict."""
 
     if isinstance(grp, (str, pathlib.Path)):
         from .core import File
         with File(grp) as h5:
-            return dumpd(h5, iri_only, file_url, recursive=recursive)
+            return dumpd(h5, iri_only, file_url, recursive=recursive, compact=compact)
 
     assert isinstance(grp, (h5py.Group, h5py.Dataset))
 
     def _get_id(_grp):
-        stem = pathlib.Path(_grp.file.filename).stem
-        return file_url + stem + _grp.name
+        return file_url + 'grp:' + _grp.name
 
-    entries = []
+    entries = {}
 
     def _get_dict(_name: str, node):
-        j = {"@id": _get_id(node)}
+        _id = node.attrs.get('@id', None)
+        if _id is None:
+            _id = _get_id(node)
+        j = {"@id": _id}
         s = node.iri.subject
         if s is not None:
             j["@type"] = str(s)
@@ -35,37 +64,70 @@ def _get_dict(_name: str, node):
                         value = str(node.iri.object[k])
                     else:
                         if isinstance(v, (h5py.Group, h5py.Dataset)):
-                            value = _get_id(v)
+                            if '@id' in v.attrs:
+                                value = v.attrs['@id']
+                            else:
+                                value = _get_id(v)
                         else:
-                            value = str(v)
+                            if isinstance(v, (list, tuple)):
+                                value = [str(i) for i in v]
+                            else:
+                                value = str(v)
                     j[str(node.iri.predicate[k])] = value
                 else:
                     if not iri_only:
                         j[k] = str(v)
-        entries.append(j)
+        entries[_id] = j
+        # entries.append(j)
 
     _get_dict(grp.name, grp)
 
     if recursive and isinstance(grp, h5py.Group):
         grp.visititems(_get_dict)
         # return grp.visititems(_get_dict)
+
+    # merge entries, e.g. {"@id": "foo", "author": "grp:/123"} and {"@id": "grp:/123", "name": "MP"}
+    # -> {"@id": "foo", "author": {"@id": "grp:/123", "name": "MP"}}
+    entries = _merge_entries(entries, clean=True)
+
     if len(entries) == 1:
-        return {"@graph": entries[0]}
-    return {"@graph": entries}
+        keys = list(entries.keys())
+        jsonld_dict = {"@graph": entries[keys[0]]}
+    else:
+        jsonld_dict = {"@graph": list(entries.values())}
+
+    if compact:
+        from rdflib import Graph
+        g = Graph().parse(data=json.dumps(jsonld_dict), format='json-ld')
+        return json.loads(g.serialize(format='json-ld', indent=2, compact=True))
+
+    return jsonld_dict
 
 
 def dumps(grp, iri_only=False,
-          file_url="file://./",
+          file_url="",
           recursive: bool = True,
-          **kwargs):
+          compact: bool = False,
+          **kwargs) -> str:
     """Dump a group or a dataset to to string."""
-    return json.dumps(dumpd(grp=grp, iri_only=iri_only, file_url=file_url, recursive=recursive), **kwargs)
+    return json.dumps(dumpd(
+        grp=grp, iri_only=iri_only, file_url=file_url, recursive=recursive, compact=compact),
+        **kwargs
+    )
 
 
 def dump(grp,
          fp,
          iri_only=False,
-         file_url="file://./",
-         recursive: bool = True):
+         file_url="",
+         recursive: bool = True,
+         compact: bool = False,
+         **kwargs):
     """Dump a group or a dataset to to file."""
-    return json.dump(dumpd(grp, iri_only, file_url, recursive), fp, indent=4)
+    return json.dump(
+        dumpd(
+            grp, iri_only, file_url, recursive=recursive, compact=compact
+        ),
+        fp,
+        **kwargs
+    )
diff --git a/h5rdmtoolbox/wrapper/namespaces/_build.py b/h5rdmtoolbox/wrapper/namespaces/_build.py
@@ -5,6 +5,8 @@
 import requests
 import warnings
 
+from rdflib import Graph
+
 __this_dir__ = pathlib.Path(__file__).parent
 
 
@@ -57,7 +59,6 @@ def generate_namespace_file(namespace: str):
 
 def generate_qudt_unit_namespace():
     """Generate the qudt namespace."""
-    from rdflib import Graph
 
     namespace = 'qudt_unit'
 
@@ -87,7 +88,6 @@ def generate_qudt_unit_namespace():
 
 def generate_qudt_quantitykind_namespace():
     """Generate the qudt namespace."""
-    from rdflib import Graph
 
     namespace = 'qudt_quantitykind'
 
@@ -115,8 +115,53 @@ def generate_qudt_quantitykind_namespace():
         f.write('\n\nQUDT_QUANTITYKIND = _QUDT_QUANTITYKIND()')
 
 
+def generate_codemeta_namespace():
+    namespace = 'codemeta'
+    source = 'https://raw.githubusercontent.com/codemeta/codemeta/2.0/codemeta.jsonld'
+    context_file = __this_dir__ / f'_{namespace}.jsonld'
+    if not context_file.exists():
+        with open(context_file, 'w', encoding='utf-8') as f:
+            f.write(requests.get(source).text, )
+
+    g = Graph().parse(source, format='json-ld')
+    compact_context = json.loads(g.serialize(format='json-ld', indent=4, auto_compact=True))
+
+    with open(context_file) as f:
+        context = json.load(f)
+
+    uri_refs = {}
+    for k, v in context['@context'].items():
+        if k not in ('type', 'id'):
+            if '@id' in v:
+                if ':' in v['@id']:
+                    _context, value = v['@id'].split(':', 1)
+                    _expanded_context = compact_context['@context'][_context]
+                    uri = _expanded_context + value
+                else:
+                    uri = v['@id']
+                uri_refs[k] = uri
+
+    with open(__this_dir__ / f'_{namespace}_namespace.py', 'w',
+              encoding='UTF8') as f:
+        f.write('# automatically generated from https://codemeta.github.io/terms/\n')
+        f.write('from rdflib.namespace import Namespace\n')
+        f.write('from rdflib.term import URIRef\n\n\n')
+        f.write('class _CODEMETA:')
+
+        for k, v in uri_refs.items():
+            f.write(f'\n    {k} = URIRef("{v}")')
+
+        f.write('\n\n    _NS = Namespace("https://codemeta.github.io/terms/")')
+
+        f.write('\n\n')
+        f.write('\n\nCODEMETA = _CODEMETA()')
+
+    pathlib.Path(context_file).unlink(missing_ok=True)
+
+
 if __name__ == '__main__':
     # generate_namespace_file('m4i')  # be careful, german lines must be manually uncommented
     # generate_namespace_file('obo')  # be careful, german lines must be manually uncommented
-    generate_qudt_unit_namespace()  # write _qudt_namespace.py manually
-    generate_qudt_quantitykind_namespace()  # write _qudt_quantitykind_namespace.py manually
+    # generate_qudt_unit_namespace()  # write _qudt_namespace.py manually
+    # generate_qudt_quantitykind_namespace()  # write _qudt_quantitykind_namespace.py manually
+    generate_codemeta_namespace()
diff --git a/h5rdmtoolbox/wrapper/namespaces/_codemeta_namespace.py b/h5rdmtoolbox/wrapper/namespaces/_codemeta_namespace.py
@@ -0,0 +1,82 @@
+# automatically generated from https://codemeta.github.io/terms/
+from rdflib.namespace import Namespace
+from rdflib.term import URIRef
+
+
+class _CODEMETA:
+    Organization = URIRef("https://schema.org/Organization")
+    Person = URIRef("https://schema.org/Person")
+    SoftwareSourceCode = URIRef("https://schema.org/SoftwareSourceCode")
+    SoftwareApplication = URIRef("https://schema.org/SoftwareApplication")
+    Text = URIRef("https://schema.org/Text")
+    URL = URIRef("https://schema.org/URL")
+    address = URIRef("https://schema.org/address")
+    affiliation = URIRef("https://schema.org/affiliation")
+    applicationCategory = URIRef("https://schema.org/applicationCategory")
+    applicationSubCategory = URIRef("https://schema.org/applicationSubCategory")
+    citation = URIRef("https://schema.org/citation")
+    codeRepository = URIRef("https://schema.org/codeRepository")
+    contributor = URIRef("https://schema.org/contributor")
+    copyrightHolder = URIRef("https://schema.org/copyrightHolder")
+    copyrightYear = URIRef("https://schema.org/copyrightYear")
+    creator = URIRef("https://schema.org/creator")
+    dateCreated = URIRef("https://schema.org/dateCreated")
+    dateModified = URIRef("https://schema.org/dateModified")
+    datePublished = URIRef("https://schema.org/datePublished")
+    description = URIRef("https://schema.org/description")
+    downloadUrl = URIRef("https://schema.org/downloadUrl")
+    email = URIRef("https://schema.org/email")
+    editor = URIRef("https://schema.org/editor")
+    encoding = URIRef("https://schema.org/encoding")
+    familyName = URIRef("https://schema.org/familyName")
+    fileFormat = URIRef("https://schema.org/fileFormat")
+    fileSize = URIRef("https://schema.org/fileSize")
+    funder = URIRef("https://schema.org/funder")
+    givenName = URIRef("https://schema.org/givenName")
+    hasPart = URIRef("https://schema.org/hasPart")
+    identifier = URIRef("https://schema.org/identifier")
+    installUrl = URIRef("https://schema.org/installUrl")
+    isAccessibleForFree = URIRef("https://schema.org/isAccessibleForFree")
+    isPartOf = URIRef("https://schema.org/isPartOf")
+    keywords = URIRef("https://schema.org/keywords")
+    license = URIRef("https://schema.org/license")
+    memoryRequirements = URIRef("https://schema.org/memoryRequirements")
+    name = URIRef("https://schema.org/name")
+    operatingSystem = URIRef("https://schema.org/operatingSystem")
+    permissions = URIRef("https://schema.org/permissions")
+    position = URIRef("https://schema.org/position")
+    processorRequirements = URIRef("https://schema.org/processorRequirements")
+    producer = URIRef("https://schema.org/producer")
+    programmingLanguage = URIRef("https://schema.org/programmingLanguage")
+    provider = URIRef("https://schema.org/provider")
+    publisher = URIRef("https://schema.org/publisher")
+    relatedLink = URIRef("https://schema.org/relatedLink")
+    releaseNotes = URIRef("https://schema.org/releaseNotes")
+    runtimePlatform = URIRef("https://schema.org/runtimePlatform")
+    sameAs = URIRef("https://schema.org/sameAs")
+    softwareHelp = URIRef("https://schema.org/softwareHelp")
+    softwareRequirements = URIRef("https://schema.org/softwareRequirements")
+    softwareVersion = URIRef("https://schema.org/softwareVersion")
+    sponsor = URIRef("https://schema.org/sponsor")
+    storageRequirements = URIRef("https://schema.org/storageRequirements")
+    supportingData = URIRef("https://schema.org/supportingData")
+    targetProduct = URIRef("https://schema.org/targetProduct")
+    url = URIRef("https://schema.org/url")
+    version = URIRef("https://schema.org/version")
+    author = URIRef("https://schema.org/author")
+    softwareSuggestions = URIRef("https://codemeta.github.io/terms/softwareSuggestions")
+    contIntegration = URIRef("https://codemeta.github.io/terms/contIntegration")
+    buildInstructions = URIRef("https://codemeta.github.io/terms/buildInstructions")
+    developmentStatus = URIRef("https://codemeta.github.io/terms/developmentStatus")
+    embargoDate = URIRef("https://codemeta.github.io/terms/embargoDate")
+    funding = URIRef("https://codemeta.github.io/terms/funding")
+    readme = URIRef("https://codemeta.github.io/terms/readme")
+    issueTracker = URIRef("https://codemeta.github.io/terms/issueTracker")
+    referencePublication = URIRef("https://codemeta.github.io/terms/referencePublication")
+    maintainer = URIRef("https://codemeta.github.io/terms/maintainer")
+
+    _NS = Namespace("https://codemeta.github.io/terms/")
+
+
+
+CODEMETA = _CODEMETA()
diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = h5rdmtoolbox
-version = 1.2.2
+version = 1.2.1
 author = Matthias Probst
 author_email = matthias.probst@kit.edu
 description = Supporting a FAIR Research Data lifecycle using Python and HDF5.