Skip to content

Commit

Permalink
RDF consts are moved to rdf module
Browse files Browse the repository at this point in the history
  • Loading branch information
matthiasprobst committed Mar 22, 2024
1 parent b7b06ac commit 5abbcf9
Show file tree
Hide file tree
Showing 13 changed files with 144 additions and 143 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ coverage.svg
docs/colab/my_file.hdf
*.h5
*.nxs
*.json
*.jsonld
docs/userguide/wrapper/test.hdf
docs/userguide/wrapper/test.json
2 changes: 1 addition & 1 deletion codemeta.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"license": "https://spdx.org/licenses/MIT",
"codeRepository": "git+https://github.com/matthiasprobst/h5RDMtoolbox.git",
"name": "h5RDMtoolbox",
"version": "1.2.3a1",
"version": "1.2.3a2",
"description": "Supporting a FAIR Research Data lifecycle using Python and HDF5.",
"applicationCategory": "Engineering",
"programmingLanguage": [
Expand Down
9 changes: 3 additions & 6 deletions h5rdmtoolbox/_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from . import get_config
from . import identifiers
from . import protected_attributes
from .wrapper.rdf import RDF_SUBJECT_ATTR_NAME, RDF_PREDICATE_ATTR_NAME

H5PY_SPECIAL_ATTRIBUTES = ('DIMENSION_LIST', 'REFERENCE_LIST', 'NAME', 'CLASS', protected_attributes.COORDINATES)
try:
Expand Down Expand Up @@ -201,8 +202,6 @@ def __attrs__(self, name, h5obj):
"""dataset representation"""


from . import consts


class HDF5StructureStrRepr(_HDF5StructureRepr):

Expand All @@ -214,7 +213,7 @@ def __call__(self, group, indent=0, preamble=None):
if predicate:
print(spaces + f'@predicate: {predicate}')
for attr_name in group.attrs.raw.keys():
if attr_name == consts.RDF_SUBJECT_ATTR_NAME:
if attr_name == RDF_SUBJECT_ATTR_NAME:
print(spaces + f'@type: {group.attrs[attr_name]}')
else:
if not attr_name.isupper():
Expand Down Expand Up @@ -267,7 +266,7 @@ def __group__(self, name, item) -> str:
def __attrs__(self, name, h5obj) -> str:
attr_value = h5obj.attrs.raw[name]

pred = h5obj.rdf[name]['predicate']
pred = h5obj.rdf[name][RDF_PREDICATE_ATTR_NAME]
if pred:
use_attr_name = f'{name} ({pred})'
else:
Expand Down Expand Up @@ -463,8 +462,6 @@ def __group__(self, name, h5obj: h5py.Group):
checkbox_state = self.checkbox_state

self_predicate = h5obj.rdf.predicate.get('SELF', None)
if self_predicate:
print(self_predicate)
self_subject = h5obj.rdf.subject

if self_predicate is not None:
Expand Down
5 changes: 0 additions & 5 deletions h5rdmtoolbox/consts.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
"""constants used by the h5rdmtoolbox package"""

ANCILLARY_DATASET = 'ANCILLARY_DATASETS'
RDF_OBJECT_ATTR_NAME = 'RDF_OBJECT'
RDF_PREDICATE_ATTR_NAME = 'RDF_PREDICATE'
# RDF_SUBJECT_ATTR_NAME = 'IRI_SUBJECT' # '@type'
RDF_SUBJECT_ATTR_NAME = '@type'
# IRI_TYPE_ATTR_NAME = '@type'
8 changes: 4 additions & 4 deletions h5rdmtoolbox/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
from typing import Dict
from typing import Union, Callable, List, Tuple

from . import _user, get_config, get_ureg, consts
from . import _user, get_config, get_ureg
from ._version import __version__
from .wrapper import rdf

logger = logging.getLogger('h5rdmtoolbox')
DEFAULT_LOGGING_LEVEL = logging.INFO
Expand Down Expand Up @@ -181,10 +182,10 @@ def create_h5tbx_version_grp(root: h5py.Group) -> h5py.Group:
version_group = root.create_group('h5rdmtoolbox')
# g.rdf.object = 'https://schema.org/SoftwareSourceCode'
version_group.attrs['__h5rdmtoolbox_version__'] = __version__
version_group.attrs[consts.RDF_PREDICATE_ATTR_NAME] = json.dumps(
version_group.attrs[rdf.RDF_PREDICATE_ATTR_NAME] = json.dumps(
{'__h5rdmtoolbox_version__': 'https://schema.org/softwareVersion'}
)
version_group.attrs[consts.RDF_SUBJECT_ATTR_NAME] = 'https://schema.org/SoftwareSourceCode'
version_group.attrs[rdf.RDF_SUBJECT_ATTR_NAME] = 'https://schema.org/SoftwareSourceCode'
return version_group


Expand Down Expand Up @@ -307,7 +308,6 @@ def parse_object_for_attribute_setting(value) -> Union[str, int, float, bool, Li
try:
return str(value) # try parsing to string
except TypeError:
print(type(value))
raise TypeError(f"Cannot parse type {type(value)} to string")


Expand Down
9 changes: 7 additions & 2 deletions h5rdmtoolbox/wrapper/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"""
import datetime
import h5py
import json
import logging
import numpy as np
import os
Expand All @@ -16,7 +17,7 @@
from h5py._hl.base import phil, with_phil
from h5py._objects import ObjectID
from pathlib import Path
from typing import List, Dict, Union, Tuple, Callable
from typing import List, Dict, Union, Tuple, Callable, Optional

from h5rdmtoolbox.database import ObjDB
from . import rdf
Expand Down Expand Up @@ -744,7 +745,6 @@ def create_dataset(self,
if anc_ds.shape != _data.shape:
raise ValueError(f'Associated dataset {anc_name} has shape {anc_ds.shape} '
f'which does not match dataset shape {_data.shape}')
import json
attrs[consts.ANCILLARY_DATASET] = json.dumps({k: v.name for k, v in ancillary_datasets.items()})

_maxshape = kwargs.get('maxshape', shape)
Expand Down Expand Up @@ -1207,6 +1207,11 @@ def create_from_yaml(self, yaml_filename: Path):
from . import h5yaml
h5yaml.H5Yaml(yaml_filename).write(self)

def create_from_jsonld(self, data: str, context: Optional[Dict] = None):
    """Create groups/datasets from a jsonld string."""
    from . import jsonld
    # Parse the JSON-LD payload first, then delegate HDF5 creation
    # to the jsonld helper module.
    parsed_data = json.loads(data)
    jsonld.to_hdf(self, data=parsed_data, context=context)

def _get_obj_names(self, obj_type, recursive):
"""Return all names of specified object type
in this group and if recursive==True also
Expand Down
133 changes: 60 additions & 73 deletions h5rdmtoolbox/wrapper/jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
from typing import Dict, Optional, Union, List
from typing import Iterable, Tuple, Any

import h5rdmtoolbox as h5tbx
from h5rdmtoolbox import consts
from h5rdmtoolbox.convention import hdf_ontology
from ontolutils.classes.utils import split_URIRef
from .core import Dataset, File
from .rdf import RDF_PREDICATE_ATTR_NAME


def _merge_entries(entries: Dict, clean: bool = True) -> Dict:
Expand Down Expand Up @@ -122,7 +122,10 @@ def to_hdf(grp,

if k == '@id':
rdf_predicate = None
value_predicate = k
if v.startswith('http'):
value_predicate = k
else:
continue
else:
# spit predicate:
ns_predicate, value_predicate = split_URIRef(k)
Expand All @@ -140,90 +143,77 @@ def to_hdf(grp,
rdf_predicate = value_predicate

if isinstance(v, dict):
print(f'create group {k} in {grp.name}')
if k not in grp:
to_hdf(grp.create_group(value_predicate), data=v, predicate=rdf_predicate, context=data_context)

elif isinstance(v, list):
if is_list_of_dict(v):
for i, entry in enumerate(v):
sub_grp_name = f'{k}{i + 1}'
if sub_grp_name in grp:
sub_grp = grp[sub_grp_name]
# figure out how to name the sub group
# best would be to take the label, if it exists
for label_identifier in ('rdfs:label', 'label', 'http://www.w3.org/2000/01/rdf-schema#'):
_label = entry.get(label_identifier, None)
break

if _label is None:
if len(v) > 1:
label = f'{k}{i + 1}'
else:
label = k
else:
ns, label = split_URIRef(_label)

if label in grp:
sub_grp = grp[label]
else:
sub_grp = grp.create_group(sub_grp_name)
sub_grp.rdf.predicate = data_context.get(k, None)
ns_predicate, rdf_predicate = split_URIRef(k)
if ns_predicate is None:
rdf_predicate = data_context.get(k, None)
elif ns_predicate.startswith('http'):
rdf_predicate = k
else:
_ns = data_context.get(ns_predicate, None)
if _ns is not None:
rdf_predicate = f'{_ns}{value_predicate}'
else:
rdf_predicate = value_predicate

sub_grp = grp.create_group(label)
sub_grp.rdf.predicate = rdf_predicate

to_hdf(sub_grp, data=entry, context=data_context)
else:
grp.attrs[k, data_context.get(k, None)] = v
else:
# maybe value_object is a IRI?!
ns_object, value_object = split_URIRef(v)
rdf_object = None
if isinstance(v, str):
if v.startswith('http'):
value_object = v
else:
ns_object, value_object = split_URIRef(v)

if ns_object is None:
rdf_object = data_context.get(k, None)
elif value_object.startswith('http'):
rdf_object = k
if ns_object is None:
rdf_object = data_context.get(k, None)
elif value_object.startswith('http'):
rdf_object = k
else:
_ns = data_context.get(ns_object, None)
if _ns is not None:
rdf_object = f'{_ns}{value_object}'
else:
rdf_object = None
else:
_ns = data_context.get(ns_object, None)
if _ns is not None:
rdf_object = f'{_ns}{value_object}'
else:
rdf_object = value_object
if k == '@type':
value_object = v

if k == '@type' and rdf_object is not None:
grp.attrs.create(name=k, data=rdf_object)
elif k == '@id':
grp.attrs.create(name=k, data=v)
else:
grp.attrs.create(name=value_predicate, data=value_object, rdf_predicate=rdf_predicate)


# def to_hdf(jsonld_filename, grp: h5py.Group) -> None:
# """Takes a .jsonld file and writes it into a HDF5 group"""
# if not isinstance(grp, h5py.Group):
# raise TypeError(f'Expecting h5py.Group, got {type(grp)}')
#
# if not isinstance(jsonld_filename, (str, pathlib.Path)):
# raise TypeError(f'Expecting str or pathlib.Path, got {type(jsonld_filename)}')
#
# def _to_hdf(_h5: h5py.Group, jdict: Dict):
# """Takes a .jsonld file and writes it into a HDF5 group"""
# for k, v in jdict.items():
# if isinstance(v, dict):
# if k == 'has parameter':
# label = v.get('label', '@id')
# _h5.attrs[k] = v['@id']
# if v.get('has numerical value', None):
# ds = _h5.create_dataset(label, data=literal_eval(v['has numerical value']), track_order=True)
# for kk, vv in v.items():
# if kk != 'has numerical value':
# ds.attrs[kk] = vv
# else:
# grp = _h5.create_group(label, track_order=True)
# _to_hdf(grp, v)
# else:
# grp = _h5.create_group(k, track_order=True)
# _to_hdf(grp, v)
# elif isinstance(v, list):
# list_grp = _h5.create_group(k, track_order=True)
# for i, item in enumerate(v):
# # _h5[k] =
# obj_name = item.get('@id', str(i))
# if item.get('has numerical value', None):
# obj = list_grp.create_dataset(obj_name, data=literal_eval(item['has numerical value']),
# track_order=True)
# for kk, vv in item.items():
# if kk != 'has numerical value':
# obj.attrs[kk] = vv
# else:
# obj = list_grp.create_group(obj_name, track_order=True)
# _to_hdf(obj, item)
# else:
# _h5.attrs[k] = v
#
# with open(jsonld_filename, 'r') as f:
# return _to_hdf(grp, json.load(f))


def serialize(grp,
iri_only=False,
local=None,
Expand All @@ -241,9 +231,6 @@ def serialize(grp,
recursive=recursive,
compact=compact,
context=context)

hasParameter = URIRef('http://w3id.org/nfdi4ing/metadata4ing#hasParameter')

# global _context
_context = {}
context = context or {}
Expand All @@ -270,7 +257,7 @@ def add_node(name, obj):
# NumericalVariable or TextVariable

if node_type is None:
rdf_predicate_dict = obj.attrs.get(consts.RDF_PREDICATE_ATTR_NAME, None)
rdf_predicate_dict = obj.attrs.get(RDF_PREDICATE_ATTR_NAME, None)
if rdf_predicate_dict and len(rdf_predicate_dict) > 0:
if isinstance(obj, h5py.Dataset):
if obj.dtype.kind == 'S':
Expand Down Expand Up @@ -509,11 +496,11 @@ def _build_group_onto_class(grp):
data[grp.parent.name].append(ontogrp)

def _build_onto_classes(name, node):
if isinstance(node, h5tbx.Dataset):
if isinstance(node, Dataset):
return _build_dataset_onto_class(node)
return _build_group_onto_class(node)

with h5tbx.File(filename, mode='r') as h5:
with File(filename, mode='r') as h5:
root = hdf_ontology.Group(name='/', attribute=_build_attributes(h5.attrs))
data['/'] = []

Expand Down
Loading

0 comments on commit 5abbcf9

Please sign in to comment.