introducing iri

matthiasprobst · Dec 5, 2023 · 8f8aee3 · 8f8aee3
1 parent 9721f88
commit 8f8aee3
Show file tree

Hide file tree

Showing 13 changed files with 358 additions and 243 deletions.
diff --git a/docs/wrapper/DumpFile.ipynb b/docs/wrapper/DumpFile.ipynb
diff --git a/h5rdmtoolbox/__init__.py b/h5rdmtoolbox/__init__.py
@@ -1,27 +1,27 @@
 """h5rdtoolbox repository"""
 import atexit
 import pathlib
+import shutil
+from typing import Union, Callable
+
 # noinspection PyUnresolvedReferences
 import pint_xarray
-import shutil
 import xarray as xr
-from typing import Union, Callable
 
 from h5rdmtoolbox._cfg import set_config, get_config, get_ureg
 
 pint_xarray.unit_registry = get_ureg()
 
 from . import conventions
 from .conventions.core import Convention
-from . import plotting
+# from . import plotting
 from . import wrapper
 from ._user import UserDir
 from ._version import __version__
 from . import database
 from . import utils
 from .wrapper.core import lower, Lower, File, Group, Dataset
 from . import errors
-
 from .wrapper.accessory import register_special_dataset
 
 name = 'h5rdmtoolbox'

diff --git a/h5rdmtoolbox/_repr.py b/h5rdmtoolbox/_repr.py
@@ -1,14 +1,15 @@
-import h5py
-import numpy as np
 import os
-import pkg_resources
 import re
 import typing
-from IPython.display import HTML, display
 from abc import abstractmethod
-from numpy import ndarray
 from time import perf_counter_ns
 
+import h5py
+import numpy as np
+import pkg_resources
+from IPython.display import HTML, display
+from numpy import ndarray
+
 from . import get_config
 from . import protected_attributes
 from .orcid import is_valid_orcid_pattern, get_html_repr
@@ -83,6 +84,12 @@ def okprint(string):
     print(oktext(string))
 
 
+def make_href(url, text) -> str:
+    """Return an HTML anchor element that links `text` to `url`.
+
+    Parameters
+    ----------
+    url : str
+        Link target. Must start with "http". It is HTML-escaped before it is
+        placed inside the `href` attribute.
+    text
+        Content of the anchor. It is inserted verbatim because callers pass
+        ready-made HTML (e.g. an `<img>` badge) as well as plain strings.
+
+    Returns
+    -------
+    str
+        The `<a href="...">...</a>` string.
+
+    Raises
+    ------
+    ValueError
+        If `url` does not start with "http".
+    """
+    from html import escape  # function-scope import: no change to the module import block needed
+
+    if not url.startswith('http'):
+        raise ValueError(f'Invalid URL: "{url}". Must start with "http"')
+    # Escape quotes and angle brackets so the URL cannot terminate the
+    # attribute and inject markup into the generated HTML.
+    return f'<a href="{escape(url, quote=True)}">{text}</a>'
+
+
 def process_string_for_link(string: str) -> typing.Tuple[str, bool]:
     """process string to make links actually clickable in html
 
@@ -106,7 +113,7 @@ def process_string_for_link(string: str) -> typing.Tuple[str, bool]:
         if string.startswith('https://zenodo.org/record/'):
             zenodo_url = string
             img_url = f'https://zenodo.org/badge/DOI/10.5281/zenodo.{string.split("/")[-1]}.svg'
-        return f'<a href="{zenodo_url}"><img src="{img_url}" alt="DOI"></a>', True
+        return make_href(url=zenodo_url, text=f'<img src="{img_url}" alt="DOI">'), True
     for p in (r"(https?://\S+)", r"(ftp://\S+)", r"(www\.\S+)"):
         urls = re.findall(p, string)
         if urls:
@@ -115,7 +122,7 @@ def process_string_for_link(string: str) -> typing.Tuple[str, bool]:
                     orcid_url_repr = get_html_repr(url)
                     string = string.replace(url, orcid_url_repr)
                 else:
-                    string = string.replace(url, f'<a href="{url}">{url}</a>')
+                    string = string.replace(url, make_href(url, url))
             return string, True
 
     return string, False
@@ -168,15 +175,15 @@ class HDF5StructureStrRepr(_HDF5StructureRepr):
     def __call__(self, group, indent=0, preamble=None):
         if preamble:
             print(preamble)
-        for attr_name, attr_value in group.attrs.raw.items():
+        for attr_name in group.attrs.raw.keys():
             if not attr_name.isupper():
-                print(self.base_intent * indent + self.__attrs__(attr_name, attr_value))
+                print(self.base_intent * indent + self.__attrs__(attr_name, group))
         for key, item in group.items():
             if isinstance(item, h5py.Dataset):
                 print(self.base_intent * indent + self.__dataset__(key, item))
-                for attr_name, attr_value in item.attrs.raw.items():
+                for attr_name in item.attrs.raw.keys():
                     if not attr_name.isupper() and attr_name not in self.ignore_attrs:
-                        print(self.base_intent * (indent + 2) + self.__attrs__(attr_name, attr_value))
+                        print(self.base_intent * (indent + 2) + self.__attrs__(attr_name, item))
             elif isinstance(item, h5py.Group):
                 print(self.base_intent * indent + self.__group__(key, item))
                 self(item, indent + 1)
@@ -337,9 +344,9 @@ def __dataset__(self, name, h5obj) -> str:
         # open attribute section:
         _html_ds_attrs = """\n                <ul class="h5tb-attr-list">"""
         # write attributes:
-        for k, v in h5obj.attrs.items():
+        for k in h5obj.attrs.keys():
             if k not in self.ignore_attrs and not k.isupper():
-                _html_ds_attrs += self.__attrs__(k, v)
+                _html_ds_attrs += self.__attrs__(k, h5obj)
         # close attribute section
         _html_ds_attrs += """\n                </ul>"""
 
@@ -370,8 +377,9 @@ def __group__(self, name, h5obj: h5py.Group):
         _html += """\n
                     <ul class="h5tb-attr-list">"""
         # write attributes:
-        for k, v in h5obj.attrs.items():
-            _html += self.__attrs__(k, v)
+        for k in h5obj.attrs.keys():
+            if not k.isupper():
+                _html += self.__attrs__(k, h5obj)
         # close attribute section
         _html += """
                     </ul>"""
@@ -389,16 +397,21 @@ def __group__(self, name, h5obj: h5py.Group):
         return _html
 
     def __attrs__(self, name, h5obj):
-
-        if name in ('DIMENSION_LIST', 'REFERENCE_LIST'):
-            _value = h5obj.__str__().replace('<', '&#60;')
-            _value = _value.replace('>', '&#62;')
-            return f'<li style="list-style-type: none; font-style: italic">{name} : {_value}</li>'
-
-        if isinstance(h5obj, ndarray):
-            if all(isinstance(item, str) for item in h5obj):
+        attr_value = h5obj.attrs.raw[name]
+        # if name.isupper():
+        # # if name in ('DIMENSION_LIST', 'REFERENCE_LIST'):
+        #     _value = attr_value.__str__().replace('<', '&#60;')
+        #     _value = _value.replace('>', '&#62;')
+        #     return f'<li style="list-style-type: none; font-style: italic">{name} : {_value}</li>'
+
+        iri_value = h5obj.iri.get(name, None)
+        if iri_value:
+            name = make_href(iri_value, name)
+
+        if isinstance(attr_value, ndarray):
+            if all(isinstance(item, str) for item in attr_value):
                 _string_value_list = []
-                for item in h5obj:
+                for item in attr_value:
                     _value, is_url = process_string_for_link(item)
                     if is_url:
                         _string_value_list.append(_value)
@@ -407,13 +420,13 @@ def __attrs__(self, name, h5obj):
                 return '<li style="list-style-type: none; ' \
                        f'font-style: italic">{name} : {", ".join(_string_value_list)}</li>'
             else:
-                _value = h5obj.__repr__()
+                _value = attr_value.__repr__()
                 if len(_value) > self.max_attr_length:
                     _value = f'{_value[0:self.max_attr_length]}...'
                 return f'<li style="list-style-type: none; font-style: italic">{name} : {_value}</li>'
 
-        if isinstance(h5obj, str):
-            _value_str = f'{h5obj}'
+        if isinstance(attr_value, str):
+            _value_str = f'{attr_value}'
             if len(_value_str) > 1:
                 if _value_str[0] == '<' and _value_str[-1] == '>':
                     _value_str = _value_str[1:-1]
@@ -422,35 +435,36 @@ def __attrs__(self, name, h5obj):
             if is_url:
                 if 'orcid.org' in _value:
                     from . import orcid
-                    orcid_html = orcid.get_html_repr(h5obj.strip('/').rsplit('/', 1)[-1])
+                    orcid_html = orcid.get_html_repr(attr_value.strip('/').rsplit('/', 1)[-1])
                     return f'<li style="list-style-type: none; font-style: italic">{name} : {orcid_html}</li>'
             else:
                 if self.max_attr_length:
                     if len(_value_str) > self.max_attr_length:
                         _value_str = f'{_value_str[0:self.max_attr_length - 3]}...'
                     else:
-                        _value_str = h5obj
+                        _value_str = attr_value
                 else:
-                    _value_str = h5obj
+                    _value_str = attr_value
             #
             # if len(_value_str) > self.max_attr_length:
             #     _value_str = f'{_value_str[0:self.max_attr_length-1]}...'
+            # print(f'<li style="list-style-type: none; font-style: italic">{name} : {_value_str}</li>')
             return f'<li style="list-style-type: none; font-style: italic">{name} : {_value_str}</li>'
 
-        if not isinstance(h5obj, ndarray):
-            if getattr(h5obj, '_repr_html_', None):
-                _value_str = h5obj._repr_html_()
+        if not isinstance(attr_value, ndarray):
+            if getattr(attr_value, '_repr_html_', None):
+                _value_str = attr_value._repr_html_()
             else:
-                _value_str = str(h5obj)
+                _value_str = str(attr_value)
                 if _value_str[0] == '<' and _value_str[-1] == '>':
                     _value_str = _value_str[1:-1]
                 if self.max_attr_length:
                     if len(_value_str) > self.max_attr_length:
                         _value_str = f'{_value_str[0:self.max_attr_length - 3]}...'
                     else:
-                        _value_str = h5obj
+                        _value_str = attr_value
                 else:
-                    _value_str = h5obj
+                    _value_str = attr_value
 
         return f'<li style="list-style-type: none; font-style: italic">{name} : {_value_str}</li>'
 

diff --git a/h5rdmtoolbox/consts.py b/h5rdmtoolbox/consts.py
@@ -1,3 +1,4 @@
 """constants used by the h5rdmtoolbox package"""
 
-ANCILLARY_DATASET = 'ANCILLARY_DATASETS'
+ANCILLARY_DATASET = 'ANCILLARY_DATASETS'
+IRI_ATTR_NAME = 'IRI'
diff --git a/h5rdmtoolbox/conventions/standard_names/table.py b/h5rdmtoolbox/conventions/standard_names/table.py
@@ -807,7 +807,10 @@ def to_html(self, html_filename, open_in_browser: bool = False) -> pathlib.Path:
             raise FileNotFoundError(f'Could not find the template file at {template_filename.absolute()}')
 
         # Convert Markdown to HTML using pandoc
-        import pypandoc
+        try:
+            import pypandoc
+        except ImportError:
+            raise ImportError('Package "pypandoc" is required for this function.')
         output = pypandoc.convert_file(str(markdown_filename.absolute()), 'html', format='md',
                                        extra_args=['--template', template_filename])
 

diff --git a/h5rdmtoolbox/iri.py b/h5rdmtoolbox/iri.py
@@ -0,0 +1,38 @@
+class IRIManager:
+    """Registry of IRI dictionaries, one per name.
+
+    The original description says it handles the IRIs of opened HDF5 files;
+    what the names identify is decided by the callers (not visible here).
+    """
+
+    def __init__(self):
+        # maps a name to the IRI dictionary registered for it
+        self.registries = {}
+
+    def __contains__(self, item):
+        """Return True if a registry has been created for `item`."""
+        return item in self.registries
+
+    def get(self, name, existing_iri=None):
+        """Return the registry stored for `name`, creating it on first access.
+
+        On first access `existing_iri` is stored as the registry (an empty
+        dict if it is None). On later calls `existing_iri` is ignored and the
+        already stored object is returned.
+        """
+        try:
+            return self.registries[name]
+        except KeyError:
+            registry = {} if existing_iri is None else existing_iri
+            self.registries[name] = registry
+            return registry
+
+
+class IRI:
+    """Pair an attribute value with its IRI (Internationalized Resource Identifier).
+
+    This class only stores the two fields. The code that consumes such an
+    object when an attribute is written lives outside this module; presumably
+    the value goes to the attribute itself and the IRI to a separate attribute
+    named by `consts.IRI_ATTR_NAME` -- TODO confirm against the attribute manager.
+
+    Parameters
+    ----------
+    value
+        The attribute value.
+    iri
+        The IRI that describes the attribute.
+
+    Example:
+    --------
+    >>> from h5rdmtoolbox.iri import IRI
+    >>> IRI('John Doe', 'http://www.w3.org/ns/prov#Person')
+    IRI('John Doe', 'http://www.w3.org/ns/prov#Person')
+    """
+
+    def __init__(self, value, iri):
+        self.iri = iri
+        self.value = value
+
+    def __repr__(self):
+        # !r quotes string fields, so the output reads like the constructor call
+        return f'{self.__class__.__name__}({self.value!r}, {self.iri!r})'
+
+
+irimanager = IRIManager()
diff --git a/h5rdmtoolbox/tutorial.py b/h5rdmtoolbox/tutorial.py
@@ -3,9 +3,10 @@
 """
 import os
 import pathlib
-import xarray as xr
 from typing import List
 
+import xarray as xr
+
 from h5rdmtoolbox.conventions.standard_names.table import StandardNameTable
 from .utils import generate_temporary_directory
 from .wrapper.core import File
@@ -186,7 +187,7 @@ def _get_pressure(v):
 
         _folders = ('d1', 'd2', 'd3', 'd1/d11', 'd1/d11/d111', 'd2/d21')
         folders = [os.path.join(repo_dir, _f) for _f in _folders]
-        operators = ('Mike', 'Ellen', 'John', 'Susi')
+        contact_persons = ('Mike', 'Ellen', 'John', 'Susi')
         db_file_type = ('fan_case', 'piv_case')
 
         file_ids = range(n_files)
@@ -200,7 +201,9 @@ def _get_pressure(v):
 
             filename = pathlib.Path(folders[ifolder]) / f'repofile_{fid:05d}.hdf'
             with File(filename, 'w') as h5:
-                h5.attrs['operator'] = operators[np.random.randint(4)]
+                h5.attrs['contact_person'] = contact_persons[np.random.randint(4)]
+                h5.iri['contact_person'] = 'http://www.w3.org/ns/prov#Person'
+
                 if fid % 2:
                     __ftype__ = db_file_type[0]
                 else:
@@ -239,6 +242,7 @@ def _get_pressure(v):
                                      shape=(zplanes, 64, 86, 2))
                     g.create_dataset('v', attrs={'units': 'm/s', 'long_name': 'mean v-component'},
                                      shape=(zplanes, 64, 86, 2))
+                    g.iri['units'] = 'http://qudt.org/schema/qudt/Unit'
 
     @staticmethod
     def generate_test_files(n_files: int = 5) -> List[pathlib.Path]: