simplify db interface

matthiasprobst · Apr 24, 2024 · 8c3c525 · 8c3c525
1 parent 8fdf264
commit 8c3c525
Show file tree

Hide file tree

Showing 19 changed files with 812 additions and 642 deletions.
diff --git a/docs/colab/example.hdf b/docs/colab/example.hdf
diff --git a/docs/practical_examples/knowledge_graph.ipynb b/docs/practical_examples/knowledge_graph.ipynb
diff --git a/docs/userguide/database/firstSteps.ipynb b/docs/userguide/database/firstSteps.ipynb
@@ -8,8 +8,8 @@
     "# First steps: HDF5 and databases\n",
     "\n",
     "There are two ways of working with HDF5 and databases:\n",
-    "1. Using HDF5 file(s) as a database\n",
-    "2. Writing HDF5 content into dedicated database solutions\n",
+    "1. Using HDF5 file(s) as a database itself.\n",
+    "2. Writing HDF5 content into dedicated database solutions.\n",
     "\n",
     "Both ways will be described in the next two chapters. The second approach is currently implemented as a [MongoDB](https://pymongo.readthedocs.io/en/stable/) interface.\n",
     "\n",
@@ -989,7 +989,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.18"
+   "version": "3.8.19"
   }
  },
  "nbformat": 4,

diff --git a/docs/userguide/database/hdfDB.ipynb b/docs/userguide/database/hdfDB.ipynb
diff --git a/docs/userguide/database/index.rst b/docs/userguide/database/index.rst
@@ -1,17 +1,14 @@
 Database
 ========
 
-After creating one or multiple HDF5 files or received it from someone else, we would like to explore the data. Better said, we want to identify groups or datasets 
-based on their attribute values.
-
 HDF5 can be considered a database itself, as it allows multiple datasets and their metadata (attributes) to be stored in a single file. Most of the time, you want to find records in an HDF5 file based on the attributes. However, the `h5py` package does not provide a function to do this.
 
 The *h5rdmtoolbox* provides an interface to perform queries on a single or even multiple HDF5 files.
-This is shown in one of the subchapters here. However, this may not always be the fastest way to find data
-in an HDF5 file. A more effective way is to map the metadata to a dedicated database. One such example is MongoDB.
+This is shown in one of the sub-chapters here. However, this may not always be the fastest way to find data
+in an HDF5 file. A more effective way is to map the metadata to a dedicated database. One such example, which is also implemented in the toolbox, is MongoDB.
 The query is performed on the much more efficient dedicated database, and the result is then mapped back to the original file to continue working.
 
-You may implement a new interface between HDF5 and your preferable database solution yourself.
+Note that you may also implement a custom interface between HDF5 and your preferred database solution yourself.
 For this, please inherit from the abstract class `h5rdmtoolbox.database.interface.HDF5DBInterface`. The
 following class diagram shows the architecture of the database interfaces:
 

diff --git a/h5rdmtoolbox/database/__init__.py b/h5rdmtoolbox/database/__init__.py
@@ -1,7 +1,81 @@
+import pathlib
+from typing import Optional
+from typing import Protocol
+
 from . import lazy
 from .hdfdb import FileDB
 from .hdfdb import FilesDB, ObjDB
+from .interface import HDF5DBInterface
+
+
+def find(source, *args, **kwargs):
+    if isinstance(source, (str, pathlib.Path)):
+        return FileDB(source).find(*args, **kwargs)
+    elif isinstance(source, (list, tuple)):
+        return FilesDB(source).find(*args, **kwargs)
+    else:
+        return ObjDB(source).find(*args, **kwargs)
+
+
+def find_one(source, *args, **kwargs):
+    if isinstance(source, (str, pathlib.Path)):
+        return FileDB(source).find_one(*args, **kwargs)
+    elif isinstance(source, (list, tuple)):
+        return FilesDB(source).find_one(*args, **kwargs)
+    else:
+        return ObjDB(source).find_one(*args, **kwargs)
+
+
+class RDFQuerySource(Protocol):
+    def rdf(self, *args, **kwargs):
+        pass
+
+
+def rdf_find(source, *,
+             rdf_subject: Optional[str] = None,
+             rdf_type: Optional[str] = None,
+             rdf_predicate: Optional[str] = None,
+             rdf_object: Optional[str] = None,
+             recursive: bool = True):
+    """Find function for RDF triples
+
+    Parameters
+    ----------
+    source: Union[str, pathlib.Path, h5tbx.Group]
+        Filename or hdf group
+    """
+    if isinstance(source, (str, pathlib.Path)):
+        return FileDB(source).rdf_find(rdf_subject=rdf_subject,
+                                       rdf_type=rdf_type,
+                                       rdf_predicate=rdf_predicate,
+                                       rdf_object=rdf_object,
+                                       recursive=recursive)
+    elif isinstance(source, (list, tuple)):
+        return FilesDB(source).rdf_find(rdf_subject=rdf_subject,
+                                        rdf_type=rdf_type,
+                                        rdf_predicate=rdf_predicate,
+                                        rdf_object=rdf_object,
+                                        recursive=recursive)
+    else:
+        return ObjDB(source).rdf_find(rdf_subject=rdf_subject,
+                                      rdf_type=rdf_type,
+                                      rdf_predicate=rdf_predicate,
+                                      rdf_object=rdf_object,
+                                      recursive=recursive)
+    # from .. import File
+    # from .lazy import lazy
+    # if isinstance(source, (str, pathlib.Path)):
+    #     with File(source) as h5:
+    #         return rdf_find(h5, rdf_subject=rdf_subject,
+    #                         rdf_type=rdf_type,
+    #                         rdf_predicate=rdf_predicate,
+    #                         rdf_object=rdf_object,
+    #                         recursive=recursive)
+    # return lazy(source.rdf.find(rdf_subject=rdf_subject,
+    #                             rdf_type=rdf_type,
+    #                             rdf_predicate=rdf_predicate,
+    #                             rdf_object=rdf_object,
+    #                             recursive=recursive))
 
-from .template import HDF5DBInterface
 
 __all__ = ['lazy', 'FileDB', 'FilesDB', 'ObjDB', 'HDF5DBInterface']
diff --git a/h5rdmtoolbox/database/hdfdb/__init__.py b/h5rdmtoolbox/database/hdfdb/__init__.py
@@ -1,4 +1,18 @@
+import pathlib
+
+import h5rdmtoolbox as h5tbx
 from .filedb import FileDB, FilesDB
 from .objdb import ObjDB
 
-__all__ = ['ObjDB', 'FileDB', 'FilesDB']
+
+def find(source, *args, **kwargs):
+    if isinstance(source, (str, pathlib.Path)):
+        with h5tbx.File(source, mode='r') as h5:
+            return find(h5, *args, **kwargs)
+    elif isinstance(source, (list, tuple)):
+        raise NotImplementedError('find does not support multiple sources')
+    else:
+        return ObjDB(source).find(*args, **kwargs)
+
+
+__all__ = ['ObjDB', 'FileDB', 'FilesDB', 'find']
diff --git a/h5rdmtoolbox/database/hdfdb/filedb.py b/h5rdmtoolbox/database/hdfdb/filedb.py
@@ -1,11 +1,11 @@
-import h5py
 import pathlib
-from typing import Union, Generator, List
+from typing import Union, Generator, List, Optional
+
+import h5py
 
-from .nonsearchable import NonInsertableDatabaseInterface
 from .objdb import ObjDB
 from .. import lazy
-from ..template import HDF5DBInterface
+from ..interface import HDF5DBInterface, NonInsertableDatabaseInterface
 
 
 class FileDB(NonInsertableDatabaseInterface, HDF5DBInterface):
@@ -15,6 +15,7 @@ def __init__(self, filename: Union[str, pathlib.Path]):
         self.filename: str = str(filename)
         self.find = self._instance_find  # allow `find` to be a static method and instance method
         self.find_one = self._instance_find_one  # allow `find_one` to be a static method and instance method
+        self.rdf_find = self._instance_rdf_find
 
     @staticmethod
     def find_one(filename: Union[str, pathlib.Path], *args, **kwargs) -> lazy.LHDFObject:
@@ -26,6 +27,19 @@ def _instance_find(self, *args, **kwargs):
         with h5py.File(self.filename, 'r') as h5:
             return list(ObjDB(h5).find(*args, **kwargs))
 
+    def _instance_rdf_find(self, *,
+             rdf_subject: Optional[str] = None,
+             rdf_type: Optional[str] = None,
+             rdf_predicate: Optional[str] = None,
+             rdf_object: Optional[str] = None,
+             recursive: bool = True):
+        with h5py.File(self.filename, 'r') as h5:
+            return list(ObjDB(h5).rdf_find(rdf_subject=rdf_subject,
+                        rdf_type=rdf_type,
+                        rdf_predicate=rdf_predicate,
+                        rdf_object=rdf_object,
+                        recursive=recursive))
+
     def _instance_find_one(self, *args, **kwargs):
         with h5py.File(self.filename, 'r') as h5:
             return ObjDB(h5).find_one(*args, **kwargs)
@@ -34,14 +48,15 @@ def _instance_find_one(self, *args, **kwargs):
     def find(file_or_filename, *args, **kwargs) -> Generator[lazy.LHDFObject, None, None]:
         """Please refer to the docstring of the find method of the ObjDB class"""
         if isinstance(file_or_filename, (h5py.Group, h5py.Dataset)):
-            results = list(ObjDB(file_or_filename).find(*args, **kwargs))
-            for r in results:
-                yield r
+            return list(ObjDB(file_or_filename).find(*args, **kwargs))
+            # for r in results:
+            #     yield r
         else:
             with h5py.File(file_or_filename, 'r') as h5:
                 results = list(ObjDB(h5).find(*args, **kwargs))
-            for r in results:
-                yield r
+            return results
+            # for r in results:
+            #     yield r
 
 
 class FilesDB(NonInsertableDatabaseInterface, HDF5DBInterface):
@@ -88,6 +103,6 @@ def find(self, *args, **kwargs) -> Generator[lazy.LHDFObject, None, None]:
         """Call find on all the files"""
         for filename in self.filenames:
             with h5py.File(filename, 'r') as h5:
-                ret = ObjDB(h5).find(*args, **kwargs)
-                for r in ret:
-                    yield r
+                return ObjDB(h5).find(*args, **kwargs)
+                # for r in ret:
+                #     yield r
diff --git a/h5rdmtoolbox/database/hdfdb/nonsearchable.py b/h5rdmtoolbox/database/hdfdb/nonsearchable.py
diff --git a/h5rdmtoolbox/database/hdfdb/objdb.py b/h5rdmtoolbox/database/hdfdb/objdb.py
@@ -1,11 +1,11 @@
+from typing import Union, Dict, List, Callable, Generator, Optional
+
 import h5py
 import numpy as np
-from typing import Union, Dict, List, Callable, Generator, Optional
 
 from . import query, utils
-from .nonsearchable import NonInsertableDatabaseInterface
 from .. import lazy
-from ..template import HDF5DBInterface
+from ..interface import HDF5DBInterface, NonInsertableDatabaseInterface
 
 
 def basename(name: str) -> str:
@@ -426,6 +426,7 @@ def __init__(self, obj: Union[h5py.Dataset, h5py.Group]):
         else:
             raise TypeError(f'Unexpected type: {type(obj)}')
         self.find = self._instance_find  # allow `find` to be a static method and instance method
+        self.rdf_find = self._instance_rdf_find  # allow `find` to be a static method and instance method
         self.find_one = self._instance_find_one  # allow `find_one` to be a static method and instance method
 
     @staticmethod
@@ -473,7 +474,7 @@ def _instance_find(self,
                        flt: Union[Dict, str],
                        objfilter=None,
                        recursive: bool = True,
-                       ignore_attribute_error: bool = False) -> Generator[lazy.LHDFObject, None, None]:
+                       ignore_attribute_error: bool = False) -> List[lazy.LHDFObject]:
         if isinstance(self.src_obj, h5py.Dataset) and recursive:
             recursive = False
         results = find(self.src_obj,
@@ -482,9 +483,25 @@ def _instance_find(self,
                        recursive=recursive,
                        find_one=False,
                        ignore_attribute_error=ignore_attribute_error)
-
-        for r in results:
-            yield r
+        return results
+
+    def _instance_rdf_find(self, *,
+                           rdf_subject: Optional[str] = None,
+                           rdf_type: Optional[str] = None,
+                           rdf_predicate: Optional[str] = None,
+                           rdf_object: Optional[str] = None,
+                           recursive: bool = True) -> List[lazy.LHDFObject]:
+        """Find objects based on rdf triples"""
+        import h5rdmtoolbox as h5tbx
+        if isinstance(self.src_obj, h5py.Group):
+            src_obj = h5tbx.Group(self.src_obj)
+        else:
+            src_obj = h5tbx.Dataset(self.src_obj)
+        return lazy.lazy(src_obj.rdf.find(rdf_subject=rdf_subject,
+                                          rdf_type=rdf_type,
+                                          rdf_predicate=rdf_predicate,
+                                          rdf_object=rdf_object,
+                                          recursive=recursive))
 
     def distinct(self, key: str,
                  objfilter: Optional[Union[h5py.Group, h5py.Dataset]] = None):

diff --git a/h5rdmtoolbox/database/hdfdb/query.py b/h5rdmtoolbox/database/hdfdb/query.py
@@ -1,9 +1,10 @@
 """query module"""
 import logging
-import numpy as np
 import re
 import warnings
 
+import numpy as np
+
 logger = logging.getLogger('h5rdmtoolbox')
 
 

diff --git a/h5rdmtoolbox/database/template.py → h5rdmtoolbox/database/interface.py b/h5rdmtoolbox/database/template.py → h5rdmtoolbox/database/interface.py
@@ -1,10 +1,23 @@
 import abc
-import h5py
 from typing import Generator
 
+import h5py
+
 from .lazy import LHDFObject
 
 
+class NonInsertableDatabaseInterface:
+    """A database interface that does not allow inserting datasets"""
+
+    def insert_dataset(self, *args, **kwargs):
+        """Insert a dataset. This is not possible for an HDF5 file."""
+        raise NotImplementedError('By using an HDF5 file as a database, you cannot insert datasets')
+
+    def insert_group(self, *args, **kwargs):
+        """Insert a group. This is not possible for an HDF5 file."""
+        raise NotImplementedError('By using an HDF5 file as a database, you cannot insert groups')
+
+
 class HDF5DBInterface(abc.ABC):
     """Abstract HDF5 Database interface.
 

diff --git a/h5rdmtoolbox/database/lazy.py b/h5rdmtoolbox/database/lazy.py
@@ -176,7 +176,7 @@ def _get_dataset_properties(h5obj, keys):
 
 
 def lazy(h5obj: Union[List[Union[h5py.Group, h5py.Dataset, LHDFObject]],
-                      h5py.Dataset, h5py.Group, LHDFObject]) -> Union[None, LDataset, LGroup]:
+                      h5py.Dataset, h5py.Group, LHDFObject]) -> Union[None, List[LHDFObject], LHDFObject]:
     """Make a lazy object from a h5py object"""
     if isinstance(h5obj, LHDFObject):
         return h5obj

diff --git a/h5rdmtoolbox/database/mongo.py b/h5rdmtoolbox/database/mongo.py
@@ -8,7 +8,7 @@
 from typing import List, Dict, Any, Union, Generator
 
 from . import lazy
-from .template import HDF5DBInterface
+from .interface import HDF5DBInterface
 from .. import protected_attributes
 
 

diff --git a/h5rdmtoolbox/tutorial.py b/h5rdmtoolbox/tutorial.py
@@ -1,12 +1,14 @@
 """
 Tutorial module providing easy access to particular data.
 """
-import numpy as np
 import os
 import pathlib
-import xarray as xr
 from typing import List
 
+import numpy as np
+import xarray as xr
+from rdflib import FOAF
+
 import h5rdmtoolbox as h5tbx
 from h5rdmtoolbox.convention.standard_names.table import StandardNameTable
 from .utils import generate_temporary_directory
@@ -326,7 +328,10 @@ def generate_fluid_hdf_file() -> pathlib.Path:
     with h5tbx.File() as h5:
         h5.write_iso_timestamp(name='timestamp', dt=None)  # writes the current date time in iso format to the attribute
         h5.attrs['project'] = 'tutorial'
-        h5.attrs['contact'] = {'name': 'John Doe', 'surname': 'Doe'}
+        contact_grp = h5.create_group('contact')
+        contact_grp.attrs['name', FOAF.firstName] = 'John'
+        contact_grp.attrs['surname', FOAF.lastName] = 'Doe'
+
         h5.attrs['check_value'] = 0
         h5.create_dataset('pressure1', data=np.random.random(size=10) * 800,
                           attrs=dict(units='Pa', standard_name='pressure',

diff --git a/h5rdmtoolbox/wrapper/core.py b/h5rdmtoolbox/wrapper/core.py
@@ -884,7 +884,8 @@ def find_one(self, flt: Union[Dict, str],
         """See ObjDB.find_one()"""
         return ObjDB(self).find_one(flt, objfilter, recursive, ignore_attribute_error)
 
-    def find(self, flt: Union[Dict, str],
+    def find(self,
+             flt: Union[Dict, str],
              objfilter: Union[str, h5py.Dataset, h5py.Group, None] = None,
              recursive: bool = True,
              ignore_attribute_error: bool = False) -> Generator[LHDFObject, None, None]: