Skip to content

Commit

Permalink
simplify db interface
Browse files Browse the repository at this point in the history
  • Loading branch information
matthiasprobst committed Apr 24, 2024
1 parent 8fdf264 commit 8c3c525
Show file tree
Hide file tree
Showing 19 changed files with 812 additions and 642 deletions.
Binary file added docs/colab/example.hdf
Binary file not shown.
415 changes: 415 additions & 0 deletions docs/practical_examples/knowledge_graph.ipynb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions docs/userguide/database/firstSteps.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
"# First steps: HDF5 and databases\n",
"\n",
"There are two ways of working with HDF5 and databases:\n",
"1. Using HDF5 file(s) as a database\n",
"2. Writing HDF5 content into dedicated database solutions\n",
"1. Using HDF5 file(s) as a database itself.\n",
"2. Writing HDF5 content into dedicated database solutions.\n",
"\n",
"Both ways will be described in the next two chapters. The second approach is currently implemented for a [mongoDB](https://pymongo.readthedocs.io/en/stable/)-interface.\n",
"\n",
Expand Down Expand Up @@ -989,7 +989,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.18"
"version": "3.8.19"
}
},
"nbformat": 4,
Expand Down
717 changes: 137 additions & 580 deletions docs/userguide/database/hdfDB.ipynb

Large diffs are not rendered by default.

9 changes: 3 additions & 6 deletions docs/userguide/database/index.rst
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
Database
========

After creating one or more HDF5 files, or receiving them from someone else, we would like to explore the data. More precisely, we want to identify groups or datasets
based on their attribute values.

HDF5 can be considered a database itself, as it allows multiple datasets and their metadata (attributes) to be stored in a single file. Most of the time, you want to find records in an HDF5 file based on the attributes. However, the `h5py` package does not provide a function to do this.

The *h5rdmtoolbox* provides an interface to perform queries on a single or even multiple HDF5 files.
This is shown in one of the subchapters here. However, this may not always be the fastest way to find data
in an HDF5 file. A more effective way is to map the metadata to a dedicated database. One such example is MongoDB.
This is shown in one of the sub-chapters here. However, this may not always be the fastest way to find data
in an HDF5 file. A more effective way is to map the metadata to a dedicated database. One such example, which is implemented in the toolbox, is MongoDB.
The query is performed on the much more efficient dedicated database, then returned to the original file to continue working.

You may implement a new interface between HDF5 and your preferable database solution yourself.
Note that you may also implement a custom interface between HDF5 and your preferred database solution yourself.
For this, please inherit from the abstract class `h5rdmtoolbox.database.interface.HDF5DBInterface`. The
following class diagram shows the architecture of the database interfaces:

Expand Down
76 changes: 75 additions & 1 deletion h5rdmtoolbox/database/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,81 @@
import pathlib
from typing import Optional
from typing import Protocol

from . import lazy
from .hdfdb import FileDB
from .hdfdb import FilesDB, ObjDB
from .interface import HDF5DBInterface


def find(source, *args, **kwargs):
    """Run ``find`` on the database wrapper that matches the type of `source`.

    Parameters
    ----------
    source
        A ``str`` or ``pathlib.Path`` is wrapped in ``FileDB``, a ``list`` or
        ``tuple`` in ``FilesDB``, and any other object in ``ObjDB``.
    *args, **kwargs
        Forwarded unchanged to the ``find`` method of the selected wrapper.

    Returns
    -------
    Whatever the ``find`` method of the selected wrapper returns.
    """
    # Select the wrapper first, then issue the query in a single place.
    if isinstance(source, (str, pathlib.Path)):
        database = FileDB(source)
    elif isinstance(source, (list, tuple)):
        database = FilesDB(source)
    else:
        database = ObjDB(source)
    return database.find(*args, **kwargs)


def find_one(source, *args, **kwargs):
    """Run ``find_one`` on the database wrapper that matches the type of `source`.

    Parameters
    ----------
    source
        A ``str`` or ``pathlib.Path`` is wrapped in ``FileDB``, a ``list`` or
        ``tuple`` in ``FilesDB``, and any other object in ``ObjDB``.
    *args, **kwargs
        Forwarded unchanged to the ``find_one`` method of the selected wrapper.

    Returns
    -------
    Whatever the ``find_one`` method of the selected wrapper returns.
    """
    # Select the wrapper first, then issue the query in a single place.
    if isinstance(source, (str, pathlib.Path)):
        database = FileDB(source)
    elif isinstance(source, (list, tuple)):
        database = FilesDB(source)
    else:
        database = ObjDB(source)
    return database.find_one(*args, **kwargs)


class RDFQuerySource(Protocol):
    """Structural type for objects that provide an ``rdf`` method."""

    def rdf(self, *args, **kwargs):
        """Placeholder member of the protocol; it has no implementation."""
        ...


def rdf_find(source, *,
             rdf_subject: Optional[str] = None,
             rdf_type: Optional[str] = None,
             rdf_predicate: Optional[str] = None,
             rdf_object: Optional[str] = None,
             recursive: bool = True):
    """Find function for RDF triples.

    The query is delegated to the ``rdf_find`` method of the database wrapper
    that matches the type of `source`.

    Parameters
    ----------
    source
        A ``str`` or ``pathlib.Path`` is wrapped in ``FileDB``, a ``list`` or
        ``tuple`` in ``FilesDB``, and any other object (e.g. an HDF group) in
        ``ObjDB``.
    rdf_subject, rdf_type, rdf_predicate, rdf_object: Optional[str]
        Keyword-only query terms, forwarded unchanged to the wrapper.
    recursive: bool
        Forwarded unchanged to the wrapper.

    Returns
    -------
    Whatever the ``rdf_find`` method of the selected wrapper returns.
    """
    # Collect the keyword arguments once so every branch forwards the same set.
    query = dict(rdf_subject=rdf_subject,
                 rdf_type=rdf_type,
                 rdf_predicate=rdf_predicate,
                 rdf_object=rdf_object,
                 recursive=recursive)

    if isinstance(source, (str, pathlib.Path)):
        database = FileDB(source)
    elif isinstance(source, (list, tuple)):
        database = FilesDB(source)
    else:
        database = ObjDB(source)
    return database.rdf_find(**query)
# from .. import File
# from .lazy import lazy
# if isinstance(source, (str, pathlib.Path)):
# with File(source) as h5:
# return rdf_find(h5, rdf_subject=rdf_subject,
# rdf_type=rdf_type,
# rdf_predicate=rdf_predicate,
# rdf_object=rdf_object,
# recursive=recursive)
# return lazy(source.rdf.find(rdf_subject=rdf_subject,
# rdf_type=rdf_type,
# rdf_predicate=rdf_predicate,
# rdf_object=rdf_object,
# recursive=recursive))

from .template import HDF5DBInterface

__all__ = ['lazy', 'FileDB', 'FilesDB', 'ObjDB', 'HDF5DBInterface']
16 changes: 15 additions & 1 deletion h5rdmtoolbox/database/hdfdb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,18 @@
import pathlib

import h5rdmtoolbox as h5tbx
from .filedb import FileDB, FilesDB
from .objdb import ObjDB

__all__ = ['ObjDB', 'FileDB', 'FilesDB']

def find(source, *args, **kwargs):
    """Query a single source for HDF5 objects.

    Parameters
    ----------
    source
        A ``str`` or ``pathlib.Path`` is opened read-only with ``h5tbx.File``
        and the opened file is passed to this function again. A ``list`` or
        ``tuple`` is rejected. Any other object is wrapped in ``ObjDB``.
    *args, **kwargs
        Forwarded unchanged to ``ObjDB(...).find``.

    Raises
    ------
    NotImplementedError
        If `source` is a ``list`` or ``tuple``.
    """
    if isinstance(source, (str, pathlib.Path)):
        # Re-dispatch with the opened file while it is still open.
        with h5tbx.File(source, mode='r') as opened_file:
            return find(opened_file, *args, **kwargs)

    if isinstance(source, (list, tuple)):
        raise NotImplementedError('find does not support multiple sources')

    return ObjDB(source).find(*args, **kwargs)


__all__ = ['ObjDB', 'FileDB', 'FilesDB', 'find']
39 changes: 27 additions & 12 deletions h5rdmtoolbox/database/hdfdb/filedb.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import h5py
import pathlib
from typing import Union, Generator, List
from typing import Union, Generator, List, Optional

import h5py

from .nonsearchable import NonInsertableDatabaseInterface
from .objdb import ObjDB
from .. import lazy
from ..template import HDF5DBInterface
from ..interface import HDF5DBInterface, NonInsertableDatabaseInterface


class FileDB(NonInsertableDatabaseInterface, HDF5DBInterface):
Expand All @@ -15,6 +15,7 @@ def __init__(self, filename: Union[str, pathlib.Path]):
self.filename: str = str(filename)
self.find = self._instance_find # allow `find` to be a static method and instance method
self.find_one = self._instance_find_one # allow `find_one` to be a static method and instance method
self.rdf_find = self._instance_rdf_find

@staticmethod
def find_one(filename: Union[str, pathlib.Path], *args, **kwargs) -> lazy.LHDFObject:
Expand All @@ -26,6 +27,19 @@ def _instance_find(self, *args, **kwargs):
with h5py.File(self.filename, 'r') as h5:
return list(ObjDB(h5).find(*args, **kwargs))

def _instance_rdf_find(self, *,
                       rdf_subject: Optional[str] = None,
                       rdf_type: Optional[str] = None,
                       rdf_predicate: Optional[str] = None,
                       rdf_object: Optional[str] = None,
                       recursive: bool = True):
    """Run an RDF query on the file stored in ``self.filename``.

    The file is opened read-only with ``h5py.File`` and all keyword arguments
    are forwarded to ``ObjDB(...).rdf_find``. The result is turned into a list
    inside the ``with`` block, i.e. before the file is closed.
    """
    query = dict(rdf_subject=rdf_subject,
                 rdf_type=rdf_type,
                 rdf_predicate=rdf_predicate,
                 rdf_object=rdf_object,
                 recursive=recursive)
    with h5py.File(self.filename, 'r') as h5_file:
        matches = ObjDB(h5_file).rdf_find(**query)
        return list(matches)

def _instance_find_one(self, *args, **kwargs):
with h5py.File(self.filename, 'r') as h5:
return ObjDB(h5).find_one(*args, **kwargs)
Expand All @@ -34,14 +48,15 @@ def _instance_find_one(self, *args, **kwargs):
def find(file_or_filename, *args, **kwargs) -> Generator[lazy.LHDFObject, None, None]:
"""Please refer to the docstring of the find method of the ObjDB class"""
if isinstance(file_or_filename, (h5py.Group, h5py.Dataset)):
results = list(ObjDB(file_or_filename).find(*args, **kwargs))
for r in results:
yield r
return list(ObjDB(file_or_filename).find(*args, **kwargs))
# for r in results:
# yield r
else:
with h5py.File(file_or_filename, 'r') as h5:
results = list(ObjDB(h5).find(*args, **kwargs))
for r in results:
yield r
return results
# for r in results:
# yield r


class FilesDB(NonInsertableDatabaseInterface, HDF5DBInterface):
Expand Down Expand Up @@ -88,6 +103,6 @@ def find(self, *args, **kwargs) -> Generator[lazy.LHDFObject, None, None]:
"""Call find on all the files"""
for filename in self.filenames:
with h5py.File(filename, 'r') as h5:
ret = ObjDB(h5).find(*args, **kwargs)
for r in ret:
yield r
return ObjDB(h5).find(*args, **kwargs)
# for r in ret:
# yield r
11 changes: 0 additions & 11 deletions h5rdmtoolbox/database/hdfdb/nonsearchable.py

This file was deleted.

31 changes: 24 additions & 7 deletions h5rdmtoolbox/database/hdfdb/objdb.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from typing import Union, Dict, List, Callable, Generator, Optional

import h5py
import numpy as np
from typing import Union, Dict, List, Callable, Generator, Optional

from . import query, utils
from .nonsearchable import NonInsertableDatabaseInterface
from .. import lazy
from ..template import HDF5DBInterface
from ..interface import HDF5DBInterface, NonInsertableDatabaseInterface


def basename(name: str) -> str:
Expand Down Expand Up @@ -426,6 +426,7 @@ def __init__(self, obj: Union[h5py.Dataset, h5py.Group]):
else:
raise TypeError(f'Unexpected type: {type(obj)}')
self.find = self._instance_find # allow `find` to be a static method and instance method
self.rdf_find = self._instance_rdf_find # allow `find` to be a static method and instance method
self.find_one = self._instance_find_one # allow `find_one` to be a static method and instance method

@staticmethod
Expand Down Expand Up @@ -473,7 +474,7 @@ def _instance_find(self,
flt: Union[Dict, str],
objfilter=None,
recursive: bool = True,
ignore_attribute_error: bool = False) -> Generator[lazy.LHDFObject, None, None]:
ignore_attribute_error: bool = False) -> List[lazy.LHDFObject]:
if isinstance(self.src_obj, h5py.Dataset) and recursive:
recursive = False
results = find(self.src_obj,
Expand All @@ -482,9 +483,25 @@ def _instance_find(self,
recursive=recursive,
find_one=False,
ignore_attribute_error=ignore_attribute_error)

for r in results:
yield r
return results

def _instance_rdf_find(self, *,
                       rdf_subject: Optional[str] = None,
                       rdf_type: Optional[str] = None,
                       rdf_predicate: Optional[str] = None,
                       rdf_object: Optional[str] = None,
                       recursive: bool = True) -> List[lazy.LHDFObject]:
    """Find objects based on RDF triples.

    ``self.src_obj`` is wrapped in ``h5tbx.Group`` if it is an ``h5py.Group``
    and in ``h5tbx.Dataset`` otherwise. The query is run through the wrapper's
    ``rdf.find`` and the result is passed through ``lazy.lazy``.
    """
    # Imported inside the method as in the original code
    # (presumably to avoid a circular import - TODO confirm).
    import h5rdmtoolbox as h5tbx

    wrapper_cls = h5tbx.Group if isinstance(self.src_obj, h5py.Group) else h5tbx.Dataset
    wrapped = wrapper_cls(self.src_obj)
    found = wrapped.rdf.find(rdf_subject=rdf_subject,
                             rdf_type=rdf_type,
                             rdf_predicate=rdf_predicate,
                             rdf_object=rdf_object,
                             recursive=recursive)
    return lazy.lazy(found)

def distinct(self, key: str,
objfilter: Optional[Union[h5py.Group, h5py.Dataset]] = None):
Expand Down
3 changes: 2 additions & 1 deletion h5rdmtoolbox/database/hdfdb/query.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""query module"""
import logging
import numpy as np
import re
import warnings

import numpy as np

logger = logging.getLogger('h5rdmtoolbox')


Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,23 @@
import abc
import h5py
from typing import Generator

import h5py

from .lazy import LHDFObject


class NonInsertableDatabaseInterface:
    """Base class for database interfaces that do not support inserting.

    Both insert methods accept arbitrary arguments and always raise
    ``NotImplementedError``.
    """

    def insert_dataset(self, *args, **kwargs):
        """Always raise ``NotImplementedError``: datasets cannot be inserted."""
        message = 'By using an HDF5 file as a database, you cannot insert datasets'
        raise NotImplementedError(message)

    def insert_group(self, *args, **kwargs):
        """Always raise ``NotImplementedError``: groups cannot be inserted."""
        message = 'By using an HDF5 file as a database, you cannot insert groups'
        raise NotImplementedError(message)


class HDF5DBInterface(abc.ABC):
"""Abstract HDF5 Database interface.
Expand Down
2 changes: 1 addition & 1 deletion h5rdmtoolbox/database/lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def _get_dataset_properties(h5obj, keys):


def lazy(h5obj: Union[List[Union[h5py.Group, h5py.Dataset, LHDFObject]],
h5py.Dataset, h5py.Group, LHDFObject]) -> Union[None, LDataset, LGroup]:
h5py.Dataset, h5py.Group, LHDFObject]) -> Union[None, List[LHDFObject], LHDFObject]:
"""Make a lazy object from a h5py object"""
if isinstance(h5obj, LHDFObject):
return h5obj
Expand Down
2 changes: 1 addition & 1 deletion h5rdmtoolbox/database/mongo.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import List, Dict, Any, Union, Generator

from . import lazy
from .template import HDF5DBInterface
from .interface import HDF5DBInterface
from .. import protected_attributes


Expand Down
11 changes: 8 additions & 3 deletions h5rdmtoolbox/tutorial.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
"""
Tutorial module providing easy access to particular data.
"""
import numpy as np
import os
import pathlib
import xarray as xr
from typing import List

import numpy as np
import xarray as xr
from rdflib import FOAF

import h5rdmtoolbox as h5tbx
from h5rdmtoolbox.convention.standard_names.table import StandardNameTable
from .utils import generate_temporary_directory
Expand Down Expand Up @@ -326,7 +328,10 @@ def generate_fluid_hdf_file() -> pathlib.Path:
with h5tbx.File() as h5:
h5.write_iso_timestamp(name='timestamp', dt=None) # writes the current date time in iso format to the attribute
h5.attrs['project'] = 'tutorial'
h5.attrs['contact'] = {'name': 'John Doe', 'surname': 'Doe'}
contact_grp = h5.create_group('contact')
contact_grp.attrs['name', FOAF.firstName] = 'John'
contact_grp.attrs['surname', FOAF.lastName] = 'Doe'

h5.attrs['check_value'] = 0
h5.create_dataset('pressure1', data=np.random.random(size=10) * 800,
attrs=dict(units='Pa', standard_name='pressure',
Expand Down
3 changes: 2 additions & 1 deletion h5rdmtoolbox/wrapper/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,8 @@ def find_one(self, flt: Union[Dict, str],
"""See ObjDB.find_one()"""
return ObjDB(self).find_one(flt, objfilter, recursive, ignore_attribute_error)

def find(self, flt: Union[Dict, str],
def find(self,
flt: Union[Dict, str],
objfilter: Union[str, h5py.Dataset, h5py.Group, None] = None,
recursive: bool = True,
ignore_attribute_error: bool = False) -> Generator[LHDFObject, None, None]:
Expand Down
Loading

0 comments on commit 8c3c525

Please sign in to comment.