Skip to content

Commit

Permalink
Merge branch 'main' into colab
Browse files Browse the repository at this point in the history
  • Loading branch information
matthiasprobst committed Jan 4, 2024
2 parents 593e4b7 + cc7078b commit 57cf28a
Show file tree
Hide file tree
Showing 10 changed files with 335 additions and 29 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

Log of changes in the versions

## v1.2.1
- Add codemeta namespace
- Improved json-ld export
- Updated qudt namespace
- The Colab notebook is now managed on a separate `colab` branch; the README badge links to that branch

## v1.2.0
- Improved assignment of IRI to attributes
- Export of a JSON-LD file possible
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ Current implementation highlights in the modules:

A quickstart notebook can be tested by clicking on the following badge:

[![Open Quickstart Notebook](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/matthiasprobst/h5RDMtoolbox/blob/main/docs/colab/quickstart.ipynb)
[![Open Quickstart Notebook](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/matthiasprobst/h5RDMtoolbox/blob/colab/docs/colab/quickstart.ipynb)

## Documentation

Expand Down
2 changes: 1 addition & 1 deletion codemeta.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"license": "https://spdx.org/licenses/MIT",
"codeRepository": "git+https://github.com/matthiasprobst/h5RDMtoolbox.git",
"name": "h5RDMtoolbox",
"version": "1.2.2",
"version": "1.2.1",
"description": "Supporting a FAIR Research Data lifecycle using Python and HDF5.",
"applicationCategory": "Engineering",
"programmingLanguage": [
Expand Down
3 changes: 2 additions & 1 deletion h5rdmtoolbox/namespace.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .wrapper.namespaces._codemeta_namespace import CODEMETA
from .wrapper.namespaces._m4i_namespace import M4I
from .wrapper.namespaces._obo_namespace import OBO
from .wrapper.namespaces._qudt_quantitykind_namespace import QUDT_QUANTITYKIND
from .wrapper.namespaces._qudt_unit_namespace import QUDT_UNIT

__all__ = ['M4I', 'OBO', 'QUDT_UNIT', 'QUDT_QUANTITYKIND']
__all__ = ['M4I', 'OBO', 'QUDT_UNIT', 'QUDT_QUANTITYKIND', 'CODEMETA']
9 changes: 6 additions & 3 deletions h5rdmtoolbox/wrapper/h5attr.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import ast
import h5py
import json
import numpy as np
import pint
import rdflib
from h5py._hl.base import with_phil
Expand Down Expand Up @@ -110,10 +111,13 @@ def _parse_return_value(_id, ret):
# might be a list object
try:
return ast.literal_eval(ret)
# return ast.literal_eval(ret.replace(' ', ', '))
except (ValueError, NameError, AttributeError):
return ret
return ret
return AttributeString(ret)
if isinstance(ret, np.ndarray) and ret.dtype.name == 'object':
return WrapperAttributeManager._parse_return_value(_id, str(ret.tolist()))
return ret

@with_phil
Expand All @@ -132,8 +136,8 @@ def create(self,
name,
data,
shape=None, dtype=None,
predicate: Union[str, rdflib.URIRef]=None,
object: Union[str, rdflib.URIRef]=None):
predicate: Union[str, rdflib.URIRef] = None,
object: Union[str, rdflib.URIRef] = None):
r = super().create(name,
utils.parse_object_for_attribute_setting(data),
shape, dtype)
Expand Down Expand Up @@ -286,7 +290,6 @@ def raw(self) -> "h5py.AttributeManager":
with phil:
return attrs.AttributeManager(self._parent)


# class IRIAttr:
# """Helper class to write attributes together with an IRI
#
Expand Down
98 changes: 80 additions & 18 deletions h5rdmtoolbox/wrapper/jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,56 @@
from typing import Dict


def _merge_entries(entries: Dict, clean: bool = True) -> Dict:
_entries = entries.copy()

ids = list(entries.keys())

delete_candidates = []

for _id, entry in entries.items():
for k, v in entry.items():
if clean and len(entry) == 1:
# remove empty entry, Note, this could be a problem if the entry references elsewhere...
delete_candidates.append(_id)
continue
if k != '@id':
if isinstance(v, list):
if all([i in ids for i in v]):
_entries[_id][k] = [_entries.pop(i) for i in v]

elif v in ids:
_entries[_id][k] = _entries.pop(v)
if clean:
for dc in delete_candidates:
_entries.pop(dc, None)
return _entries


def dumpd(grp,
iri_only=False,
file_url="file://./",
recursive: bool = True) -> Dict:
file_url="",
recursive: bool = True,
compact: bool = False) -> Dict:
"""Dump a group or a dataset to to dict."""

if isinstance(grp, (str, pathlib.Path)):
from .core import File
with File(grp) as h5:
return dumpd(h5, iri_only, file_url, recursive=recursive)
return dumpd(h5, iri_only, file_url, recursive=recursive, compact=compact)

assert isinstance(grp, (h5py.Group, h5py.Dataset))

def _get_id(_grp):
stem = pathlib.Path(_grp.file.filename).stem
return file_url + stem + _grp.name
return file_url + 'grp:' + _grp.name

entries = []
entries = {}

def _get_dict(_name: str, node):
j = {"@id": _get_id(node)}
_id = node.attrs.get('@id', None)
if _id is None:
_id = _get_id(node)
j = {"@id": _id}
s = node.iri.subject
if s is not None:
j["@type"] = str(s)
Expand All @@ -35,37 +64,70 @@ def _get_dict(_name: str, node):
value = str(node.iri.object[k])
else:
if isinstance(v, (h5py.Group, h5py.Dataset)):
value = _get_id(v)
if '@id' in v.attrs:
value = v.attrs['@id']
else:
value = _get_id(v)
else:
value = str(v)
if isinstance(v, (list, tuple)):
value = [str(i) for i in v]
else:
value = str(v)
j[str(node.iri.predicate[k])] = value
else:
if not iri_only:
j[k] = str(v)
entries.append(j)
entries[_id] = j
# entries.append(j)

_get_dict(grp.name, grp)

if recursive and isinstance(grp, h5py.Group):
grp.visititems(_get_dict)
# return grp.visititems(_get_dict)

# merge entries, e.g. {"@id": "foo", "author": "grp:/123"} and {"@id": "grp:/123", "name": "MP"}
# -> {"@id": "foo", "author": {"@id": "grp:/123", "name": "MP"}}
entries = _merge_entries(entries, clean=True)

if len(entries) == 1:
return {"@graph": entries[0]}
return {"@graph": entries}
keys = list(entries.keys())
jsonld_dict = {"@graph": entries[keys[0]]}
else:
jsonld_dict = {"@graph": list(entries.values())}

if compact:
from rdflib import Graph
g = Graph().parse(data=json.dumps(jsonld_dict), format='json-ld')
return json.loads(g.serialize(format='json-ld', indent=2, compact=True))

return jsonld_dict


def dumps(grp, iri_only=False,
file_url="file://./",
file_url="",
recursive: bool = True,
**kwargs):
compact: bool = False,
**kwargs) -> str:
"""Dump a group or a dataset to to string."""
return json.dumps(dumpd(grp=grp, iri_only=iri_only, file_url=file_url, recursive=recursive), **kwargs)
return json.dumps(dumpd(
grp=grp, iri_only=iri_only, file_url=file_url, recursive=recursive, compact=compact),
**kwargs
)


def dump(grp,
fp,
iri_only=False,
file_url="file://./",
recursive: bool = True):
file_url="",
recursive: bool = True,
compact: bool = False,
**kwargs):
"""Dump a group or a dataset to to file."""
return json.dump(dumpd(grp, iri_only, file_url, recursive), fp, indent=4)
return json.dump(
dumpd(
grp, iri_only, file_url, recursive=recursive, compact=compact
),
fp,
**kwargs
)
53 changes: 49 additions & 4 deletions h5rdmtoolbox/wrapper/namespaces/_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import requests
import warnings

from rdflib import Graph

__this_dir__ = pathlib.Path(__file__).parent


Expand Down Expand Up @@ -57,7 +59,6 @@ def generate_namespace_file(namespace: str):

def generate_qudt_unit_namespace():
"""Generate the qudt namespace."""
from rdflib import Graph

namespace = 'qudt_unit'

Expand Down Expand Up @@ -87,7 +88,6 @@ def generate_qudt_unit_namespace():

def generate_qudt_quantitykind_namespace():
"""Generate the qudt namespace."""
from rdflib import Graph

namespace = 'qudt_quantitykind'

Expand Down Expand Up @@ -115,8 +115,53 @@ def generate_qudt_quantitykind_namespace():
f.write('\n\nQUDT_QUANTITYKIND = _QUDT_QUANTITYKIND()')


def generate_codemeta_namespace():
    """Generate ``_codemeta_namespace.py`` from the CodeMeta 2.0 JSON-LD context.

    The context is downloaded into a temporary ``_codemeta.jsonld`` file next
    to this module (unless such a file already exists). Every term of the
    context that carries an ``@id`` is resolved to a full IRI and written as a
    ``URIRef`` attribute of the generated ``_CODEMETA`` class. The temporary
    context file is removed again, also if the generation fails.

    Raises
    ------
    requests.HTTPError
        If the context cannot be downloaded.
    """
    namespace = 'codemeta'
    source = 'https://raw.githubusercontent.com/codemeta/codemeta/2.0/codemeta.jsonld'
    context_file = __this_dir__ / f'_{namespace}.jsonld'

    try:
        if not context_file.exists():
            response = requests.get(source, timeout=60)
            # Fail early instead of storing an error page as context file.
            response.raise_for_status()
            with open(context_file, 'w', encoding='utf-8') as f:
                f.write(response.text)

        # The compacted serialization provides the prefix -> IRI mapping.
        g = Graph().parse(source, format='json-ld')
        compact_context = json.loads(g.serialize(format='json-ld', indent=4, auto_compact=True))

        with open(context_file, encoding='utf-8') as f:
            context = json.load(f)
    finally:
        context_file.unlink(missing_ok=True)

    prefixes = compact_context['@context']

    uri_refs = {}
    for k, v in context['@context'].items():
        # Skip the keyword aliases and plain prefix definitions (strings).
        if k in ('type', 'id') or not isinstance(v, dict) or '@id' not in v:
            continue
        term_id = v['@id']
        prefix, sep, local_name = term_id.partition(':')
        if sep and prefix in prefixes:
            uri_refs[k] = prefixes[prefix] + local_name
        else:
            # No known prefix (e.g. already an absolute IRI): take it verbatim.
            uri_refs[k] = term_id

    with open(__this_dir__ / f'_{namespace}_namespace.py', 'w',
              encoding='UTF8') as f:
        f.write('# automatically generated from https://codemeta.github.io/terms/\n')
        f.write('from rdflib.namespace import Namespace\n')
        f.write('from rdflib.term import URIRef\n\n\n')
        f.write('class _CODEMETA:')

        for k, v in uri_refs.items():
            f.write(f'\n    {k} = URIRef("{v}")')

        f.write('\n\n    _NS = Namespace("https://codemeta.github.io/terms/")')

        f.write('\n\n')
        f.write('\n\nCODEMETA = _CODEMETA()')


if __name__ == '__main__':
# generate_namespace_file('m4i') # be careful, german lines must be manually uncommented
# generate_namespace_file('obo') # be careful, german lines must be manually uncommented
generate_qudt_unit_namespace() # write _qudt_namespace.py manually
generate_qudt_quantitykind_namespace() # write _qudt_quantitykind_namespace.py manually
# generate_qudt_unit_namespace() # write _qudt_namespace.py manually
# generate_qudt_quantitykind_namespace() # write _qudt_quantitykind_namespace.py manually
generate_codemeta_namespace()
82 changes: 82 additions & 0 deletions h5rdmtoolbox/wrapper/namespaces/_codemeta_namespace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# automatically generated from https://codemeta.github.io/terms/
from rdflib.namespace import Namespace
from rdflib.term import URIRef


class _CODEMETA:
    """Collection of CodeMeta context terms as ``URIRef`` class attributes.

    Each attribute name is a term and its value the IRI of that term, e.g.
    ``_CODEMETA.author`` is ``URIRef("https://schema.org/author")``.

    NOTE: the module header marks this file as automatically generated, so
    manual edits are presumably lost when the file is regenerated.
    """
    # Terms whose IRI is in the schema.org vocabulary
    Organization = URIRef("https://schema.org/Organization")
    Person = URIRef("https://schema.org/Person")
    SoftwareSourceCode = URIRef("https://schema.org/SoftwareSourceCode")
    SoftwareApplication = URIRef("https://schema.org/SoftwareApplication")
    Text = URIRef("https://schema.org/Text")
    URL = URIRef("https://schema.org/URL")
    address = URIRef("https://schema.org/address")
    affiliation = URIRef("https://schema.org/affiliation")
    applicationCategory = URIRef("https://schema.org/applicationCategory")
    applicationSubCategory = URIRef("https://schema.org/applicationSubCategory")
    citation = URIRef("https://schema.org/citation")
    codeRepository = URIRef("https://schema.org/codeRepository")
    contributor = URIRef("https://schema.org/contributor")
    copyrightHolder = URIRef("https://schema.org/copyrightHolder")
    copyrightYear = URIRef("https://schema.org/copyrightYear")
    creator = URIRef("https://schema.org/creator")
    dateCreated = URIRef("https://schema.org/dateCreated")
    dateModified = URIRef("https://schema.org/dateModified")
    datePublished = URIRef("https://schema.org/datePublished")
    description = URIRef("https://schema.org/description")
    downloadUrl = URIRef("https://schema.org/downloadUrl")
    email = URIRef("https://schema.org/email")
    editor = URIRef("https://schema.org/editor")
    encoding = URIRef("https://schema.org/encoding")
    familyName = URIRef("https://schema.org/familyName")
    fileFormat = URIRef("https://schema.org/fileFormat")
    fileSize = URIRef("https://schema.org/fileSize")
    funder = URIRef("https://schema.org/funder")
    givenName = URIRef("https://schema.org/givenName")
    hasPart = URIRef("https://schema.org/hasPart")
    identifier = URIRef("https://schema.org/identifier")
    installUrl = URIRef("https://schema.org/installUrl")
    isAccessibleForFree = URIRef("https://schema.org/isAccessibleForFree")
    isPartOf = URIRef("https://schema.org/isPartOf")
    keywords = URIRef("https://schema.org/keywords")
    license = URIRef("https://schema.org/license")
    memoryRequirements = URIRef("https://schema.org/memoryRequirements")
    name = URIRef("https://schema.org/name")
    operatingSystem = URIRef("https://schema.org/operatingSystem")
    permissions = URIRef("https://schema.org/permissions")
    position = URIRef("https://schema.org/position")
    processorRequirements = URIRef("https://schema.org/processorRequirements")
    producer = URIRef("https://schema.org/producer")
    programmingLanguage = URIRef("https://schema.org/programmingLanguage")
    provider = URIRef("https://schema.org/provider")
    publisher = URIRef("https://schema.org/publisher")
    relatedLink = URIRef("https://schema.org/relatedLink")
    releaseNotes = URIRef("https://schema.org/releaseNotes")
    runtimePlatform = URIRef("https://schema.org/runtimePlatform")
    sameAs = URIRef("https://schema.org/sameAs")
    softwareHelp = URIRef("https://schema.org/softwareHelp")
    softwareRequirements = URIRef("https://schema.org/softwareRequirements")
    softwareVersion = URIRef("https://schema.org/softwareVersion")
    sponsor = URIRef("https://schema.org/sponsor")
    storageRequirements = URIRef("https://schema.org/storageRequirements")
    supportingData = URIRef("https://schema.org/supportingData")
    targetProduct = URIRef("https://schema.org/targetProduct")
    url = URIRef("https://schema.org/url")
    version = URIRef("https://schema.org/version")
    author = URIRef("https://schema.org/author")
    # Terms whose IRI is in the CodeMeta vocabulary
    softwareSuggestions = URIRef("https://codemeta.github.io/terms/softwareSuggestions")
    contIntegration = URIRef("https://codemeta.github.io/terms/contIntegration")
    buildInstructions = URIRef("https://codemeta.github.io/terms/buildInstructions")
    developmentStatus = URIRef("https://codemeta.github.io/terms/developmentStatus")
    embargoDate = URIRef("https://codemeta.github.io/terms/embargoDate")
    funding = URIRef("https://codemeta.github.io/terms/funding")
    readme = URIRef("https://codemeta.github.io/terms/readme")
    issueTracker = URIRef("https://codemeta.github.io/terms/issueTracker")
    referencePublication = URIRef("https://codemeta.github.io/terms/referencePublication")
    maintainer = URIRef("https://codemeta.github.io/terms/maintainer")

    # rdflib Namespace object for the CodeMeta terms IRI
    _NS = Namespace("https://codemeta.github.io/terms/")



CODEMETA = _CODEMETA()
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = h5rdmtoolbox
version = 1.2.2
version = 1.2.1
author = Matthias Probst
author_email = matthias.probst@kit.edu
description = Supporting a FAIR Research Data lifecycle using Python and HDF5.
Expand Down
Loading

0 comments on commit 57cf28a

Please sign in to comment.