Skip to content

Commit

Permalink
Count references (#77)
Browse files Browse the repository at this point in the history
* model and data transformation for count references

* template update

* Assumption reordering (#78)

---------

Co-authored-by: Thomas Zajac <thomas-jakob.zajac@uni-tuebingen.de>
  • Loading branch information
sbilge and mephenor authored Aug 28, 2024
1 parent c8ce093 commit 080432e
Show file tree
Hide file tree
Showing 35 changed files with 974 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ ARG USER_GID=$USER_UID
RUN if [ "$USER_GID" != "1000" ] || [ "$USER_UID" != "1000" ]; then groupmod --gid $USER_GID vscode && usermod --uid $USER_UID --gid $USER_GID vscode; fi

# [Option] Install Node.js
ARG INSTALL_NODE="true"
ARG INSTALL_NODE="false"
ARG NODE_VERSION="lts/*"
RUN if [ "${INSTALL_NODE}" = "true" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi

Expand Down
2 changes: 1 addition & 1 deletion .pyproject_generation/pyproject_custom.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ name = "metldata"
version = "1.0.0"
description = "metldata - A framework for handling metadata based on ETL, CQRS, and event sourcing."
dependencies = [
"schemapack == 2.0.0-alpha.3"
"schemapack == 2.0.0-alpha.4"
]

[project.urls]
Expand Down
1 change: 1 addition & 0 deletions .template/mandatory_files.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ lock/requirements-dev.txt
lock/requirements.txt

Dockerfile
Dockerfile.debian
config_schema.json
example_config.yaml
LICENSE
Expand Down
1 change: 0 additions & 1 deletion .template/mandatory_files_ignore.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

scripts/script_utils/fastapi_app_location.py

Dockerfile
config_schema.json
example_config.yaml

Expand Down
50 changes: 50 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
# for the German Human Genome-Phenome Archive (GHGA)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Multi-stage build: `base` is shared by all stages, `builder` produces the
# service wheel, `dep-builder` installs the locked dependencies, and `runner`
# is the final image assembled from the outputs of the other two.

# BASE: a base image with updated packages
FROM python:3.12-alpine AS base
RUN apk upgrade --no-cache --available

# BUILDER: a container to build the service wheel
FROM base AS builder
RUN pip install build
# the whole build context is copied, so lock/requirements.txt is available to
# later stages via this stage
COPY . /service
WORKDIR /service
# writes the build artifacts to /service/dist
RUN python -m build

# DEP-BUILDER: a container to (build and) install dependencies
FROM base AS dep-builder
RUN apk update
# compiler toolchain and headers; presumably needed for dependencies that have
# to be compiled on Alpine - TODO confirm which packages require them
RUN apk add build-base gcc g++ libffi-dev zlib-dev
RUN apk upgrade --available
WORKDIR /service
COPY --from=builder /service/lock/requirements.txt /service
# --no-deps: install exactly what the requirements file lists, without letting
# pip resolve additional dependencies
RUN pip install --no-deps -r requirements.txt

# RUNNER: a container to run the service
FROM base AS runner
WORKDIR /service
# replace the base image's Python library tree wholesale with the one from
# dep-builder, so the build toolchain itself never reaches the final image
RUN rm -rf /usr/local/lib/python3.12
COPY --from=dep-builder /usr/local/lib/python3.12 /usr/local/lib/python3.12
COPY --from=builder /service/dist/ /service
# dependencies were already installed above, hence --no-deps for the wheel
RUN pip install --no-deps *.whl
RUN rm *.whl
# run the service as an unprivileged user from its home directory
RUN adduser -D appuser
WORKDIR /home/appuser
USER appuser
# make Python write stdout/stderr unbuffered
ENV PYTHONUNBUFFERED=1

# Please adapt to package name:
ENTRYPOINT ["metldata"]
48 changes: 48 additions & 0 deletions Dockerfile.debian
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
# for the German Human Genome-Phenome Archive (GHGA)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## creating building container
FROM python:3.12-slim-bookworm AS builder
# update and install dependencies
# The index refresh and the upgrade share ONE layer: with separate RUN steps a
# cached `update` layer can be reused, so the upgrade runs against a stale
# index. `apt-get` is used because `apt` has no stable CLI for scripts, and the
# downloaded package lists are removed in the same layer.
RUN apt-get update \
    && apt-get upgrade -y \
    && rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir build
# copy code
COPY . /service
WORKDIR /service
# build wheel (artifacts end up in /service/dist)
RUN python -m build

# creating running container
FROM python:3.12-slim-bookworm
# update and install dependencies (single layer, see builder stage)
RUN apt-get update \
    && apt-get upgrade -y \
    && rm -rf /var/lib/apt/lists/*
# copy and install requirements and wheel
WORKDIR /service
COPY --from=builder /service/lock/requirements.txt /service
# --no-deps: install exactly what the requirements file lists;
# --no-cache-dir: keep pip's download cache out of the runtime image
RUN pip install --no-deps --no-cache-dir -r requirements.txt
RUN rm requirements.txt
COPY --from=builder /service/dist/ /service
# dependencies were installed above, hence --no-deps for the service wheel
RUN pip install --no-deps --no-cache-dir *.whl
RUN rm *.whl
# create new user and execute as that user
RUN useradd --create-home appuser
WORKDIR /home/appuser
USER appuser
# set environment (unbuffered stdout/stderr)
ENV PYTHONUNBUFFERED=1
# Please adapt to package name:
ENTRYPOINT ["metldata"]
4 changes: 2 additions & 2 deletions lock/requirements-dev-template.in
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# common requirements for development and testing of services

pytest>=8.2
pytest-asyncio>=0.23.6
pytest-asyncio>=0.23.7
pytest-cov>=5
snakeviz>=2.2
logot>=1.3
Expand Down Expand Up @@ -29,4 +29,4 @@ setuptools>=69.5
# required since switch to pyproject.toml and pip-tools
tomli_w>=1.0

uv>=0.1.44
uv>=0.2.13
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ name = "metldata"
version = "1.0.0"
description = "metldata - A framework for handling metadata based on ETL, CQRS, and event sourcing."
dependencies = [
"schemapack == 2.0.0-alpha.3",
"schemapack == 2.0.0-alpha.4",
]

[project.license]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from pydantic import GetJsonSchemaHandler, ValidationInfo

from metldata.builtin_transformations.infer_relations.path.path_str import (
from metldata.builtin_transformations.common.path.path_str import (
PATH_PATTERN,
ValidationError,
clean_path_str,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

import re

from metldata.builtin_transformations.infer_relations.path.path_elements import (
from metldata.builtin_transformations.common.path.path_elements import (
RelationPathElement,
RelationPathElementType,
)
Expand Down
20 changes: 20 additions & 0 deletions src/metldata/builtin_transformations/count_references/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
# for the German Human Genome-Phenome Archive (GHGA)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A transformation to count how often specific classes are referenced along given relation paths."""

# shortcuts:
from metldata.builtin_transformations.count_references.main import ( # noqa: F401
COUNT_REFERENCES_TRANSFORMATION,
)
176 changes: 176 additions & 0 deletions src/metldata/builtin_transformations/count_references/assumptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
# for the German Human Genome-Phenome Archive (GHGA)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"Assumptions for count references transformation"

from schemapack.spec.schemapack import SchemaPack

from metldata.builtin_transformations.add_content_properties.path import (
resolve_schema_object_path,
)
from metldata.builtin_transformations.common.path.path import RelationPath
from metldata.builtin_transformations.common.path.path_elements import (
RelationPathElement,
RelationPathElementType,
)
from metldata.builtin_transformations.count_references.instruction import (
AddReferenceCountPropertyInstruction,
)
from metldata.transform.base import ModelAssumptionError, MultiplicityError


def check_model_assumptions(
    schema: SchemaPack,
    instructions_by_class: dict[str, list[AddReferenceCountPropertyInstruction]],
) -> None:
    """Validate every count-references instruction against the given schema.

    The individual checks run per instruction in a fixed order; the first
    violated assumption raises, and nothing is returned if all of them hold.
    """
    # The class-name keys are not needed here: every instruction carries the
    # information the individual checks read.
    for class_instructions in instructions_by_class.values():
        for instruction in class_instructions:
            assert_only_direct_relations(instruction)
            assert_class_is_source(instruction)

            relation_path = instruction.source_relation_path
            assert_path_classes_and_relations_exist(schema, relation_path)
            assert_multiplicity(schema, relation_path)

            assert_object_path_exists(schema, instruction)


def assert_only_direct_relations(instruction: AddReferenceCountPropertyInstruction):
    """Ensure that the instruction's relation path describes a direct relation.

    Only direct relations are supported, which is the case if the relation path
    contains exactly one path element.

    Raises:
        ModelAssumptionError:
            if the relation path does not consist of exactly one element.
    """
    relation_path = instruction.source_relation_path
    num_elements = len(relation_path.elements)
    if num_elements != 1:
        # The leading space of the second fragment matters: without it the path
        # string and the following word run together in the error message.
        raise ModelAssumptionError(
            f"The provided relation path {relation_path.path_str}"
            f" does not describe a direct relation, but contains {num_elements}"
            " different relations"
        )


def assert_class_is_source(instruction: AddReferenceCountPropertyInstruction):
    """Ensure the class named in the instruction matches its relation path.

    Every element of the instruction's relation path is checked against the
    name of the class that is to receive the reference count property; the
    per-element check raises if the names do not match.
    """
    elements = instruction.source_relation_path.elements
    for element in elements:
        _validate_modification_class(
            path_element=element,
            expected_class_name=instruction.class_name,
        )


def _validate_modification_class(
    path_element: RelationPathElement, expected_class_name: str
):
    """Check that `expected_class_name` is the class the path element points at.

    For an active path element the class to be modified is the element's
    source; for any other element type it is the element's target.

    Raises:
        ModelAssumptionError:
            if the expected class name differs from the class derived from the
            path element.
    """
    if path_element.type_ == RelationPathElementType.ACTIVE:
        modification_class_name = path_element.source
    else:
        modification_class_name = path_element.target

    if expected_class_name == modification_class_name:
        return

    raise ModelAssumptionError(
        f"Class {expected_class_name} does not correspond to the relation source "
        f"{modification_class_name}."
    )


def assert_path_classes_and_relations_exist(model: SchemaPack, path: RelationPath):
    """Verify that the model defines everything the relation path refers to.

    For each path element, both the source and the target class must exist.
    The relation property is then looked up on the source class for active
    elements and on the target class for passive elements.

    Raises:
        ModelAssumptionError:
            if the model does not fulfill the assumptions.
    """
    for element in path.elements:
        _check_class_exists(model, element.source)
        _check_class_exists(model, element.target)

        element_type = element.type_
        if element_type == RelationPathElementType.ACTIVE:
            relation_owner = element.source
        elif element_type == RelationPathElementType.PASSIVE:
            relation_owner = element.target
        else:
            # Neither active nor passive: there is no relation to look up.
            continue

        _check_relation_exists(model, relation_owner, element.property)


def _check_class_exists(model: SchemaPack, class_name: str) -> None:
    """Raise a ModelAssumptionError unless `class_name` is a class of `model`."""
    if class_name in model.classes:
        return
    raise ModelAssumptionError(f"Class {class_name} not found in model.")


def _check_relation_exists(model: SchemaPack, class_name: str, relation: str):
    """Raise a ModelAssumptionError unless `class_name` defines `relation`.

    The class itself is looked up directly in the model, so it must already be
    known to exist when this helper is called.
    """
    known_relations = model.classes[class_name].relations
    if relation in known_relations:
        return
    raise ModelAssumptionError(
        f"Relation property {relation} not found in class {class_name}."
    )


def assert_multiplicity(model: SchemaPack, path: RelationPath):
    """Ensure relation targets along the path may contribute multiple instances.

    Only active path elements are inspected; elements of any other type are
    skipped without a check.

    Raises:
        MultiplicityError:
            if the relation of an active path element is not flagged as having
            multiple targets.
    """
    for element in path.elements:
        if element.type_ != RelationPathElementType.ACTIVE:
            continue

        source_class = model.classes[element.source]
        relation = source_class.relations[element.property]
        if relation.multiple.target:
            continue

        raise MultiplicityError(
            f"The target of the relation {element.property} does not contribute"
            " multiple instances to the relation."
        )


def assert_object_path_exists(
    model: SchemaPack,
    instruction: AddReferenceCountPropertyInstruction,
) -> None:
    """Ensure the class, the object path, and the property exist in the model.

    Three things are verified in order: the class named by the instruction is
    part of the model, the instruction's object path can be resolved within the
    content schema of that class, and the schema found at that path lists the
    instruction's property name among its properties.

    Raises:
        ModelAssumptionError:
            if any of the three checks fails.
    """
    class_name = instruction.class_name
    class_def = model.classes.get(class_name)

    # The class to be modified has to be part of the model.
    if not class_def:
        raise ModelAssumptionError(f"Class {class_name} does not exist in the model.")

    # The object path has to resolve inside the content schema of the class;
    # the resolver signals a missing path with a KeyError.
    try:
        container_schema = resolve_schema_object_path(
            json_schema=class_def.content.json_schema_dict,
            path=instruction.target_content.object_path,
        )
    except KeyError as exc:
        object_path = instruction.target_content.object_path
        message = f"Object path {object_path} does not exist in class {class_name}."
        raise ModelAssumptionError(message) from exc

    # The property has to be declared in the schema found at the object path.
    property_name = instruction.target_content.property_name
    declared_properties = container_schema.get("properties", {})
    if property_name in declared_properties:
        return

    raise ModelAssumptionError(
        f"Property {property_name} does not exist in class {class_name}."
    )
Loading

0 comments on commit 080432e

Please sign in to comment.