diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 7eb17e2..b21af00 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -8,7 +8,7 @@ ARG USER_GID=$USER_UID RUN if [ "$USER_GID" != "1000" ] || [ "$USER_UID" != "1000" ]; then groupmod --gid $USER_GID vscode && usermod --uid $USER_UID --gid $USER_GID vscode; fi # [Option] Install Node.js -ARG INSTALL_NODE="true" +ARG INSTALL_NODE="false" ARG NODE_VERSION="lts/*" RUN if [ "${INSTALL_NODE}" = "true" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi diff --git a/.pyproject_generation/pyproject_custom.toml b/.pyproject_generation/pyproject_custom.toml index 6903989..021fdac 100644 --- a/.pyproject_generation/pyproject_custom.toml +++ b/.pyproject_generation/pyproject_custom.toml @@ -4,7 +4,7 @@ name = "metldata" version = "1.0.0" description = "metldata - A framework for handling metadata based on ETL, CQRS, and event sourcing." dependencies = [ - "schemapack == 2.0.0-alpha.3" + "schemapack == 2.0.0-alpha.4" ] [project.urls] diff --git a/.template/mandatory_files.txt b/.template/mandatory_files.txt index 660a15e..fcfb0f8 100644 --- a/.template/mandatory_files.txt +++ b/.template/mandatory_files.txt @@ -23,6 +23,7 @@ lock/requirements-dev.txt lock/requirements.txt Dockerfile +Dockerfile.debian config_schema.json example_config.yaml LICENSE diff --git a/.template/mandatory_files_ignore.txt b/.template/mandatory_files_ignore.txt index 156fcd6..5ddc433 100644 --- a/.template/mandatory_files_ignore.txt +++ b/.template/mandatory_files_ignore.txt @@ -8,7 +8,6 @@ scripts/script_utils/fastapi_app_location.py -Dockerfile config_schema.json example_config.yaml diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..0c01c4d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,50 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed 
under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# BASE: a base image with updated packages +FROM python:3.12-alpine AS base +RUN apk upgrade --no-cache --available + +# BUILDER: a container to build the service wheel +FROM base AS builder +RUN pip install build +COPY . /service +WORKDIR /service +RUN python -m build + +# DEP-BUILDER: a container to (build and) install dependencies +FROM base AS dep-builder +RUN apk update +RUN apk add build-base gcc g++ libffi-dev zlib-dev +RUN apk upgrade --available +WORKDIR /service +COPY --from=builder /service/lock/requirements.txt /service +RUN pip install --no-deps -r requirements.txt + +# RUNNER: a container to run the service +FROM base AS runner +WORKDIR /service +RUN rm -rf /usr/local/lib/python3.12 +COPY --from=dep-builder /usr/local/lib/python3.12 /usr/local/lib/python3.12 +COPY --from=builder /service/dist/ /service +RUN pip install --no-deps *.whl +RUN rm *.whl +RUN adduser -D appuser +WORKDIR /home/appuser +USER appuser +ENV PYTHONUNBUFFERED=1 + +# Please adapt to package name: +ENTRYPOINT ["metldata"] diff --git a/Dockerfile.debian b/Dockerfile.debian new file mode 100644 index 0000000..4cd862b --- /dev/null +++ b/Dockerfile.debian @@ -0,0 +1,48 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +## creating building container +FROM python:3.12-slim-bookworm AS builder +# update and install dependencies +RUN apt update +RUN apt upgrade -y +RUN pip install build +# copy code +COPY . /service +WORKDIR /service +# build wheel +RUN python -m build + +# creating running container +FROM python:3.12-slim-bookworm +# update and install dependencies +RUN apt update +RUN apt upgrade -y +# copy and install requirements and wheel +WORKDIR /service +COPY --from=builder /service/lock/requirements.txt /service +RUN pip install --no-deps -r requirements.txt +RUN rm requirements.txt +COPY --from=builder /service/dist/ /service +RUN pip install --no-deps *.whl +RUN rm *.whl +# create new user and execute as that user +RUN useradd --create-home appuser +WORKDIR /home/appuser +USER appuser +# set environment +ENV PYTHONUNBUFFERED=1 +# Please adapt to package name: +ENTRYPOINT ["metldata"] diff --git a/lock/requirements-dev-template.in b/lock/requirements-dev-template.in index dd81066..97ce387 100644 --- a/lock/requirements-dev-template.in +++ b/lock/requirements-dev-template.in @@ -1,7 +1,7 @@ # common requirements for development and testing of services pytest>=8.2 -pytest-asyncio>=0.23.6 +pytest-asyncio>=0.23.7 pytest-cov>=5 snakeviz>=2.2 logot>=1.3 @@ -29,4 +29,4 @@ setuptools>=69.5 # required since switch to pyproject.toml and pip-tools tomli_w>=1.0 -uv>=0.1.44 +uv>=0.2.13 diff --git a/pyproject.toml b/pyproject.toml index 40be2f4..d9ac0ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ name = "metldata" version = "1.0.0" description = 
"metldata - A framework for handling metadata based on ETL, CQRS, and event sourcing." dependencies = [ - "schemapack == 2.0.0-alpha.3", + "schemapack == 2.0.0-alpha.4", ] [project.license] diff --git a/src/metldata/builtin_transformations/infer_relations/path/__init__.py b/src/metldata/builtin_transformations/common/path/__init__.py similarity index 100% rename from src/metldata/builtin_transformations/infer_relations/path/__init__.py rename to src/metldata/builtin_transformations/common/path/__init__.py diff --git a/src/metldata/builtin_transformations/infer_relations/path/path.py b/src/metldata/builtin_transformations/common/path/path.py similarity index 98% rename from src/metldata/builtin_transformations/infer_relations/path/path.py rename to src/metldata/builtin_transformations/common/path/path.py index f6ac3cf..fefba31 100644 --- a/src/metldata/builtin_transformations/infer_relations/path/path.py +++ b/src/metldata/builtin_transformations/common/path/path.py @@ -18,7 +18,7 @@ from pydantic import GetJsonSchemaHandler, ValidationInfo -from metldata.builtin_transformations.infer_relations.path.path_str import ( +from metldata.builtin_transformations.common.path.path_str import ( PATH_PATTERN, ValidationError, clean_path_str, diff --git a/src/metldata/builtin_transformations/infer_relations/path/path_elements.py b/src/metldata/builtin_transformations/common/path/path_elements.py similarity index 100% rename from src/metldata/builtin_transformations/infer_relations/path/path_elements.py rename to src/metldata/builtin_transformations/common/path/path_elements.py diff --git a/src/metldata/builtin_transformations/infer_relations/path/path_str.py b/src/metldata/builtin_transformations/common/path/path_str.py similarity index 98% rename from src/metldata/builtin_transformations/infer_relations/path/path_str.py rename to src/metldata/builtin_transformations/common/path/path_str.py index 6d3e548..8cf26ec 100644 --- 
a/src/metldata/builtin_transformations/infer_relations/path/path_str.py +++ b/src/metldata/builtin_transformations/common/path/path_str.py @@ -18,7 +18,7 @@ import re -from metldata.builtin_transformations.infer_relations.path.path_elements import ( +from metldata.builtin_transformations.common.path.path_elements import ( RelationPathElement, RelationPathElementType, ) diff --git a/src/metldata/builtin_transformations/count_references/__init__.py b/src/metldata/builtin_transformations/count_references/__init__.py new file mode 100644 index 0000000..efe6907 --- /dev/null +++ b/src/metldata/builtin_transformations/count_references/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""A transformation to count how often specific classes are referenced along given relation paths.""" + +# shortcuts: +from metldata.builtin_transformations.count_references.main import ( # noqa: F401 + COUNT_REFERENCES_TRANSFORMATION, +) diff --git a/src/metldata/builtin_transformations/count_references/assumptions.py b/src/metldata/builtin_transformations/count_references/assumptions.py new file mode 100644 index 0000000..b6df458 --- /dev/null +++ b/src/metldata/builtin_transformations/count_references/assumptions.py @@ -0,0 +1,176 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"Assumptions for count references transformation" + +from schemapack.spec.schemapack import SchemaPack + +from metldata.builtin_transformations.add_content_properties.path import ( + resolve_schema_object_path, +) +from metldata.builtin_transformations.common.path.path import RelationPath +from metldata.builtin_transformations.common.path.path_elements import ( + RelationPathElement, + RelationPathElementType, +) +from metldata.builtin_transformations.count_references.instruction import ( + AddReferenceCountPropertyInstruction, +) +from metldata.transform.base import ModelAssumptionError, MultiplicityError + + +def check_model_assumptions( + schema: SchemaPack, + instructions_by_class: dict[str, list[AddReferenceCountPropertyInstruction]], +) -> None: + """Check the model assumptions for the count references transformation.""" + for _, instructions in instructions_by_class.items(): + for instruction in instructions: + assert_only_direct_relations(instruction) + assert_class_is_source(instruction) + assert_path_classes_and_relations_exist( + schema, instruction.source_relation_path + ) + assert_multiplicity(schema, instruction.source_relation_path) + assert_object_path_exists(schema, instruction) + + +def assert_only_direct_relations(instruction: AddReferenceCountPropertyInstruction): + """Ensure that only direct relations are supported which should be the case if the + relation path only contains one path element. + """ + num_elements = len(instruction.source_relation_path.elements) + if num_elements != 1: + raise ModelAssumptionError( + f"The provided relation path { + instruction.source_relation_path.path_str}" + f"does not describe a direct relation, but contains { + num_elements} different relations" + ) + + +def assert_class_is_source(instruction: AddReferenceCountPropertyInstruction): + """Ensure that the class being modified with the reference count property is the expected class.
+ This function iterates over the elements of the relation path in the given instruction + and validates that the class being modified with the reference count property matches + the class specified in the relation path. + """ + for path_element in instruction.source_relation_path.elements: + _validate_modification_class(path_element, instruction.class_name) + + +def _validate_modification_class( + path_element: RelationPathElement, expected_class_name: str +): + """Check whether the class specified to be modified with the reference count + matches the source or target class in the provided `path_element`, depending on the + type of the relation path (i.e., active or passive). If the class does not match, + an exception is raised. + """ + modification_class_name = ( + path_element.source + if path_element.type_ == RelationPathElementType.ACTIVE + else path_element.target + ) + if expected_class_name != modification_class_name: + raise ModelAssumptionError( + f"Class { + expected_class_name} does not correspond to the relation source " + f"{modification_class_name}." + ) + + +def assert_path_classes_and_relations_exist(model: SchemaPack, path: RelationPath): + """Make sure that all classes and relations defined in the provided path exist in + the provided model. + + Raises: + ModelAssumptionError: + if the model does not fulfill the assumptions. 
+ """ + for path_element in path.elements: + _check_class_exists(model, path_element.source) + _check_class_exists(model, path_element.target) + + if path_element.type_ == RelationPathElementType.ACTIVE: + _check_relation_exists(model, path_element.source, path_element.property) + + if path_element.type_ == RelationPathElementType.PASSIVE: + _check_relation_exists(model, path_element.target, path_element.property) + + +def _check_class_exists(model: SchemaPack, class_name: str) -> None: + """Check if a class exists in the model and raise an error if not""" + if class_name not in model.classes: + raise ModelAssumptionError(f"Class {class_name} not found in model.") + + +def _check_relation_exists(model: SchemaPack, class_name: str, relation: str): + """Check if a relation exists in a class and raise an error if not""" + if relation not in model.classes[class_name].relations: + raise ModelAssumptionError( + f"Relation property { + relation} not found in class {class_name}." + ) + + +def assert_multiplicity(model: SchemaPack, path: RelationPath): + """Make sure the target of the relation contributes multiple instances to the relation.""" + for path_element in path.elements: + if path_element.type_ == RelationPathElementType.ACTIVE: + relation = model.classes[path_element.source].relations[ + path_element.property + ] + if not relation.multiple.target: + raise MultiplicityError( + f"The target of the relation { + path_element.property} does not contribute multiple instances to the relation." 
+ ) + + +def assert_object_path_exists( + model: SchemaPack, + instruction: AddReferenceCountPropertyInstruction, +) -> None: + """Make sure that the source class (the class being modified) and the object_path exist in the model.""" + class_name = instruction.class_name + class_def = model.classes.get(class_name) + + # Check if the class exists in the model + if not class_def: + raise ModelAssumptionError(f"Class {class_name} does not exist in the model.") + + # Check if the object_path already exists in the model + try: + target_schema = resolve_schema_object_path( + json_schema=class_def.content.json_schema_dict, + path=instruction.target_content.object_path, + ) + except KeyError as exc: + raise ModelAssumptionError( + f"Object path { + instruction.target_content.object_path} does not exist" + + f" in class {class_name}." + ) from exc + + # Check if the property_name already exists in the model + if instruction.target_content.property_name not in target_schema.get( + "properties", {} + ): + raise ModelAssumptionError( + f"Property { + instruction.target_content.property_name} does not exist" + + f" in class {class_name}." + ) diff --git a/src/metldata/builtin_transformations/count_references/config.py b/src/metldata/builtin_transformations/count_references/config.py new file mode 100644 index 0000000..96f4c50 --- /dev/null +++ b/src/metldata/builtin_transformations/count_references/config.py @@ -0,0 +1,50 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Models used to describe count content properties that shall be calculated and added.""" + +from pydantic import Field +from pydantic_settings import BaseSettings, SettingsConfigDict + +from metldata.builtin_transformations.count_references.instruction import ( + AddReferenceCountPropertyInstruction, +) + + +class CountReferencesConfig(BaseSettings): + """Config containing instructions for counting references along relation paths.""" + + model_config = SettingsConfigDict(extra="forbid") + + count_references: list[AddReferenceCountPropertyInstruction] = Field( + ..., + description=( + "A list of instructions describing for which class and" + + " corresponding relation path references should be counted." + ), + examples=[], + ) + + def instructions_by_class( + self, + ) -> dict[str, list[AddReferenceCountPropertyInstruction]]: + """Returns a dictionary of instructions by class (i.e.
config for each class).""" + instructions_by_class: dict[ + str, list[AddReferenceCountPropertyInstruction] + ] = {} + for instruction in self.count_references: + instructions_by_class.setdefault(instruction.class_name, []).append( + instruction + ) + return instructions_by_class diff --git a/src/metldata/builtin_transformations/count_references/data_transform.py b/src/metldata/builtin_transformations/count_references/data_transform.py new file mode 100644 index 0000000..e0c67e5 --- /dev/null +++ b/src/metldata/builtin_transformations/count_references/data_transform.py @@ -0,0 +1,65 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Logic for transforming data.""" + +from schemapack.spec.datapack import DataPack + +from metldata.builtin_transformations.count_references.instruction import ( + AddReferenceCountPropertyInstruction, +) +from metldata.transform.base import EvitableTransformationError + + +def count_references( + *, + data: DataPack, + instructions_by_class: dict[str, list[AddReferenceCountPropertyInstruction]], +) -> DataPack: + """Given a data pack and a dictionary of instructions by class, + counts the references and adds the value to its corresponding content property. + + Args: + data: + The datapack to add the reference count values. 
+ instructions_by_class: + A dictionary mapping class names to lists of instructions. + + Returns: + The data with the reference counts added. + """ + modified_data = data.model_copy(deep=True) + for class_name, instructions in instructions_by_class.items(): + resources = modified_data.resources.get(class_name) + + if not resources: + raise EvitableTransformationError() + + for instruction in instructions: + for path_element in instruction.source_relation_path.elements: + relation_slot = path_element.property + + for resource in resources.values(): + related_to = resource.relations.get(relation_slot) + if not related_to: + raise EvitableTransformationError() + + count = len(related_to) if related_to else 0 + + resource.content[instruction.target_content.object_path].update( + {instruction.target_content.property_name: count} + ) + + return modified_data diff --git a/src/metldata/builtin_transformations/count_references/instruction.py b/src/metldata/builtin_transformations/count_references/instruction.py new file mode 100644 index 0000000..85d8bf6 --- /dev/null +++ b/src/metldata/builtin_transformations/count_references/instruction.py @@ -0,0 +1,42 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Models for instructions used in the 'count references' transformation.""" + +from pydantic import Field +from pydantic_settings import BaseSettings + +from metldata.builtin_transformations.common import NewContentSchemaPath +from metldata.builtin_transformations.common.path.path import RelationPath + + +class AddReferenceCountPropertyInstruction(BaseSettings): + """A model describing an instruction for adding a reference count property to the + content schema of a class. It defines the class to be modified, the target content + where the property will be added, and the relationship path that describes how the + classes are connected. + """ + + class_name: str = Field(..., description="The name of the class to modify.") + + target_content: NewContentSchemaPath = Field( + ..., + description="A NewContentSchemaPath that describes a path to an already" + + " existing object within the content schema and the name of a property to be" + + " added to that object's schema", + ) + source_relation_path: RelationPath = Field( + ..., + description="The path describing the relation between the classes of a metadata model.", + ) diff --git a/src/metldata/builtin_transformations/count_references/main.py b/src/metldata/builtin_transformations/count_references/main.py new file mode 100644 index 0000000..2d5d660 --- /dev/null +++ b/src/metldata/builtin_transformations/count_references/main.py @@ -0,0 +1,81 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A transformation to count references.""" + +from schemapack.spec.datapack import DataPack +from schemapack.spec.schemapack import SchemaPack + +from metldata.builtin_transformations.count_references.assumptions import ( + check_model_assumptions, +) +from metldata.builtin_transformations.count_references.config import ( + CountReferencesConfig, +) +from metldata.builtin_transformations.count_references.data_transform import ( + count_references, +) +from metldata.builtin_transformations.count_references.model_transform import ( + add_count_references, +) +from metldata.transform.base import DataTransformer, TransformationDefinition + + +class CountReferencesTransformer(DataTransformer[CountReferencesConfig]): + """A transformer that counts the references and adds them to content properties.""" + + def transform(self, data: DataPack) -> DataPack: + """Transforms data. + + Args: + data: The data as DataPack to be transformed. + """ + return count_references( + data=data, instructions_by_class=self._config.instructions_by_class() + ) + + +def check_model_assumptions_wrapper( + model: SchemaPack, config: CountReferencesConfig +) -> None: + """Check the assumptions of the model. + + Raises: + ModelAssumptionError: + if the model does not fulfill the assumptions. + """ + check_model_assumptions( + schema=model, instructions_by_class=config.instructions_by_class() + ) + + +def transform_model(model: SchemaPack, config: CountReferencesConfig) -> SchemaPack: + """Transform the data model. 
+ + Raises: + DataModelTransformationError: + if the transformation fails. + """ + return add_count_references( + model=model, instructions_by_class=config.instructions_by_class() + ) + + +COUNT_REFERENCES_TRANSFORMATION = TransformationDefinition[CountReferencesConfig]( + config_cls=CountReferencesConfig, + check_model_assumptions=check_model_assumptions_wrapper, + transform_model=transform_model, + data_transformer_factory=CountReferencesTransformer, +) diff --git a/src/metldata/builtin_transformations/count_references/model_transform.py b/src/metldata/builtin_transformations/count_references/model_transform.py new file mode 100644 index 0000000..4a45a72 --- /dev/null +++ b/src/metldata/builtin_transformations/count_references/model_transform.py @@ -0,0 +1,59 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Model transformation logic for the 'count references' transformation""" + +from schemapack.spec.schemapack import ( + SchemaPack, +) + +from metldata.builtin_transformations.add_content_properties.path import ( + resolve_schema_object_path, +) +from metldata.builtin_transformations.count_references.instruction import ( + AddReferenceCountPropertyInstruction, +) +from metldata.transform.base import EvitableTransformationError + + +def add_count_references( + *, + model: SchemaPack, + instructions_by_class: dict[str, list[AddReferenceCountPropertyInstruction]], +) -> SchemaPack: + """The content properties are added to the model with the 'add_content_properties' + step of the workflow. Thus, this function applies no transformation. + It only checks for EvitableTransformationError. + """ + for class_name, cls_instructions in instructions_by_class.items(): + class_def = model.classes.get(class_name) + + if not class_def: + raise EvitableTransformationError() + + content_schema = class_def.content.json_schema_dict + + for cls_instruction in cls_instructions: + try: + resolve_schema_object_path( + content_schema, cls_instruction.target_content.object_path + ) + except KeyError as e: + raise EvitableTransformationError() from e + + if cls_instruction.target_content.property_name in content_schema.get( + "properties", {} + ): + raise EvitableTransformationError() + return model diff --git a/src/metldata/builtin_transformations/infer_relations/assumptions.py b/src/metldata/builtin_transformations/infer_relations/assumptions.py index fa4836e..a933cbd 100644 --- a/src/metldata/builtin_transformations/infer_relations/assumptions.py +++ b/src/metldata/builtin_transformations/infer_relations/assumptions.py @@ -18,10 +18,10 @@ from schemapack.spec.schemapack import SchemaPack -from metldata.builtin_transformations.infer_relations.path.path import ( +from metldata.builtin_transformations.common.path.path import ( RelationPath, ) -from
metldata.builtin_transformations.infer_relations.path.path_elements import ( +from metldata.builtin_transformations.common.path.path_elements import ( RelationPathElementType, ) from metldata.builtin_transformations.infer_relations.relations import ( diff --git a/src/metldata/builtin_transformations/infer_relations/data_transform.py b/src/metldata/builtin_transformations/infer_relations/data_transform.py index f941d24..27422f9 100644 --- a/src/metldata/builtin_transformations/infer_relations/data_transform.py +++ b/src/metldata/builtin_transformations/infer_relations/data_transform.py @@ -47,10 +47,10 @@ from schemapack.spec.custom_types import ResourceId from schemapack.spec.datapack import DataPack, Resource -from metldata.builtin_transformations.infer_relations.path.path import ( +from metldata.builtin_transformations.common.path.path import ( RelationPath, ) -from metldata.builtin_transformations.infer_relations.path.path_elements import ( +from metldata.builtin_transformations.common.path.path_elements import ( RelationPathElement, RelationPathElementType, ) diff --git a/src/metldata/builtin_transformations/infer_relations/model_transform.py b/src/metldata/builtin_transformations/infer_relations/model_transform.py index 41c9efa..3a32fa9 100644 --- a/src/metldata/builtin_transformations/infer_relations/model_transform.py +++ b/src/metldata/builtin_transformations/infer_relations/model_transform.py @@ -24,8 +24,8 @@ SchemaPack, ) -from metldata.builtin_transformations.infer_relations.path.path import RelationPath -from metldata.builtin_transformations.infer_relations.path.path_elements import ( +from metldata.builtin_transformations.common.path.path import RelationPath +from metldata.builtin_transformations.common.path.path_elements import ( RelationPathElement, RelationPathElementType, ) @@ -50,7 +50,7 @@ def get_relation(element: RelationPathElement, schema: SchemaPack) -> Relation: return schema.classes[class_name].relations[element.property] -def 
infer_mutiplicity_from_path( +def infer_multiplicity_from_path( path: RelationPath, schema: SchemaPack ) -> MultipleRelationSpec: """Infer the multiplicity of an inferred relation based on the path. @@ -140,7 +140,7 @@ def add_inferred_relations( raise EvitableTransformationError() mandatory = infer_mandatory_from_path(instruction.path, model) - multiple = infer_mutiplicity_from_path(instruction.path, model) + multiple = infer_multiplicity_from_path(instruction.path, model) new_relation = Relation.model_validate( { "targetClass": instruction.target, diff --git a/src/metldata/builtin_transformations/infer_relations/relations.py b/src/metldata/builtin_transformations/infer_relations/relations.py index 962089b..abff442 100644 --- a/src/metldata/builtin_transformations/infer_relations/relations.py +++ b/src/metldata/builtin_transformations/infer_relations/relations.py @@ -18,7 +18,7 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator -from metldata.builtin_transformations.infer_relations.path.path import ( +from metldata.builtin_transformations.common.path.path import ( RelationPath, ) diff --git a/src/metldata/transform/base.py b/src/metldata/transform/base.py index bbdcfd1..946867e 100644 --- a/src/metldata/transform/base.py +++ b/src/metldata/transform/base.py @@ -44,6 +44,16 @@ class ModelAssumptionError(RuntimeError): """Raised when assumptions made by transformation step about a model are not met.""" +class MultiplicityError(ModelAssumptionError): + """Raised when a relation in the model does not conform to the required multiplicity + constraints. It occurs when the actual cardinality of a relationship within a model + fails to meet the expected multiplicity criteria of a transformation. E.g., + in 'count references' transformation, the target of a relation is required to + contribute multiple instances(`target=True`) to the relation, and this error is raised + if that condition is not satisfied. 
+ """ + + class ModelTransformationError(RuntimeError): """Raised when a transformation failed when applied to the schemapack-based model. This exception should only be raised when the error could not have been caught @@ -208,7 +218,8 @@ def validate_step_references( continue if step.input not in steps: raise ValueError( - f"Step {step.input} referenced in step {step_name} is not defined." + f"Step {step.input} referenced in step {step_name} " + "is not defined." ) if not step_with_no_input_found: @@ -232,7 +243,8 @@ def validate_artifact_references(cls, values): for artifact_name, step_name in artifacts.items(): if step_name not in steps: raise ValueError( - f"Step {step_name} referenced in artifact {artifact_name} is not defined." + f"Step {step_name} referenced in artifact {artifact_name} " + "is not defined." ) return values diff --git a/tests/builtin_transformations/infer_relations/__init__.py b/tests/builtin_transformations/common/__init__.py similarity index 100% rename from tests/builtin_transformations/infer_relations/__init__.py rename to tests/builtin_transformations/common/__init__.py diff --git a/tests/builtin_transformations/infer_relations/path/__init__.py b/tests/builtin_transformations/common/path/__init__.py similarity index 100% rename from tests/builtin_transformations/infer_relations/path/__init__.py rename to tests/builtin_transformations/common/path/__init__.py diff --git a/tests/builtin_transformations/infer_relations/path/test_config.py b/tests/builtin_transformations/common/path/test_config.py similarity index 96% rename from tests/builtin_transformations/infer_relations/path/test_config.py rename to tests/builtin_transformations/common/path/test_config.py index 803332a..4578dc3 100644 --- a/tests/builtin_transformations/infer_relations/path/test_config.py +++ b/tests/builtin_transformations/common/path/test_config.py @@ -16,12 +16,12 @@ """Test relations utils.""" +from metldata.builtin_transformations.common.path.path import ( +
RelationPath, +) from metldata.builtin_transformations.infer_relations.config import ( RelationInferenceConfig, ) -from metldata.builtin_transformations.infer_relations.path.path import ( - RelationPath, -) from metldata.builtin_transformations.infer_relations.relations import ( InferenceInstruction, ) diff --git a/tests/builtin_transformations/infer_relations/path/test_path.py b/tests/builtin_transformations/common/path/test_path.py similarity index 96% rename from tests/builtin_transformations/infer_relations/path/test_path.py rename to tests/builtin_transformations/common/path/test_path.py index 0dce021..fbfa1d5 100644 --- a/tests/builtin_transformations/infer_relations/path/test_path.py +++ b/tests/builtin_transformations/common/path/test_path.py @@ -21,10 +21,10 @@ import pytest from pydantic import BaseModel -from metldata.builtin_transformations.infer_relations.path.path import ( +from metldata.builtin_transformations.common.path.path import ( RelationPath, ) -from metldata.builtin_transformations.infer_relations.path.path_elements import ( +from metldata.builtin_transformations.common.path.path_elements import ( RelationPathElement, RelationPathElementType, ) diff --git a/tests/builtin_transformations/infer_relations/path/test_path_str.py b/tests/builtin_transformations/common/path/test_path_str.py similarity index 98% rename from tests/builtin_transformations/infer_relations/path/test_path_str.py rename to tests/builtin_transformations/common/path/test_path_str.py index 3cb06ee..5baf13c 100644 --- a/tests/builtin_transformations/infer_relations/path/test_path_str.py +++ b/tests/builtin_transformations/common/path/test_path_str.py @@ -20,11 +20,11 @@ import pytest -from metldata.builtin_transformations.infer_relations.path.path_elements import ( +from metldata.builtin_transformations.common.path.path_elements import ( RelationPathElement, RelationPathElementType, ) -from metldata.builtin_transformations.infer_relations.path.path_str import ( +from 
metldata.builtin_transformations.common.path.path_str import ( ValidationError, extract_first_element, get_element_components, diff --git a/tests/fixtures/example_transformations/count_references/multiple/config.yaml b/tests/fixtures/example_transformations/count_references/multiple/config.yaml new file mode 100644 index 0000000..628aab9 --- /dev/null +++ b/tests/fixtures/example_transformations/count_references/multiple/config.yaml @@ -0,0 +1,31 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +count_references: + - class_name: Dataset + target_content: + object_path: "file_summary" + property_name: "count" + source_relation_path: "Dataset(files)>File" + - class_name: Sample + target_content: + object_path: "file_summary" + property_name: "count" + source_relation_path: "File<(files)Sample" + - class_name: Experiment + target_content: + object_path: "sample_summary" + property_name: "count" + source_relation_path: "Experiment(samples)>Sample" diff --git a/tests/fixtures/example_transformations/count_references/multiple/input.datapack.yaml b/tests/fixtures/example_transformations/count_references/multiple/input.datapack.yaml new file mode 100644 index 0000000..e75b5de --- /dev/null +++ b/tests/fixtures/example_transformations/count_references/multiple/input.datapack.yaml @@ -0,0 +1,58 @@ +datapack: 0.3.0 +resources: + File: + file_a: + content: + filename: file_a.fastq + format: FASTQ + checksum: 1a5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92b8 + size: 12321 + file_b: + content: + filename: file_b.fastq + format: FASTQ + checksum: 2b5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92c9 + size: 12314 + file_c: + content: + filename: file_c.fastq + format: FASTQ + checksum: a9c24870071da03f78515e6197048f3a2172e90e597e9250cd01a0cb8f0986ed + size: 12123 + Dataset: + dataset_1: + content: + dac_contact: dac@example.org + file_summary: # <- + count: 0 + relations: + files: + - file_a + - file_b + - file_c + Sample: + sample_x: + content: + description: Some sample. 
+ file_summary: # <- + count: 0 + relations: + files: + - file_a + - file_b + sample_y: + content: + file_summary: # <- + count: 0 + relations: + files: + - file_c + Experiment: + experiment_i: + content: + sample_summary: # <- + count: 0 + relations: + samples: + - sample_x + - sample_y diff --git a/tests/fixtures/example_transformations/count_references/multiple/input.schemapack.yaml b/tests/fixtures/example_transformations/count_references/multiple/input.schemapack.yaml new file mode 100644 index 0000000..a800d6b --- /dev/null +++ b/tests/fixtures/example_transformations/count_references/multiple/input.schemapack.yaml @@ -0,0 +1,97 @@ +# a simple schemapack: +schemapack: 0.3.0 +classes: + File: + id: + propertyName: alias + content: ../../../example_content_schemas/File.schema.json + Dataset: + id: + propertyName: alias + content: + { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "A dataset that is a collection of files.", + "properties": + { + "dac_contact": { "type": "string" }, + "file_summary": + { + "type": "object", + "additionalProperties": false, + "properties": { "count": { "type": "integer" } }, + "required": ["count"], + }, + }, + "type": "object", + } + relations: + files: + targetClass: File + multiple: + origin: true + target: true + mandatory: + origin: false + target: true + Sample: + id: + propertyName: alias + content: + { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "A sample used to generate files in the context of an experiment.", + "properties": + { + "description": { "type": "string" }, + "file_summary": + { + "type": "object", + "additionalProperties": false, + "properties": { "count": { "type": "integer" } }, + "required": ["count"], + }, + }, + "type": "object", + } + relations: + files: + targetClass: File + multiple: + origin: false + target: true + mandatory: + origin: false + target: true + Experiment: + id: + 
propertyName: alias + content: + { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "An experiment containing one or multiple samples.", + "properties": + { + "description": { "type": "string" }, + "sample_summary": + { + "type": "object", + "additionalProperties": false, + "properties": { "count": { "type": "integer" } }, + "required": ["count"], + }, + }, + "type": "object", + } + relations: + samples: + targetClass: Sample + multiple: + origin: false + target: true + mandatory: + origin: true + target: true diff --git a/tests/fixtures/example_transformations/count_references/multiple/transformed.datapack.yaml b/tests/fixtures/example_transformations/count_references/multiple/transformed.datapack.yaml new file mode 100644 index 0000000..09aead8 --- /dev/null +++ b/tests/fixtures/example_transformations/count_references/multiple/transformed.datapack.yaml @@ -0,0 +1,58 @@ +datapack: 0.3.0 +resources: + File: + file_a: + content: + filename: file_a.fastq + format: FASTQ + checksum: 1a5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92b8 + size: 12321 + file_b: + content: + filename: file_b.fastq + format: FASTQ + checksum: 2b5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92c9 + size: 12314 + file_c: + content: + filename: file_c.fastq + format: FASTQ + checksum: a9c24870071da03f78515e6197048f3a2172e90e597e9250cd01a0cb8f0986ed + size: 12123 + Dataset: + dataset_1: + content: + dac_contact: dac@example.org + file_summary: # <- + count: 3 + relations: + files: + - file_a + - file_b + - file_c + Sample: + sample_x: + content: + description: Some sample. 
+ file_summary: # <- + count: 2 + relations: + files: + - file_a + - file_b + sample_y: + content: + file_summary: # <- + count: 1 + relations: + files: + - file_c + Experiment: + experiment_i: + content: + sample_summary: # <- + count: 2 + relations: + samples: + - sample_x + - sample_y diff --git a/tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml b/tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml new file mode 100644 index 0000000..a800d6b --- /dev/null +++ b/tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml @@ -0,0 +1,97 @@ +# a simple schemapack: +schemapack: 0.3.0 +classes: + File: + id: + propertyName: alias + content: ../../../example_content_schemas/File.schema.json + Dataset: + id: + propertyName: alias + content: + { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "A dataset that is a collection of files.", + "properties": + { + "dac_contact": { "type": "string" }, + "file_summary": + { + "type": "object", + "additionalProperties": false, + "properties": { "count": { "type": "integer" } }, + "required": ["count"], + }, + }, + "type": "object", + } + relations: + files: + targetClass: File + multiple: + origin: true + target: true + mandatory: + origin: false + target: true + Sample: + id: + propertyName: alias + content: + { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "A sample used to generate files in the context of an experiment.", + "properties": + { + "description": { "type": "string" }, + "file_summary": + { + "type": "object", + "additionalProperties": false, + "properties": { "count": { "type": "integer" } }, + "required": ["count"], + }, + }, + "type": "object", + } + relations: + files: + targetClass: File + multiple: + origin: false + target: true + mandatory: + origin: false + target: true + 
Experiment: + id: + propertyName: alias + content: + { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "An experiment containing one or multiple samples.", + "properties": + { + "description": { "type": "string" }, + "sample_summary": + { + "type": "object", + "additionalProperties": false, + "properties": { "count": { "type": "integer" } }, + "required": ["count"], + }, + }, + "type": "object", + } + relations: + samples: + targetClass: Sample + multiple: + origin: false + target: true + mandatory: + origin: true + target: true diff --git a/tests/fixtures/transformations.py b/tests/fixtures/transformations.py index c5a0eb8..51c7dc2 100644 --- a/tests/fixtures/transformations.py +++ b/tests/fixtures/transformations.py @@ -26,6 +26,9 @@ from metldata.builtin_transformations.add_content_properties import ( ADD_CONTENT_PROPERTIES_TRANSFORMATION, ) +from metldata.builtin_transformations.count_references import ( + COUNT_REFERENCES_TRANSFORMATION, +) from metldata.builtin_transformations.delete_properties import ( PROPERTY_DELETION_TRANSFORMATION, ) @@ -43,6 +46,7 @@ "infer_relations": RELATION_INFERENCE_TRANSFORMATION, "delete_properties": PROPERTY_DELETION_TRANSFORMATION, "add_content_properties": ADD_CONTENT_PROPERTIES_TRANSFORMATION, + "count_references": COUNT_REFERENCES_TRANSFORMATION, }