def check_model_assumptions(
    schema: SchemaPack,
    instructions_by_class: dict[str, list[AddContentPropertyInstruction]],
) -> None:
    """Verify that the model is compatible with the given instructions.

    Args:
        schema:
            The schemapack whose classes are inspected.
        instructions_by_class:
            A dictionary mapping class names to the instructions targeting them.

    Raises:
        ModelAssumptionError:
            If a referenced class is missing from the model or if the property to
            be added is already present in the targeted object schema.
    """
    for class_name in instructions_by_class:
        definition = schema.classes.get(class_name)
        if not definition:
            raise ModelAssumptionError(
                f"Class {class_name} does not exist in the model."
            )

        for instruction in instructions_by_class[class_name]:
            target = instruction.target_content
            try:
                parent_schema = resolve_schema_object_path(
                    json_schema=definition.content.json_schema_dict,
                    path=target.object_path,
                )
            except KeyError:
                # The object path cannot be resolved in the current schema, so
                # there is nothing to compare against; such instructions are
                # skipped here (e.g. the parent object may only be created by
                # another instruction of the same configuration).
                continue

            already_defined = parent_schema.get("properties", {})
            if target.property_name not in already_defined:
                continue

            raise ModelAssumptionError(
                f"Property {target.property_name} already exists"
                f" in class {class_name}."
            )
"""Models used to describe content properties that shall be added."""

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict

from metldata.builtin_transformations.add_content_properties.instruction import (
    AddContentPropertyInstruction,
)


class AddContentPropertiesConfig(BaseSettings):
    """A Config for a transformation that adds a new property to an object within a
    content schema
    """

    # Unknown configuration keys are rejected instead of being silently ignored.
    model_config = SettingsConfigDict(extra="forbid")

    add_content_properties: list[AddContentPropertyInstruction] = Field(
        ...,
        description=(
            "A list of instructions to add content properties to the model and data."
        ),
    )

    def instructions_by_class(
        self,
    ) -> dict[str, list[AddContentPropertyInstruction]]:
        """Group the configured instructions by the class they target.

        Returns:
            A dictionary mapping each class name to the list of instructions for
            that class. The instructions keep the order in which they appear in
            `add_content_properties`.
        """
        instructions_by_class: dict[str, list[AddContentPropertyInstruction]] = {}
        for instruction in self.add_content_properties:
            instructions_by_class.setdefault(instruction.class_name, []).append(
                instruction
            )
        return instructions_by_class
def add_properties(
    *,
    data: DataPack,
    instructions_by_class: dict[str, list[AddContentPropertyInstruction]],
) -> DataPack:
    """Given a data pack and a dictionary of instructions by class, add the specified
    content properties to the data.

    The input datapack is not modified; all changes are applied to a deep copy.

    Args:
        data:
            The datapack to add the content properties to.
        instructions_by_class:
            A dictionary mapping class names to lists of instructions.

    Raises:
        EvitableTransformationError:
            If a class has no resources in the data, if the object path of an
            instruction cannot be resolved in a resource's content, if the
            resolved target is not a dict, or if the property to be added is
            already present in the target.

    Returns:
        The data with the specified content properties being added.
    """
    modified_data = data.model_copy(deep=True)

    for class_name, instructions in instructions_by_class.items():
        resources = modified_data.resources.get(class_name)

        # NOTE(review): this also rejects a class that is present but has no
        # resources at all - confirm that this is intended.
        if not resources:
            raise EvitableTransformationError()

        for resource in resources.values():
            for instruction in instructions:
                property_name = instruction.target_content.property_name

                # An unresolvable path is reported with the same error type that
                # the model transformation uses for this situation, instead of
                # leaking a bare KeyError to the caller.
                try:
                    target_object = resolve_data_object_path(
                        data=resource.content,
                        path=instruction.target_content.object_path,
                    )
                except KeyError as error:
                    raise EvitableTransformationError() from error

                if (
                    not isinstance(target_object, dict)
                    or property_name in target_object
                ):
                    raise EvitableTransformationError()

                # Copy the value so that resources never share one mutable object.
                target_object[property_name] = deepcopy(instruction.value)

    return modified_data
# Schema used for a newly added property when an instruction does not provide one:
# an object that does not permit any additional properties.
DEFAULT_CONTENT_SCHEMA: Final[dict[str, Any]] = {
    "type": "object",
    "additionalProperties": False,
}


class NewContentSchemaPath(BaseSettings):
    """A model describing the path of an object property within the content schema that
    is yet to be added. The model comprises a path to an already existing object within
    the content schema and the name of a property to be added to that object's schema
    """

    # Dot-separated location of the (existing) parent object.
    object_path: str = Field(
        ...,
        description=(
            "The path to the content object to which a property shall be added."
            " The path must be specified in dot notation,"
            " equivalently to JavaScript property accessors."
        ),
        examples=["some_property.another_nested_property"],
    )

    # Name under which the new property is added to the parent object.
    property_name: str = Field(
        ...,
        description="The name of the property to be added.",
    )


class AddContentPropertyInstruction(BaseSettings):
    """A model describing an instruction to add a new content property to a class in a
    schemapack, including an associated default value in corresponding data.
    """

    class_name: str = Field(
        ...,
        description="The name of the class to modify.",
    )

    # Where the new property goes (parent object path + property name).
    target_content: NewContentSchemaPath

    required: bool = Field(
        default=True,
        description=(
            "Indicates whether the newly added property shall be added to the"
            " 'required' list of the corresponding object."
            " Defaults to 'True'."
        ),
    )

    content_schema: dict[str, Any] = Field(
        default=DEFAULT_CONTENT_SCHEMA,
        description="The JSON schema of the newly added property.",
    )

    value: Any = Field(
        default={},
        description="A value to assign to the new property in the data.",
    )
class AddContentPropertiesTransformer(DataTransformer[AddContentPropertiesConfig]):
    """A transformer that adds content properties to data."""

    def transform(self, data: DataPack) -> DataPack:
        """Transforms data.

        Args:
            data: The data as DataPack to be transformed.

        Returns:
            The datapack produced by `add_properties` for the configured
            instructions.
        """
        return add_properties(
            data=data, instructions_by_class=self._config.instructions_by_class()
        )


def check_model_assumptions_wrapper(
    model: SchemaPack,
    config: AddContentPropertiesConfig,
) -> None:
    """Check the assumptions of the model.

    Args:
        model: The schemapack to check.
        config: The transformation config providing the instructions.

    Raises:
        ModelAssumptionError:
            if the model does not fulfill the assumptions.
    """
    check_model_assumptions(
        schema=model, instructions_by_class=config.instructions_by_class()
    )


def transform_model(
    model: SchemaPack, config: AddContentPropertiesConfig
) -> SchemaPack:
    """Transform the data model.

    Args:
        model: The schemapack to transform.
        config: The transformation config providing the instructions.

    Returns:
        The schemapack returned by `add_content_properties`.
    """
    return add_content_properties(
        model=model, instructions_by_class=config.instructions_by_class()
    )


# Bundles config class, assumption check, model transformation and data
# transformer into the definition of the 'add content properties' transformation.
ADD_CONTENT_PROPERTIES_TRANSFORMATION = TransformationDefinition[
    AddContentPropertiesConfig
](
    config_cls=AddContentPropertiesConfig,
    check_model_assumptions=check_model_assumptions_wrapper,
    transform_model=transform_model,
    data_transformer_factory=AddContentPropertiesTransformer,
)
def add_content_properties(
    *,
    model: SchemaPack,
    instructions_by_class: dict[str, list[AddContentPropertyInstruction]],
) -> SchemaPack:
    """Add new content properties to the provided model.

    Args:
        model:
            The schemapack to which the content properties shall be added.
        instructions_by_class:
            A dictionary mapping class names to lists of instructions.

    Raises:
        EvitableTransformationError:
            If a class does not exist in the model, if the object path of an
            instruction cannot be resolved in the class's content schema, or if
            the property to be added already exists in the targeted object.

    Returns:
        A new schemapack in which the content schemas of the affected classes
        contain the added properties.
    """
    updated_class_defs: dict[str, ClassDefinition] = {}
    for class_name, cls_instructions in instructions_by_class.items():
        class_def = model.classes.get(class_name)

        if not class_def:
            raise EvitableTransformationError()

        # Defensive copy: the schema is modified below, and the dict obtained from
        # the class definition may be shared with the input model.
        content_schema = deepcopy(class_def.content.json_schema_dict)

        for cls_instruction in cls_instructions:
            property_name = cls_instruction.target_content.property_name

            try:
                target_object = resolve_schema_object_path(
                    content_schema, cls_instruction.target_content.object_path
                )
            except KeyError as e:
                raise EvitableTransformationError() from e

            # The property must be absent from the object it is added to, i.e.
            # the resolved target object and not the root of the content schema.
            if property_name in target_object.get("properties", {}):
                raise EvitableTransformationError()

            target_object.setdefault("properties", {})[property_name] = deepcopy(
                cls_instruction.content_schema
            )

            if cls_instruction.required:
                target_object.setdefault("required", []).append(property_name)

        updated_class_defs[class_name] = class_def.model_validate(
            {**class_def.model_dump(), "content": content_schema}
        )

    model_dict = model.model_dump()
    model_dict["classes"].update(updated_class_defs)
    return SchemaPack.model_validate(model_dict)
def resolve_schema_object_path(json_schema: Mapping[str, Any], path: str) -> Any:
    """Given a JSON schema describing an object, resolve the dot-separated path to a
    property. Return the property schema.

    Each path segment is looked up in the "properties" of the schema reached so
    far. An empty path resolves to the provided schema itself.

    Args:
        json_schema:
            The JSON schema of the object.
        path:
            The dot-separated path to the property.

    Raises:
        KeyError:
            If the path does not exist in the schema. This includes the case that
            an intermediate schema (or its "properties") is not a mapping.

    Returns: The schema of the property at the given path.
    """
    current: Any = json_schema
    if path:
        for key in path.split("."):
            # A non-mapping schema (e.g. a boolean schema) cannot contain the
            # property; report it as a missing path rather than a TypeError.
            if not isinstance(current, Mapping):
                raise KeyError(key)
            properties = current["properties"]
            if not isinstance(properties, Mapping):
                raise KeyError(key)
            current = properties[key]
    return current


def resolve_data_object_path(data: Mapping, path: str) -> Any:
    """Given a mapping, resolve the dot-separated path to a property. Return the
    property value.

    An empty path resolves to the provided data itself.

    Args:
        data:
            The JSON object.
        path:
            The dot-separated path to the property.

    Raises:
        KeyError:
            If the path does not exist in the data. This includes the case that
            an intermediate value is not a mapping.

    Returns: The value of the property at the given path.
    """
    current: Any = data
    if path:
        for key in path.split("."):
            # Scalars and lists cannot be descended into by name; report this as
            # a missing path rather than a TypeError.
            if not isinstance(current, Mapping):
                raise KeyError(key)
            current = current[key]
    return current
+ format: FASTQ + checksum: a9c24870071da03f78515e6197048f3a2172e90e597e9250cd01a0cb8f0986ed + size: 12123 + additional_information: # <- + compression: false + content_description: "" # <- + Dataset: + dataset_1: + content: + dac_contact: dac@example.org + relations: + files: + - file_a + - file_b + - file_c + Sample: + sample_x: + content: + description: Some sample. + relations: + files: + - file_a + - file_b + sample_y: + content: {} + relations: + files: + - file_c + Experiment: + experiment_i: + content: {} + relations: + samples: + - sample_x + - sample_y diff --git a/tests/fixtures/example_transformations/add_content_properties/multiple/transformed.schemapack.yaml b/tests/fixtures/example_transformations/add_content_properties/multiple/transformed.schemapack.yaml new file mode 100644 index 0000000..2a9ada0 --- /dev/null +++ b/tests/fixtures/example_transformations/add_content_properties/multiple/transformed.schemapack.yaml @@ -0,0 +1,86 @@ +schemapack: 0.3.0 +classes: + File: + id: + propertyName: alias + content: { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "A file is an object that contains information generated from a process, either an Experiment or an Analysis.", + "properties": { + "checksum": { + "type": "string" + }, + "filename": { + "type": "string" + }, + "format": { + "type": "string" + }, + "size": { + "type": "integer" + }, + "additional_information": { # <- + "type": "object", + "additionalProperties": false, + "properties": { + "compression": { + "type": "boolean" + } + }, + "required": [ + "compression" + ], + }, + "content_description": { + "type" : "string" + } + }, + "required": [ + "filename", + "format", + "checksum", + "size", + "content_description", + ], + "type": "object" + } + Dataset: + id: + propertyName: alias + content: ../../../example_content_schemas/Dataset.schema.json + relations: + files: + targetClass: File + multiple: + origin: true + target: true + 
mandatory: + origin: false + target: true + Sample: + id: + propertyName: alias + content: ../../../example_content_schemas/Sample.schema.json + relations: + files: + targetClass: File + multiple: + origin: false + target: true + mandatory: + origin: false + target: true + Experiment: + id: + propertyName: alias + content: ../../../example_content_schemas/Experiment.schema.json + relations: + samples: + targetClass: Sample + multiple: + origin: false + target: true + mandatory: + origin: true + target: true diff --git a/tests/fixtures/transformations.py b/tests/fixtures/transformations.py index ae56376..35e6e37 100644 --- a/tests/fixtures/transformations.py +++ b/tests/fixtures/transformations.py @@ -23,6 +23,9 @@ from schemapack.spec.datapack import DataPack from schemapack.spec.schemapack import SchemaPack +from metldata.builtin_transformations.add_content_properties.main import ( + ADD_CONTENT_PROPERTIES_TRANSFORMATION, +) from metldata.builtin_transformations.delete_properties.main import ( PROPERTY_DELETION_TRANSFORMATION, ) @@ -39,6 +42,7 @@ TRANSFORMATIONS_BY_NAME: dict[str, TransformationDefinition] = { "infer_relations": RELATION_INFERENCE_TRANSFORMATION, "delete_properties": PROPERTY_DELETION_TRANSFORMATION, + "add_content_properties": ADD_CONTENT_PROPERTIES_TRANSFORMATION, }