Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement "Add content property" transformation (GSI-729) #68

Merged
merged 10 commits into from
May 15, 2024
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
# for the German Human Genome-Phenome Archive (GHGA)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Check model assumptions for the add content properties transformation."""

from schemapack.spec.schemapack import SchemaPack

from metldata.builtin_transformations.add_content_properties.instruction import (
AddContentPropertyInstruction,
)
from metldata.builtin_transformations.add_content_properties.path import (
resolve_schema_object_path,
)
from metldata.transform.base import ModelAssumptionError


def check_model_assumptions(
    schema: SchemaPack,
    instructions_by_class: dict[str, list[AddContentPropertyInstruction]],
) -> None:
    """Verify that the model permits all requested content property additions.

    Args:
        schema:
            The schemapack whose class definitions are inspected.
        instructions_by_class:
            A dictionary mapping class names to the instructions targeting them.

    Raises:
        ModelAssumptionError:
            If a referenced class is not part of the model, or if the property to
            be added is already listed in the properties of the target object.
    """
    for class_name, instructions in instructions_by_class.items():
        class_def = schema.classes.get(class_name)

        # Every class referenced by an instruction must be defined in the model.
        if not class_def:
            raise ModelAssumptionError(
                f"Class {class_name} does not exist in the model."
            )

        for instruction in instructions:
            target = instruction.target_content
            try:
                target_schema = resolve_schema_object_path(
                    json_schema=class_def.content.json_schema_dict,
                    path=target.object_path,
                )
            except KeyError:
                # The path lookup failed (presumably because the object path is
                # not present in this schema - TODO confirm), so there is nothing
                # to compare the new property against.
                continue

            # The property to be added must not clash with an existing one.
            known_properties = target_schema.get("properties", {})
            if target.property_name in known_properties:
                raise ModelAssumptionError(
                    f"Property {target.property_name} already exists"
                    + f" in class {class_name}."
                )
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
# for the German Human Genome-Phenome Archive (GHGA)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Models used to describe content properties that shall be added."""

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict

from metldata.builtin_transformations.add_content_properties.instruction import (
AddContentPropertyInstruction,
)


class AddContentPropertiesConfig(BaseSettings):
    """Configuration for a transformation that adds new properties to objects
    within content schemas.
    """

    # Unknown configuration keys are rejected rather than ignored.
    model_config = SettingsConfigDict(extra="forbid")

    add_content_properties: list[AddContentPropertyInstruction] = Field(
        ...,
        description=(
            "A list of instructions to add content properties to the model and data."
        ),
    )

    def instructions_by_class(
        self,
    ) -> dict[str, list[AddContentPropertyInstruction]]:
        """Group the configured instructions by the name of the class they target.

        Returns:
            A dictionary mapping each class name to the list of its instructions.
            Instructions keep the order in which they appear in
            `add_content_properties`.
        """
        grouped: dict[str, list[AddContentPropertyInstruction]] = {}
        for instruction in self.add_content_properties:
            if instruction.class_name not in grouped:
                grouped[instruction.class_name] = []
            grouped[instruction.class_name].append(instruction)
        return grouped
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
# for the German Human Genome-Phenome Archive (GHGA)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data transformation logic for the add content properties transformation."""

from copy import deepcopy

from schemapack.spec.datapack import DataPack

from metldata.builtin_transformations.add_content_properties.instruction import (
AddContentPropertyInstruction,
)
from metldata.builtin_transformations.add_content_properties.path import (
resolve_data_object_path,
)
from metldata.transform.base import EvitableTransformationError


def add_properties(
    *,
    data: DataPack,
    instructions_by_class: dict[str, list[AddContentPropertyInstruction]],
) -> DataPack:
    """Given a data pack and a dictionary of instructions by class, add the specified
    content properties to the data.

    Args:
        data:
            The datapack to add the content properties to. It is not modified;
            all changes are applied to a deep copy.
        instructions_by_class:
            A dictionary mapping class names to lists of instructions.

    Returns:
        A deep copy of the data with the specified content properties added.

    Raises:
        EvitableTransformationError:
            If a class has no resources in the data, if the target of an
            instruction is not a dictionary, or if the property to be added is
            already present in the target.
    """
    modified_data = data.model_copy(deep=True)

    for class_name, instructions in instructions_by_class.items():
        resources = modified_data.resources.get(class_name)

        if not resources:
            raise EvitableTransformationError()

        for resource in resources.values():
            # The content of a resource is the same for all of its instructions.
            content = resource.content

            for instruction in instructions:
                property_name = instruction.target_content.property_name
                target_object = resolve_data_object_path(
                    data=content,
                    path=instruction.target_content.object_path,
                )

                if not isinstance(target_object, dict) or property_name in target_object:
                    raise EvitableTransformationError()

                # Copy the value so that resources never share one mutable object.
                target_object[property_name] = deepcopy(instruction.value)

    return modified_data
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
# for the German Human Genome-Phenome Archive (GHGA)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Models for instructions used in the 'add content properties' transformation."""

from typing import Any, Final

from pydantic import Field
from pydantic_settings import BaseSettings

# JSON schema used as the default for `AddContentPropertyInstruction.content_schema`:
# an object type with "additionalProperties" set to False.
DEFAULT_CONTENT_SCHEMA: Final[dict[str, Any]] = {
    "type": "object",
    "additionalProperties": False,
}


class NewContentSchemaPath(BaseSettings):
    """Location of a property that is yet to be added to a content schema.

    The location consists of the path to an already existing object within the
    content schema and the name of the property to be added to that object's
    schema.
    """

    object_path: str = Field(
        ...,
        description=(
            "The path to the content object to which a property shall be added. The"
            " path must be specified in dot notation, equivalently to JavaScript"
            " property accessors."
        ),
        examples=["some_property.another_nested_property"],
    )

    property_name: str = Field(
        ...,
        description="The name of the property to be added.",
    )


class AddContentPropertyInstruction(BaseSettings):
    """An instruction to add a new content property to a class of a schemapack.

    Besides the target location and the schema of the new property, the
    instruction carries the value that is assigned to the property in the
    corresponding data.
    """

    class_name: str = Field(..., description="The name of the class to modify.")

    # Where the new property is placed within the content schema of the class.
    target_content: NewContentSchemaPath

    required: bool = Field(
        default=True,
        description=(
            "Indicates whether the newly added property shall be added to the"
            " 'required' list of the corresponding object. Defaults to 'True'."
        ),
    )

    content_schema: dict[str, Any] = Field(
        default=DEFAULT_CONTENT_SCHEMA,
        description="The JSON schema of the newly added property.",
    )

    value: Any = Field(
        default={},
        description="A value to assign to the new property in the data.",
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
# for the German Human Genome-Phenome Archive (GHGA)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A transformation to add content properties."""

from schemapack.spec.datapack import DataPack
from schemapack.spec.schemapack import SchemaPack

from metldata.builtin_transformations.add_content_properties.assumptions import (
check_model_assumptions,
)
from metldata.builtin_transformations.add_content_properties.config import (
AddContentPropertiesConfig,
)
from metldata.builtin_transformations.add_content_properties.data_transform import (
add_properties,
)
from metldata.builtin_transformations.add_content_properties.model_transform import (
add_content_properties,
)
from metldata.transform.base import (
DataTransformer,
TransformationDefinition,
)


class AddContentPropertiesTransformer(DataTransformer[AddContentPropertiesConfig]):
    """A transformer that adds content properties to data."""

    def transform(self, data: DataPack) -> DataPack:
        """Transforms data.

        Args:
            data: The data as DataPack to be transformed.

        Returns:
            The DataPack returned by `add_properties` for the instructions of the
            transformer's config, grouped by class.
        """
        return add_properties(
            data=data, instructions_by_class=self._config.instructions_by_class()
        )


def check_model_assumptions_wrapper(
    model: SchemaPack,
    config: AddContentPropertiesConfig,
) -> None:
    """Check that the model fulfills the assumptions of this transformation.

    Args:
        model: The schemapack to check.
        config: The config providing the instructions to check the model against.

    Raises:
        ModelAssumptionError:
            if the model does not fulfill the assumptions.
    """
    instructions = config.instructions_by_class()
    check_model_assumptions(schema=model, instructions_by_class=instructions)


def transform_model(
    model: SchemaPack, config: AddContentPropertiesConfig
) -> SchemaPack:
    """Transform the data model.

    Args:
        model: The schemapack to transform.
        config: The config providing the instructions to apply to the model.

    Returns:
        The schemapack returned by `add_content_properties` for the instructions
        of the config, grouped by class.
    """
    instructions = config.instructions_by_class()
    return add_content_properties(model=model, instructions_by_class=instructions)


# Definition of the "add content properties" transformation. It bundles the config
# class, the model assumption check, the model transformation, and the data
# transformer class (passed as the data transformer factory).
ADD_CONTENT_PROPERTIES_TRANSFORMATION = TransformationDefinition[
    AddContentPropertiesConfig
](
    config_cls=AddContentPropertiesConfig,
    check_model_assumptions=check_model_assumptions_wrapper,
    transform_model=transform_model,
    data_transformer_factory=AddContentPropertiesTransformer,
)
Loading