diff --git a/src/metldata/schemapack_/builtin_transformations/infer_relations/__init__.py b/src/metldata/schemapack_/builtin_transformations/infer_relations/__init__.py new file mode 100644 index 00000000..b4c1771d --- /dev/null +++ b/src/metldata/schemapack_/builtin_transformations/infer_relations/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""A transformation to infer references based on existing ones in the metadata model.""" + + +# shortcuts: +from metldata.schemapack_.builtin_transformations.infer_relations.main import ( # noqa: F401 + RELATION_INFERENCE_TRANSFORMATION, +) diff --git a/src/metldata/schemapack_/builtin_transformations/infer_relations/assumptions.py b/src/metldata/schemapack_/builtin_transformations/infer_relations/assumptions.py new file mode 100644 index 00000000..5cf507cf --- /dev/null +++ b/src/metldata/schemapack_/builtin_transformations/infer_relations/assumptions.py @@ -0,0 +1,99 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Check model assumptions.""" + +from schemapack.spec.schemapack import SchemaPack + +from metldata.schemapack_.builtin_transformations.infer_relations.path.path import ( + RelationPath, +) +from metldata.schemapack_.builtin_transformations.infer_relations.path.path_elements import ( + RelationPathElementType, +) +from metldata.schemapack_.builtin_transformations.infer_relations.relations import ( + InferenceInstruction, +) +from metldata.schemapack_.transform.base import ModelAssumptionError + + +def assert_path_classes_and_relations(model: SchemaPack, path: RelationPath): + """Make sure that all classes and relations defined in the provided path exist in + the provided model. + + Raises: + ModelAssumptionError: + if the model does not fulfill the assumptions. + """ + for path_element in path.elements: + if path_element.source not in model.classes: + raise ModelAssumptionError( + f"Class {path_element.source} not found in model." + ) + + if path_element.target not in model.classes: + raise ModelAssumptionError( + f"Class {path_element.target} not found in model." + ) + + if path_element.type_ == RelationPathElementType.ACTIVE: + if ( + path_element.property + not in model.classes[path_element.source].relations + ): + raise ModelAssumptionError( + f"Relation property {path_element.property} not found in class" + f" {path_element.source}." 
+ ) + + return + + if path_element.property not in model.classes[path_element.target].relations: + raise ModelAssumptionError( + f"Relation property {path_element.property} not found in class" + f" {path_element.target}." + ) + + +def assert_new_property_not_exists( + model: SchemaPack, instruction: InferenceInstruction +) -> None: + """Make sure that new property specified in the instruction does not yet exist in + the model. The existence of the source class is not checked. + """ + source_class = model.classes.get(instruction.source) + if source_class and instruction.new_property in source_class.relations: + raise ModelAssumptionError( + f"Property '{instruction.new_property}' of class '{instruction.source}'" + + ", intended to store an inferred relation, does already exist." + ) + + +def assert_instructions_match_model( + *, + model: SchemaPack, + instructions: list[InferenceInstruction], +) -> None: + """Make sure that the provided inference instructions can be applied to the provided + model. + + Raises: + ModelAssumptionError: + if the model does not fulfill the assumptions. + """ + for instruction in instructions: + assert_path_classes_and_relations(model=model, path=instruction.path) + assert_new_property_not_exists(model=model, instruction=instruction) diff --git a/src/metldata/schemapack_/builtin_transformations/infer_relations/config.py b/src/metldata/schemapack_/builtin_transformations/infer_relations/config.py new file mode 100644 index 00000000..09fa4bd6 --- /dev/null +++ b/src/metldata/schemapack_/builtin_transformations/infer_relations/config.py @@ -0,0 +1,79 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Models used to describe all inferred relations based on existing relations.""" + +from functools import cached_property + +from pydantic import Field +from pydantic_settings import BaseSettings, SettingsConfigDict + +from metldata.schemapack_.builtin_transformations.infer_relations.relations import ( + InferenceInstruction, + RelationDetails, +) + + +class RelationInferenceConfig(BaseSettings): + """Config containing instructions to infer relations for all classes of a model.""" + + model_config = SettingsConfigDict(extra="forbid") + + inferred_relations: dict[str, dict[str, RelationDetails]] = Field( + ..., + description=( + "A nested dictionary describing instructions to infer relations based" + + " on existing relations. On the first level keys refer to classes to" + + " which the inferred relations should be added. On the second level, keys" + + " refer to the names of the new property of the host class that hold the" + + " inferred relation. The values refer to the actual relation details." 
+ ), + examples=[ + { + "ClassA": { + "class_d": { + "path": "ClassA(class_b)>ClassB(class_d)>ClassD", + "cardinality": "many_to_many", + }, + "class_c": { + "path": "ClassA(class_b)>ClassB<(class_c)ClassC", + "cardinality": "many_to_one", + }, + }, + "ClassB": { + "class_c": { + "path": "ClassB<(class_c)ClassC", + "cardinality": "many_to_many", + } + }, + } + ], + ) + + @cached_property + def inference_instructions(self) -> list[InferenceInstruction]: + """A list of inferred relations.""" + return [ + InferenceInstruction( + source=source, + target=relation_details.path.target, + path=relation_details.path, + new_property=property_name, + allow_multiple=relation_details.allow_multiple, + ) + for source, slot_description in self.inferred_relations.items() + for property_name, relation_details in slot_description.items() + ] diff --git a/src/metldata/schemapack_/builtin_transformations/infer_relations/data_transform.py b/src/metldata/schemapack_/builtin_transformations/infer_relations/data_transform.py new file mode 100644 index 00000000..48070dc0 --- /dev/null +++ b/src/metldata/schemapack_/builtin_transformations/infer_relations/data_transform.py @@ -0,0 +1,242 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Logic for transforming data. 
+ +Here is a brief summary of the principle steps of transformation: +- iterate over inferred relations list from the config, per inferred relation: + - extract the resources of the host class + - iterate over host resources, per host resource: + - iterate over path elements + - iterate over source resources (for the first path element the host + resource serves as the single source), per source resource: + - resolve the path element for the source resource: + - if active reference: + - lookup target resources specified in the relation property + defined in the path element + - if passive reference: + - iterate over resources of the target class, per potential + target resource: + - if the resource references the source resource via the + relation property defined in the path element, add it to + the target resources of the path element in context of + the given source resource + - collect the target resources for all source resources of the given path + element + - use the target resources of this iteration as the source resources for the + next one + - the target resources of the last path element are the target resources + of the entire inferred relation for the given host resource + - add the target resources to the host resource as a new relation property + as defined in the inferred relation +""" + +from schemapack.spec.datapack import DataPack, Resource, ResourceId + +from metldata.schemapack_.builtin_transformations.infer_relations.path.path import ( + RelationPath, +) +from metldata.schemapack_.builtin_transformations.infer_relations.path.path_elements import ( + RelationPathElement, + RelationPathElementType, +) +from metldata.schemapack_.builtin_transformations.infer_relations.relations import ( + InferenceInstruction, +) +from metldata.schemapack_.transform.base import EvitableTransformationError + + +def resolve_active_path_element( + *, + data: DataPack, + source_resource_id: ResourceId, + path_element: RelationPathElement, +) -> set[ResourceId]: + 
"""Resolve the given relation inference path element of active type for the given + source resource. + + Args: + data: + The data pack to look up resources in. + source_resource_id: + The id of the resource to which the path element is applied. + path_element: + The relation inference path element to resolve. It is assumed to be of + active type. + + Returns: + A set of resource IDs that are targeted by the path element in context of the + given source resource. + """ + if path_element.type_ != RelationPathElementType.ACTIVE: + raise ValueError( + "Expected path element of type 'ACTIVE', but got a 'PASSIVE' one." + ) + + source_resource = data.resources.get(path_element.source, {}).get( + source_resource_id + ) + + if not source_resource: + raise EvitableTransformationError() + + target_resource_ids = source_resource.relations.get(path_element.property, []) + return ( + set(target_resource_ids) + if isinstance(target_resource_ids, list) + else {target_resource_ids} + ) + + +def resolve_passive_path_element( + *, + data: DataPack, + source_resource_id: ResourceId, + path_element: RelationPathElement, +) -> set[ResourceId]: + """Resolve the given relation inference path element of passive type for the given + source resource. + + Args: + data: + The data pack to look up resources in. + source_resource_id: + The id of the resource to which the path element is applied. + path_element: + The relation inference path element to resolve. It is assumed to be of + passive type. + + Returns: + A set of resource IDs that are targeted by the path element in context of the + given source resource. + """ + if path_element.type_ != RelationPathElementType.PASSIVE: + raise ValueError( + "Expected path element of type 'PASSIVE', but got an 'ACTIVE' one." 
+ ) + + candidate_resources = data.resources.get(path_element.target, {}) + target_resource_ids = set() + + for candidate_resource_id, candidate_resource in candidate_resources.items(): + relation = candidate_resource.relations.get(path_element.property, []) + + if ( + isinstance(relation, list) and source_resource_id in relation + ) or source_resource_id == relation: + target_resource_ids.add(candidate_resource_id) + + return target_resource_ids + + +def resolve_path_element( + *, + data: DataPack, + source_resource_id: ResourceId, + path_element: RelationPathElement, +) -> set[ResourceId]: + """Resolve the given relation inference path element for the given source resource. + + Args: + data: The data pack to look up resources in. + source_resource_id: The id of the resource to which the path element is applied. + path_element: The relation inference path element to resolve. + + Returns: + A set of resource IDs that are targeted by the path element in context of the + given source resource. + """ + + resolve = ( + resolve_active_path_element + if path_element.type_ == RelationPathElementType.ACTIVE + else resolve_passive_path_element + ) + return resolve( + data=data, + source_resource_id=source_resource_id, + path_element=path_element, + ) + + +def resolve_path( + *, data: DataPack, source_resource_id: ResourceId, path: RelationPath +) -> set[ResourceId]: + """Resolve the given relation inference path for the given source resource. + + Args: + data: The data pack to look up resources in. + source_resource_id: The id of the resource to which the path is applied. + path: The relation inference path to resolve. + + Returns: + A set of resource IDs that are targeted by the path in context of the given + source resource. 
+ """ + + resource_ids: set[ResourceId] = {source_resource_id} + for path_element in path.elements: + # the target resources of the current iteration are the source resources of the + # next iteration: + resource_ids = { + target_resource_id + for source_resource_id in resource_ids + for target_resource_id in resolve_path_element( + data=data, + source_resource_id=source_resource_id, + path_element=path_element, + ) + } + + return resource_ids + + +def add_inferred_relations( + *, data: DataPack, instructions: list[InferenceInstruction] +) -> DataPack: + """Adds inferred relations to the given data as per the given instructions.""" + for instruction in instructions: + host_resources = data.resources.get(instruction.source, {}) + updated_host_resources: dict[ResourceId, Resource] = {} + + for host_resource_id, host_resource in host_resources.items(): + target_resource_ids = resolve_path( + data=data, + source_resource_id=host_resource_id, + path=instruction.path, + ) + # transform into list (as references are stored as such) and make order + # deterministic: + target_list = sorted(target_resource_ids) + updated_host_resources[host_resource_id] = host_resource.model_copy( + update={ + "relations": { + **host_resource.relations, + instruction.new_property: target_list, + } + } + ) + + data = data.model_copy( + update={ + "resources": { + **data.resources, + instruction.source: updated_host_resources, + } + } + ) + + return data diff --git a/src/metldata/schemapack_/builtin_transformations/infer_relations/main.py b/src/metldata/schemapack_/builtin_transformations/infer_relations/main.py new file mode 100644 index 00000000..8f6199db --- /dev/null +++ b/src/metldata/schemapack_/builtin_transformations/infer_relations/main.py @@ -0,0 +1,88 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this 
file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""A transformation to infer references based on existing ones in the data model.""" + +from schemapack.spec.datapack import DataPack +from schemapack.spec.schemapack import SchemaPack + +from metldata.schemapack_.builtin_transformations.infer_relations import ( + data_transform, + model_transform, +) +from metldata.schemapack_.builtin_transformations.infer_relations.assumptions import ( + assert_instructions_match_model, +) +from metldata.schemapack_.builtin_transformations.infer_relations.config import ( + RelationInferenceConfig, +) +from metldata.schemapack_.transform.base import ( + DataTransformer, + TransformationDefinition, +) + + +class RelationInferenceDataTransformer(DataTransformer[RelationInferenceConfig]): + """A transformer that infers relation in data based on existing ones.""" + + def transform(self, data: DataPack) -> DataPack: + """Transforms data. + + Args: + data: The data as DataPack to be transformed. + + Raises: + DataTransformationError: + if the transformation fails. + """ + return data_transform.add_inferred_relations( + data=data, instructions=self._config.inference_instructions + ) + + +def check_model_assumptions( + model: SchemaPack, + config: RelationInferenceConfig, +) -> None: + """Check the assumptions of the model. + + Raises: + ModelAssumptionError: + if the model does not fulfill the assumptions. 
+ """ + assert_instructions_match_model( + model=model, instructions=config.inference_instructions + ) + + +def transform_model(model: SchemaPack, config: RelationInferenceConfig) -> SchemaPack: + """Transform the data model. + + Raises: + DataModelTransformationError: + if the transformation fails. + """ + return model_transform.add_inferred_relations( + model=model, instructions=config.inference_instructions + ) + + +RELATION_INFERENCE_TRANSFORMATION = TransformationDefinition[RelationInferenceConfig]( + config_cls=RelationInferenceConfig, + check_model_assumptions=check_model_assumptions, + transform_model=transform_model, + data_transformer_factory=RelationInferenceDataTransformer, +) diff --git a/src/metldata/schemapack_/builtin_transformations/infer_relations/model_transform.py b/src/metldata/schemapack_/builtin_transformations/infer_relations/model_transform.py new file mode 100644 index 00000000..00897cf0 --- /dev/null +++ b/src/metldata/schemapack_/builtin_transformations/infer_relations/model_transform.py @@ -0,0 +1,78 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""Logic for transforming metadata models.""" + +from schemapack.spec.schemapack import ( + Cardinality, + ClassDefinition, + Relation, + SchemaPack, +) +from schemapack.utils import FrozenDict + +from metldata.schemapack_.builtin_transformations.infer_relations.relations import ( + InferenceInstruction, +) +from metldata.schemapack_.transform.base import EvitableTransformationError + + +def add_inferred_relations( + *, model: SchemaPack, instructions: list[InferenceInstruction] +) -> SchemaPack: + """Add inferred relations to a model. + + Args: + model: The model based on SchemaPack to add the inferred relations to. + instructions: The instructions for inferring relations. + + Returns: + The model with the inferred relations added. + """ + + updated_class_defs: dict[str, ClassDefinition] = {} + for instruction in instructions: + class_def = ( + updated_class_defs[instruction.source] + if instruction.source in updated_class_defs + else model.classes.get(instruction.source) + ) + + if class_def is None: + raise EvitableTransformationError() + + new_relation = Relation.model_validate( + { + "to": instruction.target, + "cardinality": Cardinality.MANY_TO_MANY + if instruction.allow_multiple + else Cardinality.ONE_TO_MANY, + } + ) + updated_class_defs[instruction.source] = ClassDefinition.model_validate( + { + "id": class_def.id, + "content": class_def.content, + "relations": { + **class_def.relations, + instruction.new_property: new_relation, + }, + } + ) + + return model.model_copy( + update={"classes": FrozenDict({**model.classes, **updated_class_defs})} + ) diff --git a/src/metldata/schemapack_/builtin_transformations/infer_relations/path/__init__.py b/src/metldata/schemapack_/builtin_transformations/infer_relations/path/__init__.py new file mode 100644 index 00000000..c813d562 --- /dev/null +++ b/src/metldata/schemapack_/builtin_transformations/infer_relations/path/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and 
Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Utils for dealing with references between classes of the metadata model.""" diff --git a/src/metldata/schemapack_/builtin_transformations/infer_relations/path/path.py b/src/metldata/schemapack_/builtin_transformations/infer_relations/path/path.py new file mode 100644 index 00000000..4a7227eb --- /dev/null +++ b/src/metldata/schemapack_/builtin_transformations/infer_relations/path/path.py @@ -0,0 +1,119 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""Logic for handling relation paths.""" + +from pydantic import GetJsonSchemaHandler, ValidationInfo + +from metldata.schemapack_.builtin_transformations.infer_relations.path.path_str import ( + PATH_PATTERN, + ValidationError, + clean_path_str, + path_str_to_object_elements, +) + + +class RelationPath: + """A model describing the path of a relation between classes of a metadata model. + + The relation path has two available representation. A string-based ("path_str" + attribute) and an element-based ("elements" attribute) one. + + In the string-based representation ("path_str" attribute), the first and the last + word correspond to the name of the source and target class respectively. ">" and "<" + means active (left class refers the right one) and passive (the left + class is referenced by the right one). Parentheses attached to these angles thereby + indicate the property name of the referencing class. E.g. "class_a(class_b)>class_b" + means that the source class "class_a" has a property "class_b" that references the + target class "class_b". Or "class_a<(class_a)class_b" means that the source + class "class_a" is referenced by the target class "class_b" via its properties + "class_a". Reference paths can also involve additional classes. E.g. a string of + "class_a<(class_a)class_b(class_c)>class_c" means that + a relation from the source class "class_a" to the target class "class_c" can be + established via an additional class "class_b". Any inserted spaces or newlines will + be ignored. So the following paths are equivalent: + - "class_a (class_b)> class_b" + - "class_a + (class_b)> + class_b" + + A relation path consists of one or more elements. An element is a relationship + between two classes. Reference paths that establish a direct relationship between + source and target classes without the use of additional classes have only one + element (e.g. in string representations "class_a(class_b)>class_b" or + "class_a<(class_a)class_b"). 
More complex paths consist of multiple elements. + E.g. the path "class_a<(class_a)class_b(class_c)>class_c" can be decomposed + into the elements: "class_a<(class_a)class_b" and + "class_b(class_c)>class_c". + + The elements of a RelationPath are stored in the "elements" attribute as a list + of RelationPathElement objects that are optimized for programmatic use. + + The "source" attribute provides the source class of the path while the + "target" attribute provides the target class of the path. + """ + + def __init__(self, *, path_str: str): + """Construct relation path from a string-based representation.""" + self.path_str = clean_path_str(path_str=path_str) + self.elements = path_str_to_object_elements(path_str=self.path_str) + self.source = self.elements[0].source + self.target = self.elements[-1].target + + @classmethod + def validate(cls, value, info: ValidationInfo) -> "RelationPath": + """A validator for pydantic.""" + if isinstance(value, cls): + return value + + if not isinstance(value, str): + raise ValueError("A string is required.") + + try: + return cls(path_str=value) + except ValidationError as error: + raise ValueError(str(error)) from ValidationError + + @classmethod + def __get_validators__(cls): + """To get validators for pydantic""" + yield cls.validate + + @classmethod + def __get_pydantic_json_schema__( + cls, field_schema: dict, handler: GetJsonSchemaHandler + ): + """Modify the field schema for pydantic.""" + field_schema.update(type="string", pattern=PATH_PATTERN) + + def __hash__(self): + """Calculate a hash.""" + return hash(self.path_str) + + def __eq__(self, other: object): + """For comparisons.""" + if not isinstance(other, RelationPath): + return NotImplemented + + return self.path_str == other.path_str + + def __repr__(self): + """For representation.""" + return f"RelationPath(path_str='{self.path_str}')" + + def __str__(self): + """For string representation.""" + return self.path_str diff --git 
a/src/metldata/schemapack_/builtin_transformations/infer_relations/path/path_elements.py b/src/metldata/schemapack_/builtin_transformations/infer_relations/path/path_elements.py new file mode 100644 index 00000000..a56cbf4b --- /dev/null +++ b/src/metldata/schemapack_/builtin_transformations/infer_relations/path/path_elements.py @@ -0,0 +1,62 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Data models""" + +from enum import Enum + +from pydantic import BaseModel, Field + + +class RelationPathElementType(Enum): + """The type of RelationPathElements. + + Can be active, meaning the source class is referencing the target class using the + specified slot. + Or passive, meaning that the source class is referenced by the target class and the + slot is part of the target class. + """ + + ACTIVE = "active" + PASSIVE = "passive" + + +class RelationPathElement(BaseModel): + """A model describing an element of a relation path between classes of a + metadata model as further explained by the RelationPath. + """ + + type_: RelationPathElementType = Field( + ..., + description=( + "The type of relation. Active or passive as explained in the" + + " RelationPathElementType enum." + ), + ) + source: str = Field( + ..., description="The name of the source class that is referencing." 
+ ) + target: str = Field( + ..., description="The name of the target class that is referenced." + ) + property: str = Field( + ..., + description=( + "The name of the property that holds the relation." + + " In case of a active type, the property is part of the source class." + + " In case of a passive type, the property is part of the target class." + ), + ) diff --git a/src/metldata/schemapack_/builtin_transformations/infer_relations/path/path_str.py b/src/metldata/schemapack_/builtin_transformations/infer_relations/path/path_str.py new file mode 100644 index 00000000..c020bba8 --- /dev/null +++ b/src/metldata/schemapack_/builtin_transformations/infer_relations/path/path_str.py @@ -0,0 +1,192 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""Data models""" + +import re +from typing import Optional + +from metldata.schemapack_.builtin_transformations.infer_relations.path.path_elements import ( + RelationPathElement, + RelationPathElementType, +) + +NAME_PATTERN = r"(?!\d)\w+" +ACTIVE_ARROW_PATTERN = rf"\({NAME_PATTERN}\)>" +PASSIVE_ARROW_PATTERN = rf"<\({NAME_PATTERN}\)" +ARROW_PATTERN = rf"(({ACTIVE_ARROW_PATTERN})|({PASSIVE_ARROW_PATTERN}))" +ELEMENT_PATTERN = rf"{NAME_PATTERN}{ARROW_PATTERN}{NAME_PATTERN}" +PATH_RAW_CHAR_PATTERN = r"^[\w><\(\)]+$" +PATH_PATTERN = ( + rf"^{NAME_PATTERN}{ARROW_PATTERN}({NAME_PATTERN}{ARROW_PATTERN})*{NAME_PATTERN}$" +) + + +class ValidationError(RuntimeError): + """Raised when a path string was invalid""" + + +def clean_path_str(path_str: str) -> str: + """Cleanup whitespaces, newlines, etc.""" + return "".join(path_str.split()) + + +def validate_path_str_characters(path_str: str) -> None: + """Validates the characters of the an uncleaned path str. The path_str is assumed to + be cleaned. + + Raises: + ValidationError: if invalid. + """ + if not re.match(PATH_RAW_CHAR_PATTERN, path_str): + raise ValidationError( + f"The following path string contains invalid characters: {path_str}" + ) + + +def validate_path_str_format(path_str: str) -> None: + """Validates the format of the path str. The path_str is assumed to be cleaned. + + Raises: + ValidationError: if invalid. + """ + if not re.match(PATH_PATTERN, path_str): + raise ValidationError( + f"The following path string has an invalid format: {path_str}" + ) + + +def validate_string_element(string_element: str) -> None: + """Validates the format of a string-based path element. The path_str is assumed to + be cleaned. + + Raises: + ValidationError: if invalid. 
+ """ + if not re.match(rf"^{ELEMENT_PATTERN}$", string_element): + raise ValidationError( + "The following string-based path element has an invalid format: " + + string_element + ) + + +def extract_first_element(*, path_str: str) -> str: + """Extract the first element of a path_str. The path_str is assumed to be cleaned. + + Raises: + ValidationError: if no element can be extracted. + """ + match = re.match(rf"^({ELEMENT_PATTERN}).*$", path_str) + + if not match: + raise ValidationError(f"Cannot find element in path string: {path_str}") + + return match.group(1) + + +def get_target_class(*, path_str: str) -> str: + """Get the target class of a path str. The path_str is assumed to be cleaned.""" + match = re.match(rf"^.*?({NAME_PATTERN})$", path_str) + + if not match: + raise ValidationError(f"Cannot find target class of path string: {path_str}") + + return match.group(1) + + +def split_first_element(*, path_str: str) -> tuple[str, Optional[str]]: + """Return a tuple of the first element and the remaining path string. + Thereby, the target class of the first element is set as the source class of the + remaining path. + The second element is None if the provided path only contained one element. + The path_str is assumed to be cleaned. + """ + first_element = extract_first_element(path_str=path_str) + first_element_target_class = get_target_class(path_str=first_element) + + if first_element == path_str: + return first_element, None + + remaining_path = path_str[len(first_element) :] + remaining_path_extended = first_element_target_class + remaining_path + + return first_element, remaining_path_extended + + +def get_string_elements(*, path_str: str) -> list[str]: + """Decomposes a path string into elements in string repesentation. The path_str is + assumed to be cleaned. 
+ """ + elements: list[str] = [] + remaining_path = path_str + + # extract one element at a time: + while remaining_path: + element, remaining_path = split_first_element( # type: ignore + path_str=remaining_path + ) + elements.append(element) + + return elements + + +def get_element_type(*, string_element: str) -> RelationPathElementType: + """Infers the type of the provided string-based element.""" + validate_string_element(string_element) + + return ( + RelationPathElementType.ACTIVE + if ">" in string_element + else RelationPathElementType.PASSIVE + ) + + +def get_element_components(*, string_element: str) -> tuple[str, str, str]: + """Returns a tuple of the source, the slot, and the target of the string-based path + element. + """ + # remove the angle: + string_element_cleaned = string_element.replace(">", "").replace("<", "") + + # extract the source: + source, slot_and_target = string_element_cleaned.split("(") + + # extract slot and target: + slot, target = slot_and_target.split(")") + + return source, slot, target + + +def string_element_to_object(string_element: str) -> RelationPathElement: + """Translates a string-based path element into an object-based representation.""" + validate_string_element(string_element) + type_ = get_element_type(string_element=string_element) + source, slot, target = get_element_components(string_element=string_element) + + return RelationPathElement(type_=type_, source=source, property=slot, target=target) + + +def path_str_to_object_elements(path_str: str) -> list[RelationPathElement]: + """Translates a path string into a list of object-based elements. The path_str is + assumed to be cleaned. 
+ """ + validate_path_str_characters(path_str=path_str) + validate_path_str_format(path_str=path_str) + + string_elements = get_string_elements(path_str=path_str) + return [ + string_element_to_object(string_element) for string_element in string_elements + ] diff --git a/src/metldata/schemapack_/builtin_transformations/infer_relations/relations.py b/src/metldata/schemapack_/builtin_transformations/infer_relations/relations.py new file mode 100644 index 00000000..d883e2c8 --- /dev/null +++ b/src/metldata/schemapack_/builtin_transformations/infer_relations/relations.py @@ -0,0 +1,82 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Modelling inferred relations.""" + +from pydantic import BaseModel, ConfigDict, Field, model_validator + +from metldata.schemapack_.builtin_transformations.infer_relations.path.path import ( + RelationPath, +) + + +class RelationDetails(BaseModel): + """A base model for describing an inferred relation that is based on existing + relations. + """ + + path: RelationPath = Field( + ..., + description=( + "The path to reconstruct the new relation based on existing relations." + ), + ) + allow_multiple: bool = Field( + True, + description=( + "Whether multiple target resources to expect for this relation." 
class InferenceInstruction(RelationDetails):
    """A model for describing an inferred relation that is based on existing
    relations.

    Inherits `path` and `allow_multiple` from RelationDetails and pins the
    relation to a concrete source class, target class, and new property name.
    """

    # Instances are immutable once constructed:
    model_config = ConfigDict(frozen=True)

    source: str = Field(
        ..., description="The source class to which this relation should be added."
    )
    target: str = Field(..., description="The class targeted by this relation.")
    new_property: str = Field(
        ...,
        description=(
            "The name of the new property in the source to store the inferred relation."
        ),
    )

    # NOTE: with mode="after", pydantic passes the already-validated model
    # instance as the second argument (named `values` here):
    @model_validator(mode="after")
    @classmethod
    def validate_source_and_target(cls, values):
        """Validate that the source and target attributes are identical with the
        source and target specified in the path.
        """
        if values.source != values.path.source:
            raise ValueError(
                "The source is not identical with the source of the specified path."
            )

        if values.target != values.path.target:
            raise ValueError(
                "The target is not identical with the target of the specified path."
            )

        return values
class ModelTransformationError(RuntimeError):
    """Raised when a transformation fails while being applied to the
    schemapack-based model.

    Only raise this when the failure could not have been detected earlier by
    model assumption checks (otherwise the AssumptionsInsufficiencyError should
    be raised instead)."""


class DataTransformationError(RuntimeError):
    """Raised when a transformation fails while being applied to data in
    datapack format.

    Only raise this when the failure could not have been detected earlier by
    model assumption checks (otherwise the EvitableTransformationError should
    be raised instead)."""
class PreTransformValidationError(RuntimeError):
    """Raised when the validation of input data fails against the input model at the
    beginning of a data transformation."""

    def __init__(self, *, validation_error: schemapack.exceptions.ValidationError):
        """Initialize with the schemapack ValidationError."""
        message = (
            "Validation of input data failed against the input model:"
            f"\n{validation_error}"
        )
        super().__init__(message)


class PostTransformValidationError(RuntimeError):
    """Raised when the validation of transformed data fails against the transformed
    model at the end of a data transformation step."""

    def __init__(self, *, validation_error: schemapack.exceptions.ValidationError):
        """Initialize with the schemapack ValidationError."""
        message = (
            "Validation of transformed data failed against the transformed model:"
            f"\n{validation_error}"
        )
        super().__init__(message)
workflow config does not match the config class of the workflow definition. @@ -55,56 +80,63 @@ def __init__( self, transformation_definition: TransformationDefinition[Config], transformation_config: Config, - original_model: SchemaPack, + input_model: SchemaPack, ): """Initialize the TransformationHandler by checking the assumptions made on the - original model and transforming the model as described in the transformation + input model and transforming the model as described in the transformation definition. The transformed model is available at the `transformed_model` attribute. Raises: ModelAssumptionError: - if the assumptions made on the original model are not met. + if the assumptions made on the input model are not met. """ self._definition = transformation_definition self._config = transformation_config - self._original_model = original_model + self._input_model = input_model - self._definition.check_model_assumptions(self._original_model, self._config) + self._definition.check_model_assumptions(self._input_model, self._config) self.transformed_model = self._definition.transform_model( - self._original_model, self._config + self._input_model, self._config ) self._data_transformer = self._definition.data_transformer_factory( config=self._config, - original_model=self._original_model, + input_model=self._input_model, transformed_model=self.transformed_model, ) - self._original_data_validator = SchemaPackValidator( - schemapack=self._original_model - ) + self._input_data_validator = SchemaPackValidator(schemapack=self._input_model) self._transformed_data_validator = SchemaPackValidator( schemapack=self.transformed_model ) def transform_data(self, data: DataPack) -> DataPack: """Transforms data using the transformation definition. Validates the - original data against the original model and the transformed data + input data against the input model and the transformed data against the transformed model. Args: data: The data to be transformed. 
Raises: - schemapack.exceptions.ValidationError: - If validation of input data or transformed data fails against the - original or transformed model, respectively. + PreTransformValidation: + If validation of input data fails against the input model. + PostTransformValidation: + If validation of transformed data fails against the transformed model. DataTransformationError: if the transformation fails. """ - self._original_data_validator.validate(datapack=data) + try: + self._input_data_validator.validate(datapack=data) + except schemapack.exceptions.ValidationError as error: + raise PreTransformValidationError(validation_error=error) from error + transformed_data = self._data_transformer.transform(data=data) - self._transformed_data_validator.validate(datapack=transformed_data) + + try: + self._transformed_data_validator.validate(datapack=transformed_data) + except schemapack.exceptions.ValidationError as error: + raise PostTransformValidationError(validation_error=error) from error return transformed_data @@ -144,7 +176,7 @@ def resolve_workflow_step( step_name: str, workflow_definition: WorkflowDefinition, workflow_config: WorkflowConfig, - original_model: SchemaPack, + input_model: SchemaPack, ) -> ResolvedWorkflowStep: """Translates a workflow step given a workflow definition and a workflow config into a resolved workflow step. 
@@ -157,7 +189,7 @@ def resolve_workflow_step( transformation_handler = TransformationHandler( transformation_definition=workflow_step.transformation_definition, transformation_config=transformation_config, - original_model=original_model, + input_model=input_model, ) return ResolvedWorkflowStep( transformation_handler=transformation_handler, @@ -168,7 +200,7 @@ def resolve_workflow_step( def resolve_workflow( workflow_definition: WorkflowDefinition, - original_model: SchemaPack, + input_model: SchemaPack, workflow_config: WorkflowConfig, ) -> ResolvedWorkflow: """Translates a workflow definition given an input model and a workflow config into @@ -182,7 +214,7 @@ def resolve_workflow( for step_name in workflow_definition.step_order: workflow_step = workflow_definition.steps[step_name] input_model = ( - original_model + input_model if workflow_step.input is None else resolved_steps[ workflow_step.input @@ -194,7 +226,7 @@ def resolve_workflow( step_name=step_name, workflow_definition=workflow_definition, workflow_config=workflow_config, - original_model=input_model, + input_model=input_model, ) return ResolvedWorkflow( @@ -222,7 +254,7 @@ def __init__( self, workflow_definition: WorkflowDefinition, workflow_config: WorkflowConfig, - original_model: SchemaPack, + input_model: SchemaPack, ): """Initialize the WorkflowHandler with a workflow deinition, a matching config, and a model. 
The workflow definition is translated into a @@ -230,7 +262,7 @@ def __init__( """ self._resolved_workflow = resolve_workflow( workflow_definition=workflow_definition, - original_model=original_model, + input_model=input_model, workflow_config=workflow_config, ) diff --git a/tests/schemapack_/builtin_transformations/__init__.py b/tests/schemapack_/builtin_transformations/__init__.py new file mode 100644 index 00000000..e01cef3b --- /dev/null +++ b/tests/schemapack_/builtin_transformations/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Test the builtin_transformations sub-package.""" diff --git a/tests/schemapack_/builtin_transformations/infer_relations/__init__.py b/tests/schemapack_/builtin_transformations/infer_relations/__init__.py new file mode 100644 index 00000000..222a85e0 --- /dev/null +++ b/tests/schemapack_/builtin_transformations/infer_relations/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Test the infer_relations sub-package.""" diff --git a/tests/schemapack_/builtin_transformations/infer_relations/path/__init__.py b/tests/schemapack_/builtin_transformations/infer_relations/path/__init__.py new file mode 100644 index 00000000..762a33f4 --- /dev/null +++ b/tests/schemapack_/builtin_transformations/infer_relations/path/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""Test the path sub-package.""" diff --git a/tests/schemapack_/builtin_transformations/infer_relations/path/test_config.py b/tests/schemapack_/builtin_transformations/infer_relations/path/test_config.py new file mode 100644 index 00000000..8eee530b --- /dev/null +++ b/tests/schemapack_/builtin_transformations/infer_relations/path/test_config.py @@ -0,0 +1,77 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""Test relations utils.""" + +from metldata.schemapack_.builtin_transformations.infer_relations.config import ( + RelationInferenceConfig, +) +from metldata.schemapack_.builtin_transformations.infer_relations.path.path import ( + RelationPath, +) +from metldata.schemapack_.builtin_transformations.infer_relations.relations import ( + InferenceInstruction, +) + + +def test_config(): + """Test the RelationInferenceConfig class.""" + + inferred_relations = { + "class_a": { + "class_d": { + "path": "class_a(class_b)>class_b(class_d)>class_d", + "allow_multiple": False, + }, + "class_c": { + "path": "class_a(class_b)>class_b<(class_c)class_c", + "allow_multiple": True, + }, + }, + "class_b": { + "class_c": { + "path": "class_b<(class_c)class_c", + "allow_multiple": True, + } + }, + } + expected_refs = [ + InferenceInstruction( + source="class_a", + target="class_d", + path=RelationPath(path_str="class_a(class_b)>class_b(class_d)>class_d"), + new_property="class_d", + allow_multiple=False, + ), + InferenceInstruction( + source="class_a", + target="class_c", + path=RelationPath(path_str="class_a(class_b)>class_b<(class_c)class_c"), + new_property="class_c", + allow_multiple=True, + ), + InferenceInstruction( + source="class_b", + target="class_c", + path=RelationPath(path_str="class_b<(class_c)class_c"), + new_property="class_c", + allow_multiple=True, + ), + ] + + config = RelationInferenceConfig(inferred_relations=inferred_relations) # type: ignore + observed_refs = config.inference_instructions + assert expected_refs == observed_refs diff --git a/tests/schemapack_/builtin_transformations/infer_relations/path/test_path.py b/tests/schemapack_/builtin_transformations/infer_relations/path/test_path.py new file mode 100644 index 00000000..a2b27c0a --- /dev/null +++ b/tests/schemapack_/builtin_transformations/infer_relations/path/test_path.py @@ -0,0 +1,155 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human 
Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Test the path module.""" + +from contextlib import nullcontext + +import pytest +from pydantic import BaseModel + +from metldata.schemapack_.builtin_transformations.infer_relations.path.path import ( + RelationPath, +) +from metldata.schemapack_.builtin_transformations.infer_relations.path.path_elements import ( + RelationPathElement, + RelationPathElementType, +) + + +@pytest.mark.parametrize( + "path_str, expected_elements, expected_source, expected_target", + [ + ( + "class_a(class_b)>class_b", + [ + RelationPathElement( + type_=RelationPathElementType.ACTIVE, + source="class_a", + property="class_b", + target="class_b", + ) + ], + "class_a", + "class_b", + ), + ( + """class_a + (class_b) > + class_b""", # containing whitespaces + [ + RelationPathElement( + type_=RelationPathElementType.ACTIVE, + source="class_a", + property="class_b", + target="class_b", + ) + ], + "class_a", + "class_b", + ), + ( + "class_a<(class_a)class_b", + [ + RelationPathElement( + type_=RelationPathElementType.PASSIVE, + source="class_a", + property="class_a", + target="class_b", + ) + ], + "class_a", + "class_b", + ), + ( + "class_a(class_b)>class_b(class_c)>class_c", + [ + RelationPathElement( + type_=RelationPathElementType.ACTIVE, + source="class_a", + property="class_b", + target="class_b", + ), + RelationPathElement( + type_=RelationPathElementType.ACTIVE, + source="class_b", + property="class_c", + 
target="class_c", + ), + ], + "class_a", + "class_c", + ), + ( + "class_a(class_b)>class_b<(class_b)class_c", + [ + RelationPathElement( + type_=RelationPathElementType.ACTIVE, + source="class_a", + property="class_b", + target="class_b", + ), + RelationPathElement( + type_=RelationPathElementType.PASSIVE, + source="class_b", + property="class_b", + target="class_c", + ), + ], + "class_a", + "class_c", + ), + ], +) +def test_reference_path( + path_str: str, + expected_elements: RelationPathElement, + expected_source: str, + expected_target: str, +): + """Test the RelationPath class.""" + + observed_path = RelationPath(path_str=path_str) + assert observed_path.elements == expected_elements + assert observed_path.source == expected_source + assert observed_path.target == expected_target + + +@pytest.mark.parametrize( + "path_str, is_valid", + [ + ("class_a(class_b)>class_b", True), + ("class_a<(class_a)class_b", True), + ("class_a(class_b)>class_b(class_c)>class_c", True), + (12312, False), + ("class_a<(class_b)>class_b", False), + ("(class_b)>class_b(class_c)>class_c", False), + ], +) +def test_reference_path_pydantic(path_str: str, is_valid: bool): + """Test the RelationPath class when used with pydantic.""" + + class ExampleModel(BaseModel): + """Some example model.""" + + path: RelationPath + + with nullcontext() if is_valid else pytest.raises(ValueError): + observed_path = ExampleModel(path=path_str).path # type: ignore + + if is_valid: + expected_path = RelationPath(path_str=path_str) + assert observed_path == expected_path diff --git a/tests/schemapack_/builtin_transformations/infer_relations/path/test_path_str.py b/tests/schemapack_/builtin_transformations/infer_relations/path/test_path_str.py new file mode 100644 index 00000000..003b0230 --- /dev/null +++ b/tests/schemapack_/builtin_transformations/infer_relations/path/test_path_str.py @@ -0,0 +1,324 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human 
Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Test the path_str module.""" + +from contextlib import nullcontext +from typing import Optional + +import pytest + +from metldata.schemapack_.builtin_transformations.infer_relations.path.path_elements import ( + RelationPathElement, + RelationPathElementType, +) +from metldata.schemapack_.builtin_transformations.infer_relations.path.path_str import ( + ValidationError, + extract_first_element, + get_element_components, + get_element_type, + get_string_elements, + get_target_class, + path_str_to_object_elements, + split_first_element, + string_element_to_object, + validate_path_str_characters, + validate_path_str_format, + validate_string_element, +) + + +@pytest.mark.parametrize( + "path_str, is_valid", + [ + ("class_a(class_b)>class_b", True), + ("class_a<(class_a)class_b", True), + ("class_1(class_2)>class_2", True), + ("ClassA(class_b)>ClassB", True), + ("class-a(has-class_b)>class-b", False), + ("class_a.class_b>class_b", False), + ], +) +def test_validate_path_str_characters(path_str: str, is_valid: bool): + """Test the validate_path_str_characters method.""" + + with nullcontext() if is_valid else pytest.raises(ValidationError): + validate_path_str_characters(path_str) + + +@pytest.mark.parametrize( + "path_str, is_valid", + [ + ("class_a(class_b)>class_b", True), + ("class_a<(class_a)class_b", True), + ("class_a(class_b)>class_b(class_c)>class_c", True), + 
("class_a<(class_a)class_b(class_c)>class_c", True), + ("class_a<(class_a)class_b<(class_b)class_c", True), + ( + "class_a(class_b)>class_b(class_c)>class_c(class_d)>class_d", + True, + ), + ("class_a<(class_b)>class_b", False), + ("class_a>class_b", False), + ("class_a>(class_a)class_b", False), + ("class_a(class_b)class_b(class_c)>", False), + ("(class_b)>class_b(class_c)>class_c", False), + ("class_a(class_b>class_b", False), + ], +) +def test_validate_path_str_format(path_str: str, is_valid: bool): + """Test the validate_path_str_format method.""" + + with nullcontext() if is_valid else pytest.raises(ValidationError): + validate_path_str_format(path_str) + + +@pytest.mark.parametrize( + "path_str, expected_first_element", + [ + ("class_a(class_b)>class_b", "class_a(class_b)>class_b"), + ("class_a<(class_a)class_b", "class_a<(class_a)class_b"), + ( + "class_a(class_b)>class_b(class_c)>class_c", + "class_a(class_b)>class_b", + ), + ], +) +def test_extract_first_element(path_str: str, expected_first_element: str): + """Test the extract_first_element method.""" + + observed_first_element = extract_first_element(path_str=path_str) + assert observed_first_element == expected_first_element + + +@pytest.mark.parametrize( + "path_str, expected_target_class", + [ + ("class_a(class_b)>class_b", "class_b"), + ("class_a<(class_a)class_b", "class_b"), + ( + "class_a(class_b)>class_b(class_c)>class_c", + "class_c", + ), + ], +) +def test_get_target_class(path_str: str, expected_target_class: str): + """Test the get_target_class method.""" + + observed_target_class = get_target_class(path_str=path_str) + assert observed_target_class == expected_target_class + + +@pytest.mark.parametrize( + "path_str, expected_first_element, expected_remaining_path", + [ + ("class_a(class_b)>class_b", "class_a(class_b)>class_b", None), + ("class_a<(class_a)class_b", "class_a<(class_a)class_b", None), + ( + "class_a(class_b)>class_b(class_c)>class_c", + "class_a(class_b)>class_b", + 
"class_b(class_c)>class_c", + ), + ], +) +def test_split_first_element( + path_str: str, expected_first_element: str, expected_remaining_path: Optional[str] +): + """Test the split_first_element method.""" + + observed_first_element, observed_remaining_path = split_first_element( + path_str=path_str + ) + assert observed_first_element == expected_first_element + assert observed_remaining_path == expected_remaining_path + + +@pytest.mark.parametrize( + "path_str, expected_elements", + [ + ("class_a(class_b)>class_b", ["class_a(class_b)>class_b"]), + ("class_a<(class_a)class_b", ["class_a<(class_a)class_b"]), + ( + "class_a(class_b)>class_b(class_c)>class_c", + [ + "class_a(class_b)>class_b", + "class_b(class_c)>class_c", + ], + ), + ], +) +def test_get_string_elements(path_str: str, expected_elements: list[str]): + """Test the get_string_elements method.""" + + observed_elements = get_string_elements(path_str=path_str) + assert observed_elements == expected_elements + + +@pytest.mark.parametrize( + "string_element, is_valid", + [ + ("class_a(class_b)>class_b", True), + ("class_a<(class_a)class_b", True), + ("class_a<(class_a)>class_b", False), + ("class_a>class_b", False), + ("class_a(class_b)>class_b(class_c)>class_c", False), + ], +) +def test_validate_string_element(string_element: str, is_valid: bool): + """Test the validate_string_element method.""" + + with nullcontext() if is_valid else pytest.raises(ValidationError): + validate_string_element(string_element) + + +@pytest.mark.parametrize( + "string_element, expected_type", + [ + ("class_a(class_b)>class_b", RelationPathElementType.ACTIVE), + ("class_a<(class_a)class_b", RelationPathElementType.PASSIVE), + ], +) +def test_get_element_type(string_element: str, expected_type: RelationPathElementType): + """Test the get_element_type method.""" + + observed_type = get_element_type(string_element=string_element) + assert observed_type == expected_type + + +@pytest.mark.parametrize( + "string_element, 
expected_source, expected_property, expected_target", + [ + ("class_a(class_b)>class_b", "class_a", "class_b", "class_b"), + ("class_a<(class_a)class_b", "class_a", "class_a", "class_b"), + ], +) +def test_get_element_components( + string_element: str, + expected_source: str, + expected_property: str, + expected_target: str, +): + """Test the get_element_components method.""" + + observed_source, observed_property, observed_target = get_element_components( + string_element=string_element + ) + assert observed_source == expected_source + assert observed_property == expected_property + assert observed_target == expected_target + + +@pytest.mark.parametrize( + "string_element, expected_object", + [ + ( + "class_a(class_b)>class_b", + RelationPathElement( + type_=RelationPathElementType.ACTIVE, + source="class_a", + property="class_b", + target="class_b", + ), + ), + ( + "class_a<(class_a)class_b", + RelationPathElement( + type_=RelationPathElementType.PASSIVE, + source="class_a", + property="class_a", + target="class_b", + ), + ), + ], +) +def test_string_element_to_object( + string_element: str, expected_object: RelationPathElement +): + """Test the string_element_to_object method.""" + + observed_object = string_element_to_object(string_element) + assert observed_object == expected_object + + +@pytest.mark.parametrize( + "path_str, expected_elements", + [ + ( + "class_a(class_b)>class_b", + [ + RelationPathElement( + type_=RelationPathElementType.ACTIVE, + source="class_a", + property="class_b", + target="class_b", + ) + ], + ), + ( + "class_a<(class_a)class_b", + [ + RelationPathElement( + type_=RelationPathElementType.PASSIVE, + source="class_a", + property="class_a", + target="class_b", + ) + ], + ), + ( + "class_a(class_b)>class_b(class_c)>class_c", + [ + RelationPathElement( + type_=RelationPathElementType.ACTIVE, + source="class_a", + property="class_b", + target="class_b", + ), + RelationPathElement( + type_=RelationPathElementType.ACTIVE, + source="class_b", 
+ property="class_c", + target="class_c", + ), + ], + ), + ( + "class_a(class_b)>class_b<(class_b)class_c", + [ + RelationPathElement( + type_=RelationPathElementType.ACTIVE, + source="class_a", + property="class_b", + target="class_b", + ), + RelationPathElement( + type_=RelationPathElementType.PASSIVE, + source="class_b", + property="class_b", + target="class_c", + ), + ], + ), + ], +) +def test_path_str_to_object_elements( + path_str: str, expected_elements: RelationPathElement +): + """Test the path_str_to_object_elements method.""" + + observed_elements = path_str_to_object_elements(path_str) + assert observed_elements == expected_elements diff --git a/tests/schemapack_/builtin_transformations/test_happy.py b/tests/schemapack_/builtin_transformations/test_happy.py new file mode 100644 index 00000000..4b9bbf41 --- /dev/null +++ b/tests/schemapack_/builtin_transformations/test_happy.py @@ -0,0 +1,61 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""Test the builtin transformations using pre-defined test cases.""" + +import pytest + +from metldata.schemapack_.transform.handling import TransformationHandler +from tests.schemapack_.fixtures.transformations import ( + TRANSFORMATION_TEST_CASES, + TransformationTestCase, +) + + +@pytest.mark.parametrize( + "test_case", + TRANSFORMATION_TEST_CASES, + ids=str, +) +def test_model_transformations( + test_case: TransformationTestCase, +): + """Test the happy path of transforming a model.""" + + handler = TransformationHandler( + transformation_definition=test_case.transformation_definition, + transformation_config=test_case.config, + input_model=test_case.input_model, + ) + transformed_model = handler.transformed_model + + assert transformed_model == test_case.transformed_model + + +@pytest.mark.parametrize("test_case", TRANSFORMATION_TEST_CASES, ids=str) +def test_data_transformations( + test_case: TransformationTestCase, +): + """Test the happy path of transforming data for a model.""" + + handler = TransformationHandler( + transformation_definition=test_case.transformation_definition, + transformation_config=test_case.config, + input_model=test_case.input_model, + ) + transformed_data = handler.transform_data(test_case.input_data) + + assert transformed_data == test_case.transformed_data diff --git a/tests/schemapack_/fixtures/data.py b/tests/schemapack_/fixtures/data.py index 7074529c..3f80b1f3 100644 --- a/tests/schemapack_/fixtures/data.py +++ b/tests/schemapack_/fixtures/data.py @@ -30,5 +30,6 @@ def _get_example_data(name: str) -> DataPack: return load_datapack(EXAMPLE_DATA_DIR / f"{name}.datapack.yaml") -VALID_MINIMAL_DATA = _get_example_data("valid_minimal") +MINIMAL_DATA = _get_example_data("minimal") +ADVANCED_DATA = _get_example_data("advanced") INVALID_MINIMAL_DATA = _get_example_data("invalid_minimal") diff --git a/tests/schemapack_/fixtures/example_content_schemas/Dataset.schema.json 
b/tests/schemapack_/fixtures/example_content_schemas/Dataset.schema.json new file mode 100644 index 00000000..9358f832 --- /dev/null +++ b/tests/schemapack_/fixtures/example_content_schemas/Dataset.schema.json @@ -0,0 +1,17 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "A dataset that is a collection of files.", + "properties": { + "alias": { + "type": "string" + }, + "dac_contact": { + "type": "string" + } + }, + "required": [ + "alias" + ], + "type": "object" +} diff --git a/tests/schemapack_/fixtures/example_content_schemas/Experiment.schema.json b/tests/schemapack_/fixtures/example_content_schemas/Experiment.schema.json new file mode 100644 index 00000000..fdb2de4f --- /dev/null +++ b/tests/schemapack_/fixtures/example_content_schemas/Experiment.schema.json @@ -0,0 +1,17 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "An experiment containing one or multiple samples.", + "properties": { + "alias": { + "type": "string" + }, + "description": { + "type": "string" + } + }, + "required": [ + "alias" + ], + "type": "object" +} diff --git a/tests/schemapack_/fixtures/example_content_schemas/File.schema.json b/tests/schemapack_/fixtures/example_content_schemas/File.schema.json new file mode 100644 index 00000000..5ca385db --- /dev/null +++ b/tests/schemapack_/fixtures/example_content_schemas/File.schema.json @@ -0,0 +1,30 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "A file is an object that contains information generated from a process, either an Experiment or an Analysis.", + "properties": { + "alias": { + "type": "string" + }, + "checksum": { + "type": "string" + }, + "filename": { + "type": "string" + }, + "format": { + "type": "string" + }, + "size": { + "type": "integer" + } + }, + "required": [ + "alias", + "filename", + "format", + "checksum", + "size" + ], + "type": 
"object" +} diff --git a/tests/schemapack_/fixtures/example_content_schemas/Sample.schema.json b/tests/schemapack_/fixtures/example_content_schemas/Sample.schema.json new file mode 100644 index 00000000..54880cbf --- /dev/null +++ b/tests/schemapack_/fixtures/example_content_schemas/Sample.schema.json @@ -0,0 +1,17 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "A sample used to generate files in the context of an experiment.", + "properties": { + "alias": { + "type": "string" + }, + "description": { + "type": "string" + } + }, + "required": [ + "alias" + ], + "type": "object" +} diff --git a/tests/schemapack_/fixtures/example_data/advanced.datapack.yaml b/tests/schemapack_/fixtures/example_data/advanced.datapack.yaml new file mode 100644 index 00000000..2bee3e10 --- /dev/null +++ b/tests/schemapack_/fixtures/example_data/advanced.datapack.yaml @@ -0,0 +1,56 @@ +datapack: 0.1.0 +resources: + File: + file_a: + content: + alias: file_a + filename: file_a.fastq + format: FASTQ + checksum: 1a5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92b8 + size: 12321 + file_b: + content: + alias: file_b + filename: file_b.fastq + format: FASTQ + checksum: 2b5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92c9 + size: 12314 + file_c: + content: + alias: file_c + filename: file_c.fastq + format: FASTQ + checksum: a9c24870071da03f78515e6197048f3a2172e90e597e9250cd01a0cb8f0986ed + size: 12123 + Dataset: + dataset_1: + content: + alias: dataset_1 + dac_contact: dac@example.org + relations: + files: + - file_a + - file_b + - file_c + Sample: + sample_x: + content: + alias: sample_x + relations: + files: + - file_a + - file_b + sample_y: + content: + alias: sample_y + relations: + files: + - file_c + Experiment: + experiment_i: + content: + alias: experiment_i + relations: + samples: + - sample_x + - sample_y diff --git a/tests/schemapack_/fixtures/example_data/minimal.datapack.yaml 
b/tests/schemapack_/fixtures/example_data/minimal.datapack.yaml new file mode 100644 index 00000000..21cca892 --- /dev/null +++ b/tests/schemapack_/fixtures/example_data/minimal.datapack.yaml @@ -0,0 +1,40 @@ +datapack: 0.1.0 +resources: + File: + example_file_a: + content: + alias: example_file_a + filename: example_file_a.fastq + format: FASTQ + checksum: 1a5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92b8 + size: 12321 + example_file_b: + content: + alias: example_file_b + filename: example_file_b.fastq + format: FASTQ + checksum: 2b5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92c9 + size: 12314 + example_file_c: + content: + alias: example_file_c + filename: example_file_c.fastq + format: FASTQ + checksum: a9c24870071da03f78515e6197048f3a2172e90e597e9250cd01a0cb8f0986ed + size: 12123 + Dataset: + example_dataset_1: + content: + alias: example_dataset_1 + dac_contact: dac@example.org + relations: + files: + - example_file_a + - example_file_b + example_dataset_2: + content: + alias: example_dataset_2 + dac_contact: dac@example.org + relations: + files: + - example_file_c diff --git a/tests/schemapack_/fixtures/example_models/advanced.schemapack.yaml b/tests/schemapack_/fixtures/example_models/advanced.schemapack.yaml new file mode 100644 index 00000000..7039d474 --- /dev/null +++ b/tests/schemapack_/fixtures/example_models/advanced.schemapack.yaml @@ -0,0 +1,31 @@ +# a simple schemapack with the content schemas being embedded +schemapack: 0.1.0 +classes: + File: + id: + from_content: alias + content: ../example_content_schemas/File.schema.json + Dataset: + id: + from_content: alias + content: ../example_content_schemas/Dataset.schema.json + relations: + files: + to: File + cardinality: many_to_many + Sample: + id: + from_content: alias + content: ../example_content_schemas/Sample.schema.json + relations: + files: + to: File + cardinality: one_to_many + Experiment: + id: + from_content: alias + content: 
../example_content_schemas/Experiment.schema.json + relations: + samples: + to: Sample + cardinality: one_to_many diff --git a/tests/schemapack_/fixtures/example_models/minimal.schemapack.yaml b/tests/schemapack_/fixtures/example_models/minimal.schemapack.yaml index d79663d6..421d5982 100644 --- a/tests/schemapack_/fixtures/example_models/minimal.schemapack.yaml +++ b/tests/schemapack_/fixtures/example_models/minimal.schemapack.yaml @@ -4,45 +4,11 @@ classes: File: id: from_content: alias - content: - "$schema": "http://json-schema.org/draft-07/schema#" - additionalProperties: false - description: A file is an object that contains information generated from a process, - either an Experiment or an Analysis. - properties: - alias: - type: string - checksum: - type: string - filename: - type: string - format: - type: string - size: - type: integer - required: - - alias - - filename - - format - - checksum - - size - type: object + content: ../example_content_schemas/File.schema.json Dataset: id: from_content: alias - content: - "$schema": "http://json-schema.org/draft-07/schema#" - additionalProperties: false - description: A dataset that is a collection of files. - properties: - alias: - type: string - dac_contact: - type: string - required: - - alias - type: object - + content: ../example_content_schemas/Dataset.schema.json relations: files: to: File diff --git a/tests/schemapack_/fixtures/example_transformations/Readme.md b/tests/schemapack_/fixtures/example_transformations/Readme.md new file mode 100644 index 00000000..1ab38d08 --- /dev/null +++ b/tests/schemapack_/fixtures/example_transformations/Readme.md @@ -0,0 +1,33 @@ + + +This directory contains test cases for testing builtin transformation. + +Names of sub-directories correspond to the transformation names. +Each sub-sub-directory represents a test case. 
+A test case is defined by the following five files: +- `config.yaml` - the transformation config +- `input.datapack.yaml` - the input data for this transformation, if not present, + the [../example_data/advanced.datapack.yaml](../example_data/advanced.datapack.yaml) + is used +- `input.schemapack.yaml` - the model for the input data, if not present, + the + [../example_models/advanced.schemapack.yaml](../example_models/advanced.schemapack.yaml) + is used +- `transformed.datapack.yaml` - the expected data output of the transformation +- `transformed.schemapack.yaml` - the expected model output of the transformation diff --git a/tests/schemapack_/fixtures/example_transformations/infer_relations/active_relations/config.yaml b/tests/schemapack_/fixtures/example_transformations/infer_relations/active_relations/config.yaml new file mode 100644 index 00000000..6c49a4ab --- /dev/null +++ b/tests/schemapack_/fixtures/example_transformations/infer_relations/active_relations/config.yaml @@ -0,0 +1,5 @@ +inferred_relations: + Experiment: + files: + path: "Experiment(samples)>Sample(files)>File" + allow_multiple: true diff --git a/tests/schemapack_/fixtures/example_transformations/infer_relations/active_relations/transformed.datapack.yaml b/tests/schemapack_/fixtures/example_transformations/infer_relations/active_relations/transformed.datapack.yaml new file mode 100644 index 00000000..e7787f8f --- /dev/null +++ b/tests/schemapack_/fixtures/example_transformations/infer_relations/active_relations/transformed.datapack.yaml @@ -0,0 +1,60 @@ +datapack: 0.1.0 +resources: + File: + file_a: + content: + alias: file_a + filename: file_a.fastq + format: FASTQ + checksum: 1a5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92b8 + size: 12321 + file_b: + content: + alias: file_b + filename: file_b.fastq + format: FASTQ + checksum: 2b5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92c9 + size: 12314 + file_c: + content: + alias: file_c + filename: file_c.fastq + format:
FASTQ + checksum: a9c24870071da03f78515e6197048f3a2172e90e597e9250cd01a0cb8f0986ed + size: 12123 + Dataset: + dataset_1: + content: + alias: dataset_1 + dac_contact: dac@example.org + relations: + files: + - file_a + - file_b + - file_c + Sample: + sample_x: + content: + alias: sample_x + relations: + files: + - file_a + - file_b + sample_y: + content: + alias: sample_y + relations: + files: + - file_c + Experiment: + experiment_i: + content: + alias: experiment_i + relations: + samples: + - sample_x + - sample_y + files: # <- + - file_a + - file_b + - file_c diff --git a/tests/schemapack_/fixtures/example_transformations/infer_relations/active_relations/transformed.schemapack.yaml b/tests/schemapack_/fixtures/example_transformations/infer_relations/active_relations/transformed.schemapack.yaml new file mode 100644 index 00000000..f9631470 --- /dev/null +++ b/tests/schemapack_/fixtures/example_transformations/infer_relations/active_relations/transformed.schemapack.yaml @@ -0,0 +1,33 @@ +schemapack: 0.1.0 +classes: + File: + id: + from_content: alias + content: ../../../example_content_schemas/File.schema.json + Dataset: + id: + from_content: alias + content: ../../../example_content_schemas/Dataset.schema.json + relations: + files: + to: File + cardinality: many_to_many + Sample: + id: + from_content: alias + content: ../../../example_content_schemas/Sample.schema.json + relations: + files: + to: File + cardinality: one_to_many + Experiment: + id: + from_content: alias + content: ../../../example_content_schemas/Experiment.schema.json + relations: + samples: + to: Sample + cardinality: one_to_many + files: # <- + to: File + cardinality: many_to_many diff --git a/tests/schemapack_/fixtures/example_transformations/infer_relations/complex_relations/config.yaml b/tests/schemapack_/fixtures/example_transformations/infer_relations/complex_relations/config.yaml new file mode 100644 index 00000000..96476277 --- /dev/null +++ 
b/tests/schemapack_/fixtures/example_transformations/infer_relations/complex_relations/config.yaml @@ -0,0 +1,5 @@ +inferred_relations: + Dataset: + samples: + path: "Dataset(files)>File<(files)Sample" + allow_multiple: true diff --git a/tests/schemapack_/fixtures/example_transformations/infer_relations/complex_relations/transformed.datapack.yaml b/tests/schemapack_/fixtures/example_transformations/infer_relations/complex_relations/transformed.datapack.yaml new file mode 100644 index 00000000..d61ec4cf --- /dev/null +++ b/tests/schemapack_/fixtures/example_transformations/infer_relations/complex_relations/transformed.datapack.yaml @@ -0,0 +1,59 @@ +datapack: 0.1.0 +resources: + File: + file_a: + content: + alias: file_a + filename: file_a.fastq + format: FASTQ + checksum: 1a5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92b8 + size: 12321 + file_b: + content: + alias: file_b + filename: file_b.fastq + format: FASTQ + checksum: 2b5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92c9 + size: 12314 + file_c: + content: + alias: file_c + filename: file_c.fastq + format: FASTQ + checksum: a9c24870071da03f78515e6197048f3a2172e90e597e9250cd01a0cb8f0986ed + size: 12123 + Dataset: + dataset_1: + content: + alias: dataset_1 + dac_contact: dac@example.org + relations: + files: + - file_a + - file_b + - file_c + samples: + - sample_x + - sample_y + Sample: + sample_x: + content: + alias: sample_x + relations: + files: + - file_a + - file_b + sample_y: + content: + alias: sample_y + relations: + files: + - file_c + Experiment: + experiment_i: + content: + alias: experiment_i + relations: + samples: + - sample_x + - sample_y diff --git a/tests/schemapack_/fixtures/example_transformations/infer_relations/complex_relations/transformed.schemapack.yaml b/tests/schemapack_/fixtures/example_transformations/infer_relations/complex_relations/transformed.schemapack.yaml new file mode 100644 index 00000000..9d3e1d7a --- /dev/null +++ 
b/tests/schemapack_/fixtures/example_transformations/infer_relations/complex_relations/transformed.schemapack.yaml @@ -0,0 +1,33 @@ +schemapack: 0.1.0 +classes: + File: + id: + from_content: alias + content: ../../../example_content_schemas/File.schema.json + Dataset: + id: + from_content: alias + content: ../../../example_content_schemas/Dataset.schema.json + relations: + files: + to: File + cardinality: many_to_many + samples: # <- + to: Sample + cardinality: many_to_many + Sample: + id: + from_content: alias + content: ../../../example_content_schemas/Sample.schema.json + relations: + files: + to: File + cardinality: one_to_many + Experiment: + id: + from_content: alias + content: ../../../example_content_schemas/Experiment.schema.json + relations: + samples: + to: Sample + cardinality: one_to_many diff --git a/tests/schemapack_/fixtures/example_transformations/infer_relations/passive_relations/config.yaml b/tests/schemapack_/fixtures/example_transformations/infer_relations/passive_relations/config.yaml new file mode 100644 index 00000000..e739f565 --- /dev/null +++ b/tests/schemapack_/fixtures/example_transformations/infer_relations/passive_relations/config.yaml @@ -0,0 +1,5 @@ +inferred_relations: + File: + datasets: + path: "File<(files)Dataset" + allow_multiple: true diff --git a/tests/schemapack_/fixtures/example_transformations/infer_relations/passive_relations/input.datapack.yaml b/tests/schemapack_/fixtures/example_transformations/infer_relations/passive_relations/input.datapack.yaml new file mode 100644 index 00000000..62d45432 --- /dev/null +++ b/tests/schemapack_/fixtures/example_transformations/infer_relations/passive_relations/input.datapack.yaml @@ -0,0 +1,42 @@ +datapack: 0.1.0 +resources: + File: + file_a: + content: + alias: file_a + filename: file_a.fastq + format: FASTQ + checksum: 1a5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92b8 + size: 12321 + file_b: + content: + alias: file_b + filename: file_b.fastq + format: FASTQ + 
checksum: 2b5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92c9 + size: 12314 + file_c: + content: + alias: file_c + filename: file_c.fastq + format: FASTQ + checksum: a9c24870071da03f78515e6197048f3a2172e90e597e9250cd01a0cb8f0986ed + size: 12123 + Dataset: + dataset_1: + content: + alias: dataset_1 + dac_contact: dac@example.org + relations: + files: + - file_a + - file_b + dataset_2: + content: + alias: dataset_2 + dac_contact: dac@example.org + relations: + files: + - file_a + Sample: {} + Experiment: {} diff --git a/tests/schemapack_/fixtures/example_transformations/infer_relations/passive_relations/transformed.datapack.yaml b/tests/schemapack_/fixtures/example_transformations/infer_relations/passive_relations/transformed.datapack.yaml new file mode 100644 index 00000000..9522b0be --- /dev/null +++ b/tests/schemapack_/fixtures/example_transformations/infer_relations/passive_relations/transformed.datapack.yaml @@ -0,0 +1,51 @@ +datapack: 0.1.0 +resources: + File: + file_a: + content: + alias: file_a + filename: file_a.fastq + format: FASTQ + checksum: 1a5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92b8 + size: 12321 + relations: # <- + datasets: + - dataset_1 + - dataset_2 + file_b: + content: + alias: file_b + filename: file_b.fastq + format: FASTQ + checksum: 2b5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92c9 + size: 12314 + relations: # <- + datasets: + - dataset_1 + file_c: + content: + alias: file_c + filename: file_c.fastq + format: FASTQ + checksum: a9c24870071da03f78515e6197048f3a2172e90e597e9250cd01a0cb8f0986ed + size: 12123 + relations: # <- + datasets: [] + Dataset: + dataset_1: + content: + alias: dataset_1 + dac_contact: dac@example.org + relations: + files: + - file_a + - file_b + dataset_2: + content: + alias: dataset_2 + dac_contact: dac@example.org + relations: + files: + - file_a + Sample: {} + Experiment: {} diff --git 
a/tests/schemapack_/fixtures/example_transformations/infer_relations/passive_relations/transformed.schemapack.yaml b/tests/schemapack_/fixtures/example_transformations/infer_relations/passive_relations/transformed.schemapack.yaml new file mode 100644 index 00000000..1209e546 --- /dev/null +++ b/tests/schemapack_/fixtures/example_transformations/infer_relations/passive_relations/transformed.schemapack.yaml @@ -0,0 +1,34 @@ +schemapack: 0.1.0 +classes: + File: + id: + from_content: alias + content: ../../../example_content_schemas/File.schema.json + relations: # <- + datasets: + to: Dataset + cardinality: many_to_many + Dataset: + id: + from_content: alias + content: ../../../example_content_schemas/Dataset.schema.json + relations: + files: + to: File + cardinality: many_to_many + Sample: + id: + from_content: alias + content: ../../../example_content_schemas/Sample.schema.json + relations: + files: + to: File + cardinality: one_to_many + Experiment: + id: + from_content: alias + content: ../../../example_content_schemas/Experiment.schema.json + relations: + samples: + to: Sample + cardinality: one_to_many diff --git a/tests/schemapack_/fixtures/models.py b/tests/schemapack_/fixtures/models.py index dd9b514d..ccb49ef8 100644 --- a/tests/schemapack_/fixtures/models.py +++ b/tests/schemapack_/fixtures/models.py @@ -22,7 +22,6 @@ from tests.schemapack_.fixtures.utils import BASE_DIR EXAMPLE_MODEL_DIR = BASE_DIR / "example_models" -VALID_MINIMAL_MODEL_EXAMPLE_PATH = EXAMPLE_MODEL_DIR / "minimal_model.yaml" def _get_example_model(name: str) -> SchemaPack: @@ -31,5 +30,6 @@ def _get_example_model(name: str) -> SchemaPack: return load_schemapack(EXAMPLE_MODEL_DIR / f"{name}.schemapack.yaml") -VALID_MINIMAL_MODEL = _get_example_model("minimal") -VALID_MODELS = [VALID_MINIMAL_MODEL] +MINIMAL_MODEL = _get_example_model("minimal") +ADVANCED_MODEL = _get_example_model("advanced") +VALID_MODELS = [MINIMAL_MODEL] diff --git a/tests/schemapack_/fixtures/transformations.py 
b/tests/schemapack_/fixtures/transformations.py new file mode 100644 index 00000000..38fd1d57 --- /dev/null +++ b/tests/schemapack_/fixtures/transformations.py @@ -0,0 +1,115 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Transformation test cases.""" + +from dataclasses import dataclass + +from pydantic import BaseModel +from schemapack.load import load_datapack, load_schemapack +from schemapack.spec.datapack import DataPack +from schemapack.spec.schemapack import SchemaPack + +from metldata.schemapack_.builtin_transformations.infer_relations import ( + RELATION_INFERENCE_TRANSFORMATION, +) +from metldata.schemapack_.transform.base import TransformationDefinition +from tests.schemapack_.fixtures.data import ADVANCED_DATA +from tests.schemapack_.fixtures.models import ADVANCED_MODEL +from tests.schemapack_.fixtures.utils import BASE_DIR, read_yaml + +EXAMPLE_TRANSFORMATION_DIR = BASE_DIR / "example_transformations" + +TRANSFORMATIONS_BY_NAME: dict[str, TransformationDefinition] = { + "infer_relations": RELATION_INFERENCE_TRANSFORMATION, +} + + +@dataclass(frozen=True) +class TransformationTestCase: + """A test case for a transformation.""" + + transformation_name: str + case_name: str + transformation_definition: TransformationDefinition + config: BaseModel + input_model: SchemaPack + input_data: DataPack + 
transformed_model: SchemaPack + transformed_data: DataPack + + def __str__(self) -> str: # noqa: D105 + return f"{self.transformation_name}-{self.case_name}" + + +def _read_test_case( + *, + transformation_name: str, + case_name: str, +) -> TransformationTestCase: + """Read a test case for a transformation.""" + + transformation_definition = TRANSFORMATIONS_BY_NAME[transformation_name] + + case_dir = EXAMPLE_TRANSFORMATION_DIR / transformation_name / case_name + config_path = case_dir / "config.yaml" + input_model_path = case_dir / "input.schemapack.yaml" + input_data_path = case_dir / "input.datapack.yaml" + transformed_model_path = case_dir / "transformed.schemapack.yaml" + transformed_data_path = case_dir / "transformed.datapack.yaml" + + input_model = ( + load_schemapack(input_model_path) + if input_model_path.exists() + else ADVANCED_MODEL + ) + input_data = ( + load_datapack(input_data_path) if input_data_path.exists() else ADVANCED_DATA + ) + transformed_model = load_schemapack(transformed_model_path) + transformed_data = load_datapack(transformed_data_path) + config = transformation_definition.config_cls(**read_yaml(config_path)) + + return TransformationTestCase( + transformation_name=transformation_name, + case_name=case_name, + transformation_definition=transformation_definition, + config=config, + input_model=input_model, + input_data=input_data, + transformed_model=transformed_model, + transformed_data=transformed_data, + ) + + +def _read_all_test_cases() -> list[TransformationTestCase]: + """Read all test cases for a transformation.""" + + return [ + _read_test_case( + transformation_name=transformation_name, + case_name=case_name, + ) + for transformation_name in TRANSFORMATIONS_BY_NAME + for case_name in [ + path.name + for path in (EXAMPLE_TRANSFORMATION_DIR / transformation_name).iterdir() + if path.is_dir() + ] + ] + + +TRANSFORMATION_TEST_CASES = _read_all_test_cases() diff --git a/tests/schemapack_/transform/test_handling.py 
b/tests/schemapack_/transform/test_handling.py index e4248df2..de071971 100644 --- a/tests/schemapack_/transform/test_handling.py +++ b/tests/schemapack_/transform/test_handling.py @@ -18,7 +18,6 @@ with builtin transformations are tested here.""" import pytest -import schemapack.exceptions from schemapack.spec.datapack import DataPack from schemapack.spec.schemapack import SchemaPack @@ -33,11 +32,13 @@ WorkflowStep, ) from metldata.schemapack_.transform.handling import ( + PostTransformValidationError, + PreTransformValidationError, TransformationHandler, WorkflowHandler, ) -from tests.schemapack_.fixtures.data import INVALID_MINIMAL_DATA, VALID_MINIMAL_DATA -from tests.schemapack_.fixtures.models import VALID_MINIMAL_MODEL +from tests.schemapack_.fixtures.data import INVALID_MINIMAL_DATA, MINIMAL_DATA +from tests.schemapack_.fixtures.models import MINIMAL_MODEL def test_transformation_handler_happy(): @@ -46,16 +47,16 @@ def test_transformation_handler_happy(): transformation_handler = TransformationHandler( transformation_definition=NULL_TRANSFORMATION, transformation_config=NullConfig(), - original_model=VALID_MINIMAL_MODEL, + input_model=MINIMAL_MODEL, ) # Since the null transformation was used, compare with the input: - assert transformation_handler.transformed_model == VALID_MINIMAL_MODEL + assert transformation_handler.transformed_model == MINIMAL_MODEL - transformed_data = transformation_handler.transform_data(VALID_MINIMAL_DATA) + transformed_data = transformation_handler.transform_data(MINIMAL_DATA) # Since the null transformation was used, compare with the input: - assert transformed_data == VALID_MINIMAL_DATA + assert transformed_data == MINIMAL_DATA def test_transformation_handler_assumption_error(): @@ -77,7 +78,7 @@ def always_failing_assumptions(model: SchemaPack, config: NullConfig): _ = TransformationHandler( transformation_definition=transformation, transformation_config=NullConfig(), - original_model=VALID_MINIMAL_MODEL, + 
input_model=MINIMAL_MODEL, ) @@ -85,7 +86,7 @@ def test_transformation_handler_model_transformation_error(): """Test using the TransformationHandling when model transformation fails.""" # make transformation definition always raise an ModelAssumptionError: - def always_failing_transformation(original_model: SchemaPack, config: NullConfig): + def always_failing_transformation(input_model: SchemaPack, config: NullConfig): """A function that always raises a ModelTransformationError.""" raise ModelTransformationError @@ -99,7 +100,7 @@ def always_failing_transformation(original_model: SchemaPack, config: NullConfig _ = TransformationHandler( transformation_definition=transformation, transformation_config=NullConfig(), - original_model=VALID_MINIMAL_MODEL, + input_model=MINIMAL_MODEL, ) @@ -110,10 +111,10 @@ def test_transformation_handler_input_data_invalid(): transformation_handler = TransformationHandler( transformation_definition=NULL_TRANSFORMATION, transformation_config=NullConfig(), - original_model=VALID_MINIMAL_MODEL, + input_model=MINIMAL_MODEL, ) - with pytest.raises(schemapack.exceptions.ValidationError): + with pytest.raises(PreTransformValidationError): _ = transformation_handler.transform_data(INVALID_MINIMAL_DATA) @@ -146,11 +147,11 @@ def transform(self, data: DataPack) -> DataPack: transformation_handler = TransformationHandler( transformation_definition=transformation, transformation_config=NullConfig(), - original_model=VALID_MINIMAL_MODEL, + input_model=MINIMAL_MODEL, ) - with pytest.raises(schemapack.exceptions.ValidationError): - _ = transformation_handler.transform_data(VALID_MINIMAL_DATA) + with pytest.raises(PostTransformValidationError): + _ = transformation_handler.transform_data(MINIMAL_DATA) def test_workflow_handler_happy(): @@ -180,10 +181,10 @@ def test_workflow_handler_happy(): workflow_config=null_workflow.config_cls.model_validate( {"step1": {}, "step2": {}} ), - original_model=VALID_MINIMAL_MODEL, + input_model=MINIMAL_MODEL, ) - 
artifacts = workflow_handler.run(data=VALID_MINIMAL_DATA) + artifacts = workflow_handler.run(data=MINIMAL_DATA) # Since a null workflow was used, compare to the input: - assert artifacts["step1_output"] == artifacts["step2_output"] == VALID_MINIMAL_DATA + assert artifacts["step1_output"] == artifacts["step2_output"] == MINIMAL_DATA