From 01db23a1790531e496e4d1918ae20a99d540a50d Mon Sep 17 00:00:00 2001 From: sbilge Date: Tue, 16 Jul 2024 07:52:09 +0000 Subject: [PATCH 01/15] template files for count references transformation --- .../path/__init__.py | 0 .../{infer_relations => common}/path/path.py | 2 +- .../path/path_elements.py | 0 .../path/path_str.py | 2 +- .../count_references/__init__.py | 20 ++++ .../count_references/assumptions.py | 63 ++++++++++++ .../count_references/config.py | 32 ++++++ .../count_references/data_transform.py | 37 +++++++ .../count_references/main.py | 77 +++++++++++++++ .../count_references/model_transform.py | 38 +++++++ .../infer_relations/assumptions.py | 4 +- .../infer_relations/relations.py | 2 +- .../count_references/multiple/config.yaml | 31 ++++++ .../multiple/transformed.datapack.yaml | 58 +++++++++++ .../multiple/transformed.schemapack.yaml | 98 +++++++++++++++++++ tests/fixtures/transformations.py | 4 + 16 files changed, 463 insertions(+), 5 deletions(-) rename src/metldata/builtin_transformations/{infer_relations => common}/path/__init__.py (100%) rename src/metldata/builtin_transformations/{infer_relations => common}/path/path.py (98%) rename src/metldata/builtin_transformations/{infer_relations => common}/path/path_elements.py (100%) rename src/metldata/builtin_transformations/{infer_relations => common}/path/path_str.py (98%) create mode 100644 src/metldata/builtin_transformations/count_references/__init__.py create mode 100644 src/metldata/builtin_transformations/count_references/assumptions.py create mode 100644 src/metldata/builtin_transformations/count_references/config.py create mode 100644 src/metldata/builtin_transformations/count_references/data_transform.py create mode 100644 src/metldata/builtin_transformations/count_references/main.py create mode 100644 src/metldata/builtin_transformations/count_references/model_transform.py create mode 100644 tests/fixtures/example_transformations/count_references/multiple/config.yaml create mode 100644 
tests/fixtures/example_transformations/count_references/multiple/transformed.datapack.yaml create mode 100644 tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml diff --git a/src/metldata/builtin_transformations/infer_relations/path/__init__.py b/src/metldata/builtin_transformations/common/path/__init__.py similarity index 100% rename from src/metldata/builtin_transformations/infer_relations/path/__init__.py rename to src/metldata/builtin_transformations/common/path/__init__.py diff --git a/src/metldata/builtin_transformations/infer_relations/path/path.py b/src/metldata/builtin_transformations/common/path/path.py similarity index 98% rename from src/metldata/builtin_transformations/infer_relations/path/path.py rename to src/metldata/builtin_transformations/common/path/path.py index f6ac3cf..fefba31 100644 --- a/src/metldata/builtin_transformations/infer_relations/path/path.py +++ b/src/metldata/builtin_transformations/common/path/path.py @@ -18,7 +18,7 @@ from pydantic import GetJsonSchemaHandler, ValidationInfo -from metldata.builtin_transformations.infer_relations.path.path_str import ( +from metldata.builtin_transformations.common.path.path_str import ( PATH_PATTERN, ValidationError, clean_path_str, diff --git a/src/metldata/builtin_transformations/infer_relations/path/path_elements.py b/src/metldata/builtin_transformations/common/path/path_elements.py similarity index 100% rename from src/metldata/builtin_transformations/infer_relations/path/path_elements.py rename to src/metldata/builtin_transformations/common/path/path_elements.py diff --git a/src/metldata/builtin_transformations/infer_relations/path/path_str.py b/src/metldata/builtin_transformations/common/path/path_str.py similarity index 98% rename from src/metldata/builtin_transformations/infer_relations/path/path_str.py rename to src/metldata/builtin_transformations/common/path/path_str.py index 6d3e548..8cf26ec 100644 --- 
a/src/metldata/builtin_transformations/infer_relations/path/path_str.py +++ b/src/metldata/builtin_transformations/common/path/path_str.py @@ -18,7 +18,7 @@ import re -from metldata.builtin_transformations.infer_relations.path.path_elements import ( +from metldata.builtin_transformations.common.path.path_elements import ( RelationPathElement, RelationPathElementType, ) diff --git a/src/metldata/builtin_transformations/count_references/__init__.py b/src/metldata/builtin_transformations/count_references/__init__.py new file mode 100644 index 0000000..ab70fc1 --- /dev/null +++ b/src/metldata/builtin_transformations/count_references/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""A transformation to count the references.""" + +# shortcuts: +from metldata.builtin_transformations.count_references.main import ( # noqa: F401 + COUNT_REFERENCES_TRANSFORMATION, +) diff --git a/src/metldata/builtin_transformations/count_references/assumptions.py b/src/metldata/builtin_transformations/count_references/assumptions.py new file mode 100644 index 0000000..e5eb974 --- /dev/null +++ b/src/metldata/builtin_transformations/count_references/assumptions.py @@ -0,0 +1,63 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"Assumptions for count references transformation" + +from typing import Any + +from schemapack.spec.schemapack import SchemaPack + +from metldata.builtin_transformations.common.path.path import RelationPath +from metldata.builtin_transformations.common.path.path_elements import ( + RelationPathElementType, +) +from metldata.transform.base import ModelAssumptionError + + +def assert_path_classes_and_relations_exist(model: SchemaPack, path: RelationPath): + """Make sure that all classes and relations defined in the provided path exist in + the provided model. + + Raises: + ModelAssumptionError: + if the model does not fulfill the assumptions. 
+ """ + for path_element in path.elements: + if path_element.source not in model.classes: + raise ModelAssumptionError( + f"Class {path_element.source} not found in model." + ) + + if path_element.target not in model.classes: + raise ModelAssumptionError( + f"Class {path_element.target} not found in model." + ) + + if path_element.type_ == RelationPathElementType.ACTIVE: + if ( + path_element.property + not in model.classes[path_element.source].relations + ): + raise ModelAssumptionError( + f"Relation property {path_element.property} not found in class" + f" {path_element.source}." + ) + + return + + +def check_model_assumptions(schema: SchemaPack, instructions_by_class: Any) -> None: + """Check the model assumptions for the count references transformation.""" + return None diff --git a/src/metldata/builtin_transformations/count_references/config.py b/src/metldata/builtin_transformations/count_references/config.py new file mode 100644 index 0000000..4aec28e --- /dev/null +++ b/src/metldata/builtin_transformations/count_references/config.py @@ -0,0 +1,32 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Models used to describe count content properties that shall be calculated and added.""" + +from typing import Any + +from pydantic import Field +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class CountReferencesConfig(BaseSettings): + """Config containing content properties to be deleted from models and data.""" + + model_config = SettingsConfigDict(extra="forbid") + + count_references: Any = Field( + ..., + description=("description"), + examples=[], + ) diff --git a/src/metldata/builtin_transformations/count_references/data_transform.py b/src/metldata/builtin_transformations/count_references/data_transform.py new file mode 100644 index 0000000..106dfe4 --- /dev/null +++ b/src/metldata/builtin_transformations/count_references/data_transform.py @@ -0,0 +1,37 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Logic for transforming data.""" + +from schemapack.spec.datapack import DataPack + +# from metldata.transform.base import EvitableTransformationError + + +def count_references(*, data: DataPack) -> DataPack: + """Count + + Args: + data: + + + Returns: + The data with + """ + modified_data = data.model_copy(deep=True) + + # TODO modifications + + return modified_data diff --git a/src/metldata/builtin_transformations/count_references/main.py b/src/metldata/builtin_transformations/count_references/main.py new file mode 100644 index 0000000..4445a40 --- /dev/null +++ b/src/metldata/builtin_transformations/count_references/main.py @@ -0,0 +1,77 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""A transformation to count references.""" + +from schemapack.spec.datapack import DataPack +from schemapack.spec.schemapack import SchemaPack + +from metldata.builtin_transformations.count_references.assumptions import ( + check_model_assumptions, +) +from metldata.builtin_transformations.count_references.config import ( + CountReferencesConfig, +) +from metldata.builtin_transformations.count_references.data_transform import ( + count_references, +) +from metldata.builtin_transformations.count_references.model_transform import ( + add_count_references, +) +from metldata.transform.base import DataTransformer, TransformationDefinition + + +class CountReferencesTransformer(DataTransformer[CountReferencesConfig]): + """A transformer that counts the references and adds them to content properties.""" + + def transform(self, data: DataPack) -> DataPack: + """Transforms data. + + Args: + data: The data as DataPack to be transformed. + """ + return count_references(data=data) + + +def check_model_assumptions_wrapper( + model: SchemaPack, config: CountReferencesConfig +) -> None: + """Check the assumptions of the model. + + Raises: + ModelAssumptionError: + if the model does not fulfill the assumptions. + """ + check_model_assumptions(schema=model, instructions_by_class=config.count_references) + + +def transform_model(model: SchemaPack, config: CountReferencesConfig) -> SchemaPack: + """Transform the data model. + + Raises: + DataModelTransformationError: + if the transformation fails. 
+ """ + return add_count_references( + model=model, instructions_by_class=config.count_references + ) + + +COUNT_REFERENCES_TRANSFORMATION = TransformationDefinition[CountReferencesConfig]( + config_cls=CountReferencesConfig, + check_model_assumptions=check_model_assumptions_wrapper, + transform_model=transform_model, + data_transformer_factory=CountReferencesTransformer, +) diff --git a/src/metldata/builtin_transformations/count_references/model_transform.py b/src/metldata/builtin_transformations/count_references/model_transform.py new file mode 100644 index 0000000..e0db6be --- /dev/null +++ b/src/metldata/builtin_transformations/count_references/model_transform.py @@ -0,0 +1,38 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Model transformation logic for the 'count references' transformation""" + +from schemapack.spec.schemapack import ( + # ClassDefinition, + SchemaPack, +) + +# from metldata.transform.base import EvitableTransformationError + + +def add_count_references( + *, model: SchemaPack, instructions_by_class: dict[str, list[str]] +) -> SchemaPack: + """Delete content properties from a model. 
+ + Args: + model: + The model based on SchemaPack to + Returns: + The model with the + """ + # TODO model transform logic for count references + + return model diff --git a/src/metldata/builtin_transformations/infer_relations/assumptions.py b/src/metldata/builtin_transformations/infer_relations/assumptions.py index fa4836e..a933cbd 100644 --- a/src/metldata/builtin_transformations/infer_relations/assumptions.py +++ b/src/metldata/builtin_transformations/infer_relations/assumptions.py @@ -18,10 +18,10 @@ from schemapack.spec.schemapack import SchemaPack -from metldata.builtin_transformations.infer_relations.path.path import ( +from metldata.builtin_transformations.common.path.path import ( RelationPath, ) -from metldata.builtin_transformations.infer_relations.path.path_elements import ( +from metldata.builtin_transformations.common.path.path_elements import ( RelationPathElementType, ) from metldata.builtin_transformations.infer_relations.relations import ( diff --git a/src/metldata/builtin_transformations/infer_relations/relations.py b/src/metldata/builtin_transformations/infer_relations/relations.py index 962089b..abff442 100644 --- a/src/metldata/builtin_transformations/infer_relations/relations.py +++ b/src/metldata/builtin_transformations/infer_relations/relations.py @@ -18,7 +18,7 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator -from metldata.builtin_transformations.infer_relations.path.path import ( +from metldata.builtin_transformations.common.path.path import ( RelationPath, ) diff --git a/tests/fixtures/example_transformations/count_references/multiple/config.yaml b/tests/fixtures/example_transformations/count_references/multiple/config.yaml new file mode 100644 index 0000000..bfdf28f --- /dev/null +++ b/tests/fixtures/example_transformations/count_references/multiple/config.yaml @@ -0,0 +1,31 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +count_references: +- class_name: Dataset + target_content: + object_path: "file_summary" + property_name: "count" + source_relation_path: "Dataset(files)>File" +- class_name: Sample + target_content: + object_path: "file_summary" + property_name: "count" + source_relation_path: "Sample(files)>File" +- class_name: Experiment + target_content: + object_path: "sample_summary" + property_name: "count" + source_relation_path: "Experiment(samples)>Sample" diff --git a/tests/fixtures/example_transformations/count_references/multiple/transformed.datapack.yaml b/tests/fixtures/example_transformations/count_references/multiple/transformed.datapack.yaml new file mode 100644 index 0000000..09aead8 --- /dev/null +++ b/tests/fixtures/example_transformations/count_references/multiple/transformed.datapack.yaml @@ -0,0 +1,58 @@ +datapack: 0.3.0 +resources: + File: + file_a: + content: + filename: file_a.fastq + format: FASTQ + checksum: 1a5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92b8 + size: 12321 + file_b: + content: + filename: file_b.fastq + format: FASTQ + checksum: 2b5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92c9 + size: 12314 + file_c: + content: + filename: file_c.fastq + format: FASTQ + checksum: a9c24870071da03f78515e6197048f3a2172e90e597e9250cd01a0cb8f0986ed + size: 12123 + Dataset: + dataset_1: + content: + dac_contact: dac@example.org + file_summary: # <- + count: 3 + relations: + files: + - 
file_a + - file_b + - file_c + Sample: + sample_x: + content: + description: Some sample. + file_summary: # <- + count: 2 + relations: + files: + - file_a + - file_b + sample_y: + content: + file_summary: # <- + count: 1 + relations: + files: + - file_c + Experiment: + experiment_i: + content: + sample_summary: # <- + count: 2 + relations: + samples: + - sample_x + - sample_y diff --git a/tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml b/tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml new file mode 100644 index 0000000..9aa8c86 --- /dev/null +++ b/tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml @@ -0,0 +1,98 @@ +# a simple schemapack: +schemapack: 0.3.0 +classes: + File: + id: + propertyName: alias + content: ../example_content_schemas/File.schema.json + Dataset: + id: + propertyName: alias + content: + { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "A dataset that is a collection of files.", + "properties": + { + "dac_contact": { "type": "string" }, + "file_summary": + { + "type": "object", + "additionalProperties": false, + "properties": { "count": { "type": "integer" } }, + "required": ["count"], + }, + }, + "type": "object", + } + relations: + files: + targetClass: File + multiple: + origin: true + target: true + mandatory: + origin: false + target: true + Sample: + id: + propertyName: alias + content: + { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "A sample used to generate files in the context of an experiment.", + "properties": + { + "description": { "type": "string" }, + "file_summary": + { + "type": "object", + "additionalProperties": false, + "properties": { "count": { "type": "integer" } }, + "required": ["count"], + }, + "type": "object", + }, + } + relations: + files: + targetClass: File + 
multiple: + origin: false + target: true + mandatory: + origin: false + target: true + Experiment: + id: + propertyName: alias + content: + { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "An experiment containing one or multiple samples.", + "properties": + { + "description": { "type": "string" }, + "sample_summary": + { + "type": "object", + "additionalProperties": false, + "properties": { "count": { "type": "integer" } }, + "required": ["count"], + }, + "type": "object", + }, + "type": "object", + } + relations: + samples: + targetClass: Sample + multiple: + origin: false + target: true + mandatory: + origin: true + target: true diff --git a/tests/fixtures/transformations.py b/tests/fixtures/transformations.py index c5a0eb8..51c7dc2 100644 --- a/tests/fixtures/transformations.py +++ b/tests/fixtures/transformations.py @@ -26,6 +26,9 @@ from metldata.builtin_transformations.add_content_properties import ( ADD_CONTENT_PROPERTIES_TRANSFORMATION, ) +from metldata.builtin_transformations.count_references import ( + COUNT_REFERENCES_TRANSFORMATION, +) from metldata.builtin_transformations.delete_properties import ( PROPERTY_DELETION_TRANSFORMATION, ) @@ -43,6 +46,7 @@ "infer_relations": RELATION_INFERENCE_TRANSFORMATION, "delete_properties": PROPERTY_DELETION_TRANSFORMATION, "add_content_properties": ADD_CONTENT_PROPERTIES_TRANSFORMATION, + "count_references": COUNT_REFERENCES_TRANSFORMATION, } From 6d728f7ebb1ac2f8b2a5d0fea7032b4b6f9915bf Mon Sep 17 00:00:00 2001 From: sbilge Date: Tue, 16 Jul 2024 11:31:17 +0000 Subject: [PATCH 02/15] count references --- .../count_references/assumptions.py | 36 +++++++++++ .../count_references/instruction.py | 59 +++++++++++++++++++ .../{infer_relations => common}/__init__.py | 0 .../path/__init__.py | 0 .../path/test_config.py | 6 +- .../path/test_path.py | 4 +- .../path/test_path_str.py | 4 +- 7 files changed, 102 insertions(+), 7 deletions(-) create mode 100644 
src/metldata/builtin_transformations/count_references/instruction.py rename tests/builtin_transformations/{infer_relations => common}/__init__.py (100%) rename tests/builtin_transformations/{infer_relations => common}/path/__init__.py (100%) rename tests/builtin_transformations/{infer_relations => common}/path/test_config.py (96%) rename tests/builtin_transformations/{infer_relations => common}/path/test_path.py (96%) rename tests/builtin_transformations/{infer_relations => common}/path/test_path_str.py (98%) diff --git a/src/metldata/builtin_transformations/count_references/assumptions.py b/src/metldata/builtin_transformations/count_references/assumptions.py index e5eb974..df24970 100644 --- a/src/metldata/builtin_transformations/count_references/assumptions.py +++ b/src/metldata/builtin_transformations/count_references/assumptions.py @@ -23,6 +23,9 @@ from metldata.builtin_transformations.common.path.path_elements import ( RelationPathElementType, ) +from metldata.builtin_transformations.count_references.instruction import ( + AddCountPropertyInstruction, +) from metldata.transform.base import ModelAssumptionError @@ -58,6 +61,39 @@ def assert_path_classes_and_relations_exist(model: SchemaPack, path: RelationPat return +def assert_new_property_not_exists( + schema: SchemaPack, + instructions_by_class: dict[str, list[AddCountPropertyInstruction]], +) -> None: + """Check the model assumptions for the add content properties transformation.""" + # the existence of the class getting the new property is already checked in the previous assumption. + for class_name, instructions in instructions_by_class.items(): + # class_def = schema.classes.get(class_name) + + # # Check if the class exists in the model + # if not class_def: + # raise ModelAssumptionError( + # f"Class {class_name} does not exist in the model." 
+ # ) + + for instruction in instructions: + # Check if the property already exists in the target schema + try: + target_schema = resolve_schema_object_path( + json_schema=class_def.content.json_schema_dict, + path=instruction.target_content.object_path, + ) + except KeyError: + continue + if instruction.target_content.property_name in target_schema.get( + "properties", {} + ): + raise ModelAssumptionError( + f"Property {instruction.target_content.property_name} already exists" + + f" in class {class_name}." + ) + + def check_model_assumptions(schema: SchemaPack, instructions_by_class: Any) -> None: """Check the model assumptions for the count references transformation.""" return None diff --git a/src/metldata/builtin_transformations/count_references/instruction.py b/src/metldata/builtin_transformations/count_references/instruction.py new file mode 100644 index 0000000..4456dbb --- /dev/null +++ b/src/metldata/builtin_transformations/count_references/instruction.py @@ -0,0 +1,59 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Models for instructions used in the 'add content properties' transformation.""" + +from typing import Any, Final + +from pydantic import Field +from pydantic_settings import BaseSettings + +from metldata.builtin_transformations.common import NewContentSchemaPath + +DEFAULT_REFERENCE_COUNT_SCHEMA: Final[dict[str, Any]] = { + "type": "object", + "additionalProperties": False, + "properties": {"count": {"type": "integer"}}, + "required": ["count"], +} + + +class AddCountPropertyInstruction(BaseSettings): + """A model describing an instruction to + TODO reference_count_schema don't have to be exposed + """ + + class_name: str = Field(..., description="The name of the class to modify.") + + target_content_path: NewContentSchemaPath = Field( + ..., + description="A NewContentSchemaPath that describes a path to an already" + + " existing object within the content schema and the name of a property to be" + + " added to that object's schema", + ) + + required: bool = Field( + True, + description=( + "Indicates whether the newly added property shall be added to the" + + " 'required' list of the corresponding object. Defaults to 'True'." 
+ ), + ) + + reference_count_schema: dict[str, Any] = Field( + DEFAULT_REFERENCE_COUNT_SCHEMA, + description="The JSON schema of the newly added property.", + ) + + # value: Any = Field({}, description="Value needs to be calculated") diff --git a/tests/builtin_transformations/infer_relations/__init__.py b/tests/builtin_transformations/common/__init__.py similarity index 100% rename from tests/builtin_transformations/infer_relations/__init__.py rename to tests/builtin_transformations/common/__init__.py diff --git a/tests/builtin_transformations/infer_relations/path/__init__.py b/tests/builtin_transformations/common/path/__init__.py similarity index 100% rename from tests/builtin_transformations/infer_relations/path/__init__.py rename to tests/builtin_transformations/common/path/__init__.py diff --git a/tests/builtin_transformations/infer_relations/path/test_config.py b/tests/builtin_transformations/common/path/test_config.py similarity index 96% rename from tests/builtin_transformations/infer_relations/path/test_config.py rename to tests/builtin_transformations/common/path/test_config.py index 803332a..4578dc3 100644 --- a/tests/builtin_transformations/infer_relations/path/test_config.py +++ b/tests/builtin_transformations/common/path/test_config.py @@ -16,12 +16,12 @@ """Test relations utils.""" +from metldata.builtin_transformations.common.path.path import ( + RelationPath, +) from metldata.builtin_transformations.infer_relations.config import ( RelationInferenceConfig, ) -from metldata.builtin_transformations.infer_relations.path.path import ( - RelationPath, -) from metldata.builtin_transformations.infer_relations.relations import ( InferenceInstruction, ) diff --git a/tests/builtin_transformations/infer_relations/path/test_path.py b/tests/builtin_transformations/common/path/test_path.py similarity index 96% rename from tests/builtin_transformations/infer_relations/path/test_path.py rename to tests/builtin_transformations/common/path/test_path.py index 
0dce021..fbfa1d5 100644 --- a/tests/builtin_transformations/infer_relations/path/test_path.py +++ b/tests/builtin_transformations/common/path/test_path.py @@ -21,10 +21,10 @@ import pytest from pydantic import BaseModel -from metldata.builtin_transformations.infer_relations.path.path import ( +from metldata.builtin_transformations.common.path.path import ( RelationPath, ) -from metldata.builtin_transformations.infer_relations.path.path_elements import ( +from metldata.builtin_transformations.common.path.path_elements import ( RelationPathElement, RelationPathElementType, ) diff --git a/tests/builtin_transformations/infer_relations/path/test_path_str.py b/tests/builtin_transformations/common/path/test_path_str.py similarity index 98% rename from tests/builtin_transformations/infer_relations/path/test_path_str.py rename to tests/builtin_transformations/common/path/test_path_str.py index 3cb06ee..5baf13c 100644 --- a/tests/builtin_transformations/infer_relations/path/test_path_str.py +++ b/tests/builtin_transformations/common/path/test_path_str.py @@ -20,11 +20,11 @@ import pytest -from metldata.builtin_transformations.infer_relations.path.path_elements import ( +from metldata.builtin_transformations.common.path.path_elements import ( RelationPathElement, RelationPathElementType, ) -from metldata.builtin_transformations.infer_relations.path.path_str import ( +from metldata.builtin_transformations.common.path.path_str import ( ValidationError, extract_first_element, get_element_components, From a09db581a63dfbcc7fec13e0bc600b426c9da85a Mon Sep 17 00:00:00 2001 From: sbilge Date: Tue, 16 Jul 2024 13:45:21 +0000 Subject: [PATCH 03/15] count references assumptions implemented --- .../count_references/assumptions.py | 103 ++++++++++-------- .../count_references/config.py | 8 +- .../count_references/instruction.py | 35 ++---- .../count_references/main.py | 2 +- 4 files changed, 72 insertions(+), 76 deletions(-) diff --git 
a/src/metldata/builtin_transformations/count_references/assumptions.py b/src/metldata/builtin_transformations/count_references/assumptions.py index df24970..88de597 100644 --- a/src/metldata/builtin_transformations/count_references/assumptions.py +++ b/src/metldata/builtin_transformations/count_references/assumptions.py @@ -19,16 +19,29 @@ from schemapack.spec.schemapack import SchemaPack +from metldata.builtin_transformations.add_content_properties.path import ( + resolve_schema_object_path, +) from metldata.builtin_transformations.common.path.path import RelationPath from metldata.builtin_transformations.common.path.path_elements import ( RelationPathElementType, ) from metldata.builtin_transformations.count_references.instruction import ( - AddCountPropertyInstruction, + AddReferenceCountPropertyInstruction, ) from metldata.transform.base import ModelAssumptionError +def assert_class_is_source( + model: SchemaPack, instruction: AddReferenceCountPropertyInstruction +): + """Make sure that the source class is the one being modified with the count property""" + if instruction.class_name != instruction.source_relation_path.source: + raise ModelAssumptionError( + f"Class {instruction.class_name} does not correspond to the relation source {instruction.source_relation_path.source}." + ) + + def assert_path_classes_and_relations_exist(model: SchemaPack, path: RelationPath): """Make sure that all classes and relations defined in the provided path exist in the provided model. @@ -48,52 +61,52 @@ def assert_path_classes_and_relations_exist(model: SchemaPack, path: RelationPat f"Class {path_element.target} not found in model." ) - if path_element.type_ == RelationPathElementType.ACTIVE: - if ( - path_element.property - not in model.classes[path_element.source].relations - ): - raise ModelAssumptionError( - f"Relation property {path_element.property} not found in class" - f" {path_element.source}." 
- ) - - return + if path_element.type_ == RelationPathElementType.ACTIVE and ( + path_element.property not in model.classes[path_element.source].relations + ): + raise ModelAssumptionError( + f"Relation property {path_element.property} not found in class" + f" {path_element.source}." + ) -def assert_new_property_not_exists( +def assert_summary_exists( schema: SchemaPack, - instructions_by_class: dict[str, list[AddCountPropertyInstruction]], + instruction: AddReferenceCountPropertyInstruction, +) -> None: + """TODO.""" + class_name = instruction.class_name + class_def = schema.classes.get(class_name) + + # Check if the class exists in the model + if not class_def: + raise ModelAssumptionError(f"Class {class_name} does not exist in the model.") + + # Check if the object_path already exists in the model + try: + target_schema = resolve_schema_object_path( + json_schema=class_def.content.json_schema_dict, + path=instruction.target_content.object_path, + ) + except KeyError as err: + raise ModelAssumptionError( + f"Object path {instruction.target_content.object_path} does not exist" + + f" in class {class_name}." + ) from err + if instruction.target_content.property_name in target_schema.get("properties", {}): + raise ModelAssumptionError( + f"Property {instruction.target_content.property_name} already exists" + + f" in class {class_name}." + ) + + +def check_model_assumptions( + schema: SchemaPack, instructions: list[AddReferenceCountPropertyInstruction] ) -> None: - """Check the model assumptions for the add content properties transformation.""" - # the existence of the class getting the new property is already checked in the previous assumption. - for class_name, instructions in instructions_by_class.items(): - # class_def = schema.classes.get(class_name) - - # # Check if the class exists in the model - # if not class_def: - # raise ModelAssumptionError( - # f"Class {class_name} does not exist in the model." 
- # ) - - for instruction in instructions: - # Check if the property already exists in the target schema - try: - target_schema = resolve_schema_object_path( - json_schema=class_def.content.json_schema_dict, - path=instruction.target_content.object_path, - ) - except KeyError: - continue - if instruction.target_content.property_name in target_schema.get( - "properties", {} - ): - raise ModelAssumptionError( - f"Property {instruction.target_content.property_name} already exists" - + f" in class {class_name}." - ) - - -def check_model_assumptions(schema: SchemaPack, instructions_by_class: Any) -> None: """Check the model assumptions for the count references transformation.""" - return None + for instruction in instructions: + assert_class_is_source(schema, instruction) + assert_path_classes_and_relations_exist( + schema, instruction.source_relation_path + ) + assert_summary_exists(schema, instruction) diff --git a/src/metldata/builtin_transformations/count_references/config.py b/src/metldata/builtin_transformations/count_references/config.py index 4aec28e..a2c6639 100644 --- a/src/metldata/builtin_transformations/count_references/config.py +++ b/src/metldata/builtin_transformations/count_references/config.py @@ -19,14 +19,18 @@ from pydantic import Field from pydantic_settings import BaseSettings, SettingsConfigDict +from metldata.builtin_transformations.count_references.instruction import ( + AddReferenceCountPropertyInstruction, +) + class CountReferencesConfig(BaseSettings): """Config containing content properties to be deleted from models and data.""" model_config = SettingsConfigDict(extra="forbid") - count_references: Any = Field( + count_references: list[AddReferenceCountPropertyInstruction] = Field( ..., - description=("description"), + description=("Description TODO"), examples=[], ) diff --git a/src/metldata/builtin_transformations/count_references/instruction.py b/src/metldata/builtin_transformations/count_references/instruction.py index 4456dbb..3e81ed5 
100644 --- a/src/metldata/builtin_transformations/count_references/instruction.py +++ b/src/metldata/builtin_transformations/count_references/instruction.py @@ -14,46 +14,25 @@ # limitations under the License. """Models for instructions used in the 'add content properties' transformation.""" -from typing import Any, Final - from pydantic import Field from pydantic_settings import BaseSettings from metldata.builtin_transformations.common import NewContentSchemaPath - -DEFAULT_REFERENCE_COUNT_SCHEMA: Final[dict[str, Any]] = { - "type": "object", - "additionalProperties": False, - "properties": {"count": {"type": "integer"}}, - "required": ["count"], -} +from metldata.builtin_transformations.common.path.path import RelationPath -class AddCountPropertyInstruction(BaseSettings): - """A model describing an instruction to - TODO reference_count_schema don't have to be exposed - """ +class AddReferenceCountPropertyInstruction(BaseSettings): + """A model describing an instruction to""" class_name: str = Field(..., description="The name of the class to modify.") - target_content_path: NewContentSchemaPath = Field( + target_content: NewContentSchemaPath = Field( ..., description="A NewContentSchemaPath that describes a path to an already" + " existing object within the content schema and the name of a property to be" + " added to that object's schema", ) - - required: bool = Field( - True, - description=( - "Indicates whether the newly added property shall be added to the" - + " 'required' list of the corresponding object. Defaults to 'True'." 
- ), - ) - - reference_count_schema: dict[str, Any] = Field( - DEFAULT_REFERENCE_COUNT_SCHEMA, - description="The JSON schema of the newly added property.", + source_relation_path: RelationPath = Field( + ..., + description="The path describing the relation between the classes if a metadata model.", ) - - # value: Any = Field({}, description="Value needs to be calculated") diff --git a/src/metldata/builtin_transformations/count_references/main.py b/src/metldata/builtin_transformations/count_references/main.py index 4445a40..0e5cc3b 100644 --- a/src/metldata/builtin_transformations/count_references/main.py +++ b/src/metldata/builtin_transformations/count_references/main.py @@ -54,7 +54,7 @@ def check_model_assumptions_wrapper( ModelAssumptionError: if the model does not fulfill the assumptions. """ - check_model_assumptions(schema=model, instructions_by_class=config.count_references) + check_model_assumptions(schema=model, instructions=config.count_references) def transform_model(model: SchemaPack, config: CountReferencesConfig) -> SchemaPack: From 2acd6d319c6bebd0a219f68ca3855b0d8cb974f0 Mon Sep 17 00:00:00 2001 From: sbilge Date: Thu, 15 Aug 2024 13:35:55 +0000 Subject: [PATCH 04/15] model transformation fn added --- .../count_references/assumptions.py | 14 ++++--- .../count_references/model_transform.py | 42 +++++++++++++++++-- .../infer_relations/data_transform.py | 7 ++-- .../infer_relations/model_transform.py | 4 +- .../multiple/transformed.schemapack.yaml | 2 +- 5 files changed, 55 insertions(+), 14 deletions(-) diff --git a/src/metldata/builtin_transformations/count_references/assumptions.py b/src/metldata/builtin_transformations/count_references/assumptions.py index 88de597..903c742 100644 --- a/src/metldata/builtin_transformations/count_references/assumptions.py +++ b/src/metldata/builtin_transformations/count_references/assumptions.py @@ -38,7 +38,8 @@ def assert_class_is_source( """Make sure that the source class is the one being modified with the count 
property""" if instruction.class_name != instruction.source_relation_path.source: raise ModelAssumptionError( - f"Class {instruction.class_name} does not correspond to the relation source {instruction.source_relation_path.source}." + f"Class {instruction.class_name} does not correspond to the relation source { + instruction.source_relation_path.source}." ) @@ -74,13 +75,14 @@ def assert_summary_exists( schema: SchemaPack, instruction: AddReferenceCountPropertyInstruction, ) -> None: - """TODO.""" + """Make sure that the source class (the class being modified) and the object_path exists in the model.""" class_name = instruction.class_name class_def = schema.classes.get(class_name) # Check if the class exists in the model if not class_def: - raise ModelAssumptionError(f"Class {class_name} does not exist in the model.") + raise ModelAssumptionError( + f"Class {class_name} does not exist in the model.") # Check if the object_path already exists in the model try: @@ -90,12 +92,14 @@ def assert_summary_exists( ) except KeyError as err: raise ModelAssumptionError( - f"Object path {instruction.target_content.object_path} does not exist" + f"Object path { + instruction.target_content.object_path} does not exist" + f" in class {class_name}." ) from err if instruction.target_content.property_name in target_schema.get("properties", {}): raise ModelAssumptionError( - f"Property {instruction.target_content.property_name} already exists" + f"Property { + instruction.target_content.property_name} already exists" + f" in class {class_name}." ) diff --git a/src/metldata/builtin_transformations/count_references/model_transform.py b/src/metldata/builtin_transformations/count_references/model_transform.py index e0db6be..44064e3 100644 --- a/src/metldata/builtin_transformations/count_references/model_transform.py +++ b/src/metldata/builtin_transformations/count_references/model_transform.py @@ -14,18 +14,29 @@ # limitations under the License. 
"""Model transformation logic for the 'count references' transformation""" +from copy import deepcopy + from schemapack.spec.schemapack import ( - # ClassDefinition, + ClassDefinition, SchemaPack, ) +from metldata.builtin_transformations.add_content_properties.path import ( + resolve_schema_object_path, +) +from metldata.builtin_transformations.count_references.instruction import ( + AddReferenceCountPropertyInstruction, +) +from metldata.transform.base import EvitableTransformationError + # from metldata.transform.base import EvitableTransformationError def add_count_references( - *, model: SchemaPack, instructions_by_class: dict[str, list[str]] + *, model: SchemaPack, instructions_by_class: dict[str, list[AddReferenceCountPropertyInstruction]] ) -> SchemaPack: - """Delete content properties from a model. + """Add a new content property (target_content) to the class(es) subject to + transformation Args: model: @@ -34,5 +45,30 @@ def add_count_references( The model with the """ # TODO model transform logic for count references + updated_class_defs: dict[str, ClassDefinition] = {} + for class_name, cls_instructions in instructions_by_class.items(): + for class_name, cls_instructions in instructions_by_class.items(): + class_def = model.classes.get(class_name) + + if not class_def: + raise EvitableTransformationError() + + content_schema = class_def.content.json_schema_dict + + for cls_instruction in cls_instructions: + try: + target_object = resolve_schema_object_path( + content_schema, cls_instruction.target_content.object_path + ) + except KeyError as e: + raise EvitableTransformationError() from e + + if cls_instruction.target_content.property_name in content_schema.get( + "properties", {} + ): + raise EvitableTransformationError() + target_object.setdefault("properties", {})[ + cls_instruction.target_content.property_name + ] = deepcopy(cls_instruction.content_schema) return model diff --git 
a/src/metldata/builtin_transformations/infer_relations/data_transform.py b/src/metldata/builtin_transformations/infer_relations/data_transform.py index f941d24..64e307f 100644 --- a/src/metldata/builtin_transformations/infer_relations/data_transform.py +++ b/src/metldata/builtin_transformations/infer_relations/data_transform.py @@ -47,10 +47,10 @@ from schemapack.spec.custom_types import ResourceId from schemapack.spec.datapack import DataPack, Resource -from metldata.builtin_transformations.infer_relations.path.path import ( +from metldata.builtin_transformations.common.path.path import ( RelationPath, ) -from metldata.builtin_transformations.infer_relations.path.path_elements import ( +from metldata.builtin_transformations.common.path.path_elements import ( RelationPathElement, RelationPathElementType, ) @@ -133,7 +133,8 @@ def resolve_passive_path_element( target_resource_ids = set() for candidate_resource_id, candidate_resource in candidate_resources.items(): - relation = candidate_resource.relations.get(path_element.property, set()) + relation = candidate_resource.relations.get( + path_element.property, set()) if ( isinstance(relation, set) and source_resource_id in relation diff --git a/src/metldata/builtin_transformations/infer_relations/model_transform.py b/src/metldata/builtin_transformations/infer_relations/model_transform.py index 41c9efa..502fe35 100644 --- a/src/metldata/builtin_transformations/infer_relations/model_transform.py +++ b/src/metldata/builtin_transformations/infer_relations/model_transform.py @@ -24,8 +24,8 @@ SchemaPack, ) -from metldata.builtin_transformations.infer_relations.path.path import RelationPath -from metldata.builtin_transformations.infer_relations.path.path_elements import ( +from metldata.builtin_transformations.common.path.path import RelationPath +from metldata.builtin_transformations.common.path.path_elements import ( RelationPathElement, RelationPathElementType, ) diff --git 
a/tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml b/tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml index 9aa8c86..5da8d14 100644 --- a/tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml +++ b/tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml @@ -4,7 +4,7 @@ classes: File: id: propertyName: alias - content: ../example_content_schemas/File.schema.json + content: ../../../example_content_schemas/File.schema.json Dataset: id: propertyName: alias From 17d93b7abde6ac86adbf17b04ae01aed85cc047e Mon Sep 17 00:00:00 2001 From: sbilge Date: Thu, 15 Aug 2024 14:40:23 +0000 Subject: [PATCH 05/15] model transformation fn added --- .../count_references/assumptions.py | 10 ++-- .../count_references/main.py | 3 -- .../count_references/model_transform.py | 52 +++++++------------ .../infer_relations/model_transform.py | 4 +- 4 files changed, 26 insertions(+), 43 deletions(-) diff --git a/src/metldata/builtin_transformations/count_references/assumptions.py b/src/metldata/builtin_transformations/count_references/assumptions.py index 903c742..8f44a4e 100644 --- a/src/metldata/builtin_transformations/count_references/assumptions.py +++ b/src/metldata/builtin_transformations/count_references/assumptions.py @@ -15,7 +15,6 @@ "Assumptions for count references transformation" -from typing import Any from schemapack.spec.schemapack import SchemaPack @@ -31,10 +30,13 @@ ) from metldata.transform.base import ModelAssumptionError +# TODO one more vaidation is required: "The transformation shall validate whether the +# target is defined with multiplicity and fail otherwise" Multiplicity is defined on +# schemapack. 
Hence it should in model assumptions -def assert_class_is_source( - model: SchemaPack, instruction: AddReferenceCountPropertyInstruction -): + +def assert_class_is_source(instruction: AddReferenceCountPropertyInstruction + ): """Make sure that the source class is the one being modified with the count property""" if instruction.class_name != instruction.source_relation_path.source: raise ModelAssumptionError( diff --git a/src/metldata/builtin_transformations/count_references/main.py b/src/metldata/builtin_transformations/count_references/main.py index 0e5cc3b..ea94e9c 100644 --- a/src/metldata/builtin_transformations/count_references/main.py +++ b/src/metldata/builtin_transformations/count_references/main.py @@ -27,9 +27,6 @@ from metldata.builtin_transformations.count_references.data_transform import ( count_references, ) -from metldata.builtin_transformations.count_references.model_transform import ( - add_count_references, -) from metldata.transform.base import DataTransformer, TransformationDefinition diff --git a/src/metldata/builtin_transformations/count_references/model_transform.py b/src/metldata/builtin_transformations/count_references/model_transform.py index 44064e3..15a2f57 100644 --- a/src/metldata/builtin_transformations/count_references/model_transform.py +++ b/src/metldata/builtin_transformations/count_references/model_transform.py @@ -14,10 +14,8 @@ # limitations under the License. 
"""Model transformation logic for the 'count references' transformation""" -from copy import deepcopy from schemapack.spec.schemapack import ( - ClassDefinition, SchemaPack, ) @@ -29,46 +27,32 @@ ) from metldata.transform.base import EvitableTransformationError -# from metldata.transform.base import EvitableTransformationError - def add_count_references( *, model: SchemaPack, instructions_by_class: dict[str, list[AddReferenceCountPropertyInstruction]] ) -> SchemaPack: - """Add a new content property (target_content) to the class(es) subject to - transformation - - Args: - model: - The model based on SchemaPack to - Returns: - The model with the + """The content properties are added to the model with the 'add_content_properties + step of the workflow. Thus, this function applies no transformation. + It only checks for EvitableTransformationError. """ - # TODO model transform logic for count references - updated_class_defs: dict[str, ClassDefinition] = {} for class_name, cls_instructions in instructions_by_class.items(): - for class_name, cls_instructions in instructions_by_class.items(): - class_def = model.classes.get(class_name) + class_def = model.classes.get(class_name) - if not class_def: - raise EvitableTransformationError() + if not class_def: + raise EvitableTransformationError() - content_schema = class_def.content.json_schema_dict + content_schema = class_def.content.json_schema_dict - for cls_instruction in cls_instructions: - try: - target_object = resolve_schema_object_path( - content_schema, cls_instruction.target_content.object_path - ) - except KeyError as e: - raise EvitableTransformationError() from e + for cls_instruction in cls_instructions: + try: + resolve_schema_object_path( + content_schema, cls_instruction.target_content.object_path + ) + except KeyError as e: + raise EvitableTransformationError() from e - if cls_instruction.target_content.property_name in content_schema.get( - "properties", {} - ): - raise EvitableTransformationError() - - 
target_object.setdefault("properties", {})[ - cls_instruction.target_content.property_name - ] = deepcopy(cls_instruction.content_schema) + if cls_instruction.target_content.property_name in content_schema.get( + "properties", {} + ): + raise EvitableTransformationError() return model diff --git a/src/metldata/builtin_transformations/infer_relations/model_transform.py b/src/metldata/builtin_transformations/infer_relations/model_transform.py index 502fe35..3a32fa9 100644 --- a/src/metldata/builtin_transformations/infer_relations/model_transform.py +++ b/src/metldata/builtin_transformations/infer_relations/model_transform.py @@ -50,7 +50,7 @@ def get_relation(element: RelationPathElement, schema: SchemaPack) -> Relation: return schema.classes[class_name].relations[element.property] -def infer_mutiplicity_from_path( +def infer_multiplicity_from_path( path: RelationPath, schema: SchemaPack ) -> MultipleRelationSpec: """Infer the multiplicity of an inferred relation based on the path. @@ -140,7 +140,7 @@ def add_inferred_relations( raise EvitableTransformationError() mandatory = infer_mandatory_from_path(instruction.path, model) - multiple = infer_mutiplicity_from_path(instruction.path, model) + multiple = infer_multiplicity_from_path(instruction.path, model) new_relation = Relation.model_validate( { "targetClass": instruction.target, From c0563702198c0204535d323688af758b318884fc Mon Sep 17 00:00:00 2001 From: sbilge Date: Fri, 23 Aug 2024 09:33:18 +0000 Subject: [PATCH 06/15] model and data transformation for count references --- .../count_references/assumptions.py | 56 +++++++---- .../count_references/config.py | 15 ++- .../count_references/data_transform.py | 44 +++++++-- .../count_references/instruction.py | 2 +- .../count_references/main.py | 13 ++- src/metldata/transform/base.py | 4 + .../multiple/input.datapack.yaml | 58 +++++++++++ .../multiple/input.schemapack.yaml | 97 +++++++++++++++++++ .../multiple/transformed.schemapack.yaml | 3 +- 9 files changed, 257 
insertions(+), 35 deletions(-) create mode 100644 tests/fixtures/example_transformations/count_references/multiple/input.datapack.yaml create mode 100644 tests/fixtures/example_transformations/count_references/multiple/input.schemapack.yaml diff --git a/src/metldata/builtin_transformations/count_references/assumptions.py b/src/metldata/builtin_transformations/count_references/assumptions.py index 8f44a4e..ed06c57 100644 --- a/src/metldata/builtin_transformations/count_references/assumptions.py +++ b/src/metldata/builtin_transformations/count_references/assumptions.py @@ -15,7 +15,6 @@ "Assumptions for count references transformation" - from schemapack.spec.schemapack import SchemaPack from metldata.builtin_transformations.add_content_properties.path import ( @@ -28,15 +27,10 @@ from metldata.builtin_transformations.count_references.instruction import ( AddReferenceCountPropertyInstruction, ) -from metldata.transform.base import ModelAssumptionError - -# TODO one more vaidation is required: "The transformation shall validate whether the -# target is defined with multiplicity and fail otherwise" Multiplicity is defined on -# schemapack. 
Hence it should in model assumptions +from metldata.transform.base import ModelAssumptionError, MultiplicityError -def assert_class_is_source(instruction: AddReferenceCountPropertyInstruction - ): +def assert_class_is_source(instruction: AddReferenceCountPropertyInstruction): """Make sure that the source class is the one being modified with the count property""" if instruction.class_name != instruction.source_relation_path.source: raise ModelAssumptionError( @@ -73,18 +67,31 @@ def assert_path_classes_and_relations_exist(model: SchemaPack, path: RelationPat ) +def assert_multiplicity(model: SchemaPack, path: RelationPath): + """Make sure the target of the relation conributes multiple instances to the relation.""" + for path_element in path.elements: + if path_element.type_ == RelationPathElementType.ACTIVE: + relation = model.classes[path_element.source].relations[ + path_element.property + ] + if not relation.multiple.target: + raise MultiplicityError( + f"The target of the relation { + path_element.property} does not contribute multiple instances to the relation." + ) + + def assert_summary_exists( - schema: SchemaPack, + model: SchemaPack, instruction: AddReferenceCountPropertyInstruction, ) -> None: """Make sure that the source class (the class being modified) and the object_path exists in the model.""" class_name = instruction.class_name - class_def = schema.classes.get(class_name) + class_def = model.classes.get(class_name) # Check if the class exists in the model if not class_def: - raise ModelAssumptionError( - f"Class {class_name} does not exist in the model.") + raise ModelAssumptionError(f"Class {class_name} does not exist in the model.") # Check if the object_path already exists in the model try: @@ -98,21 +105,28 @@ def assert_summary_exists( instruction.target_content.object_path} does not exist" + f" in class {class_name}." 
) from err - if instruction.target_content.property_name in target_schema.get("properties", {}): + + # Check if the propert_name already exists in the model + if instruction.target_content.property_name not in target_schema.get( + "properties", {} + ): raise ModelAssumptionError( f"Property { - instruction.target_content.property_name} already exists" + instruction.target_content.property_name} does not exist" + f" in class {class_name}." ) def check_model_assumptions( - schema: SchemaPack, instructions: list[AddReferenceCountPropertyInstruction] + schema: SchemaPack, + instructions_by_class: dict[str, list[AddReferenceCountPropertyInstruction]], ) -> None: """Check the model assumptions for the count references transformation.""" - for instruction in instructions: - assert_class_is_source(schema, instruction) - assert_path_classes_and_relations_exist( - schema, instruction.source_relation_path - ) - assert_summary_exists(schema, instruction) + for _, instructions in instructions_by_class.items(): + for instruction in instructions: + assert_class_is_source(instruction) + assert_path_classes_and_relations_exist( + schema, instruction.source_relation_path + ) + assert_multiplicity(schema, instruction.source_relation_path) + assert_summary_exists(schema, instruction) diff --git a/src/metldata/builtin_transformations/count_references/config.py b/src/metldata/builtin_transformations/count_references/config.py index a2c6639..b83beb1 100644 --- a/src/metldata/builtin_transformations/count_references/config.py +++ b/src/metldata/builtin_transformations/count_references/config.py @@ -14,8 +14,6 @@ # limitations under the License. 
"""Models used to describe count content properties that shall be calculated and added.""" -from typing import Any - from pydantic import Field from pydantic_settings import BaseSettings, SettingsConfigDict @@ -34,3 +32,16 @@ class CountReferencesConfig(BaseSettings): description=("Description TODO"), examples=[], ) + + def instructions_by_class( + self, + ) -> dict[str, list[AddReferenceCountPropertyInstruction]]: + """Returns a dictionary of instructions by class (i.e. config for each class).""" + instructions_by_class: dict[ + str, list[AddReferenceCountPropertyInstruction] + ] = {} + for instruction in self.count_references: + instructions_by_class.setdefault(instruction.class_name, []).append( + instruction + ) + return instructions_by_class diff --git a/src/metldata/builtin_transformations/count_references/data_transform.py b/src/metldata/builtin_transformations/count_references/data_transform.py index 106dfe4..61091e6 100644 --- a/src/metldata/builtin_transformations/count_references/data_transform.py +++ b/src/metldata/builtin_transformations/count_references/data_transform.py @@ -17,21 +17,53 @@ from schemapack.spec.datapack import DataPack -# from metldata.transform.base import EvitableTransformationError +from metldata.builtin_transformations.common.path.path_elements import ( + RelationPathElementType, +) +from metldata.builtin_transformations.count_references.instruction import ( + AddReferenceCountPropertyInstruction, +) +from metldata.transform.base import EvitableTransformationError -def count_references(*, data: DataPack) -> DataPack: - """Count +def count_references( + *, + data: DataPack, + instructions_by_class: dict[str, list[AddReferenceCountPropertyInstruction]], +) -> DataPack: + """Given a data pack and a dictionary of instructions by class, + counts the references and adds the value to its corresponding content property. Args: data: - + The datapack to add the reference count values. 
+ instructions_by_class: + A dictionary mapping class names to lists of instructions. Returns: - The data with + The data with the reference counts added. """ modified_data = data.model_copy(deep=True) + for class_name, instructions in instructions_by_class.items(): + resources = modified_data.resources.get(class_name) + + if not resources: + raise EvitableTransformationError() + + for instruction in instructions: + for path_element in instruction.source_relation_path.elements: + if path_element.type_ == RelationPathElementType.ACTIVE: + relation_slot = path_element.property + else: + raise EvitableTransformationError() + + for resource in resources.values(): + related_to = resource.relations.get(relation_slot) + + count = len(related_to) if related_to else 0 - # TODO modifications + resource.content[instruction.target_content.object_path].update( + {instruction.target_content.property_name: count} + ) return modified_data diff --git a/src/metldata/builtin_transformations/count_references/instruction.py b/src/metldata/builtin_transformations/count_references/instruction.py index 3e81ed5..771f4b8 100644 --- a/src/metldata/builtin_transformations/count_references/instruction.py +++ b/src/metldata/builtin_transformations/count_references/instruction.py @@ -34,5 +34,5 @@ class AddReferenceCountPropertyInstruction(BaseSettings): ) source_relation_path: RelationPath = Field( ..., - description="The path describing the relation between the classes if a metadata model.", + description="The path describing the relation between the classes of a metadata model.", ) diff --git a/src/metldata/builtin_transformations/count_references/main.py b/src/metldata/builtin_transformations/count_references/main.py index ea94e9c..2d5d660 100644 --- a/src/metldata/builtin_transformations/count_references/main.py +++ b/src/metldata/builtin_transformations/count_references/main.py @@ -27,6 +27,9 @@ from metldata.builtin_transformations.count_references.data_transform import ( 
count_references, ) +from metldata.builtin_transformations.count_references.model_transform import ( + add_count_references, +) from metldata.transform.base import DataTransformer, TransformationDefinition @@ -39,7 +42,9 @@ def transform(self, data: DataPack) -> DataPack: Args: data: The data as DataPack to be transformed. """ - return count_references(data=data) + return count_references( + data=data, instructions_by_class=self._config.instructions_by_class() + ) def check_model_assumptions_wrapper( @@ -51,7 +56,9 @@ def check_model_assumptions_wrapper( ModelAssumptionError: if the model does not fulfill the assumptions. """ - check_model_assumptions(schema=model, instructions=config.count_references) + check_model_assumptions( + schema=model, instructions_by_class=config.instructions_by_class() + ) def transform_model(model: SchemaPack, config: CountReferencesConfig) -> SchemaPack: @@ -62,7 +69,7 @@ def transform_model(model: SchemaPack, config: CountReferencesConfig) -> SchemaP if the transformation fails. """ return add_count_references( - model=model, instructions_by_class=config.count_references + model=model, instructions_by_class=config.instructions_by_class() ) diff --git a/src/metldata/transform/base.py b/src/metldata/transform/base.py index bbdcfd1..81a37c1 100644 --- a/src/metldata/transform/base.py +++ b/src/metldata/transform/base.py @@ -44,6 +44,10 @@ class ModelAssumptionError(RuntimeError): """Raised when assumptions made by transformation step about a model are not met.""" +class MultiplicityError(ModelAssumptionError): + """Raised when the relation does not conform the multiplicity.""" + + class ModelTransformationError(RuntimeError): """Raised when a transformation failed when applied to the schemapack-based model. 
This exception should only be raised when the error could not have been caught diff --git a/tests/fixtures/example_transformations/count_references/multiple/input.datapack.yaml b/tests/fixtures/example_transformations/count_references/multiple/input.datapack.yaml new file mode 100644 index 0000000..e75b5de --- /dev/null +++ b/tests/fixtures/example_transformations/count_references/multiple/input.datapack.yaml @@ -0,0 +1,58 @@ +datapack: 0.3.0 +resources: + File: + file_a: + content: + filename: file_a.fastq + format: FASTQ + checksum: 1a5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92b8 + size: 12321 + file_b: + content: + filename: file_b.fastq + format: FASTQ + checksum: 2b5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92c9 + size: 12314 + file_c: + content: + filename: file_c.fastq + format: FASTQ + checksum: a9c24870071da03f78515e6197048f3a2172e90e597e9250cd01a0cb8f0986ed + size: 12123 + Dataset: + dataset_1: + content: + dac_contact: dac@example.org + file_summary: # <- + count: 0 + relations: + files: + - file_a + - file_b + - file_c + Sample: + sample_x: + content: + description: Some sample. 
+ file_summary: # <- + count: 0 + relations: + files: + - file_a + - file_b + sample_y: + content: + file_summary: # <- + count: 0 + relations: + files: + - file_c + Experiment: + experiment_i: + content: + sample_summary: # <- + count: 0 + relations: + samples: + - sample_x + - sample_y diff --git a/tests/fixtures/example_transformations/count_references/multiple/input.schemapack.yaml b/tests/fixtures/example_transformations/count_references/multiple/input.schemapack.yaml new file mode 100644 index 0000000..a800d6b --- /dev/null +++ b/tests/fixtures/example_transformations/count_references/multiple/input.schemapack.yaml @@ -0,0 +1,97 @@ +# a simple schemapack: +schemapack: 0.3.0 +classes: + File: + id: + propertyName: alias + content: ../../../example_content_schemas/File.schema.json + Dataset: + id: + propertyName: alias + content: + { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "A dataset that is a collection of files.", + "properties": + { + "dac_contact": { "type": "string" }, + "file_summary": + { + "type": "object", + "additionalProperties": false, + "properties": { "count": { "type": "integer" } }, + "required": ["count"], + }, + }, + "type": "object", + } + relations: + files: + targetClass: File + multiple: + origin: true + target: true + mandatory: + origin: false + target: true + Sample: + id: + propertyName: alias + content: + { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "A sample used to generate files in the context of an experiment.", + "properties": + { + "description": { "type": "string" }, + "file_summary": + { + "type": "object", + "additionalProperties": false, + "properties": { "count": { "type": "integer" } }, + "required": ["count"], + }, + }, + "type": "object", + } + relations: + files: + targetClass: File + multiple: + origin: false + target: true + mandatory: + origin: false + target: true + Experiment: + id: + 
propertyName: alias + content: + { + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "description": "An experiment containing one or multiple samples.", + "properties": + { + "description": { "type": "string" }, + "sample_summary": + { + "type": "object", + "additionalProperties": false, + "properties": { "count": { "type": "integer" } }, + "required": ["count"], + }, + }, + "type": "object", + } + relations: + samples: + targetClass: Sample + multiple: + origin: false + target: true + mandatory: + origin: true + target: true diff --git a/tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml b/tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml index 5da8d14..a800d6b 100644 --- a/tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml +++ b/tests/fixtures/example_transformations/count_references/multiple/transformed.schemapack.yaml @@ -53,8 +53,8 @@ classes: "properties": { "count": { "type": "integer" } }, "required": ["count"], }, - "type": "object", }, + "type": "object", } relations: files: @@ -83,7 +83,6 @@ classes: "properties": { "count": { "type": "integer" } }, "required": ["count"], }, - "type": "object", }, "type": "object", } From 1f35b3fc66afffc63bc042e1f582df6c0f57102a Mon Sep 17 00:00:00 2001 From: sbilge Date: Fri, 23 Aug 2024 09:48:46 +0000 Subject: [PATCH 07/15] template update --- .devcontainer/Dockerfile | 2 +- .template/mandatory_files.txt | 1 + Dockerfile.debian | 48 +++++++++++++++++++++++++++++++ lock/requirements-dev-template.in | 4 +-- 4 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 Dockerfile.debian diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 7eb17e2..b21af00 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -8,7 +8,7 @@ ARG USER_GID=$USER_UID RUN if [ "$USER_GID" != "1000" ] || [ "$USER_UID" != "1000" ]; then 
groupmod --gid $USER_GID vscode && usermod --uid $USER_UID --gid $USER_GID vscode; fi # [Option] Install Node.js -ARG INSTALL_NODE="true" +ARG INSTALL_NODE="false" ARG NODE_VERSION="lts/*" RUN if [ "${INSTALL_NODE}" = "true" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi diff --git a/.template/mandatory_files.txt b/.template/mandatory_files.txt index 660a15e..fcfb0f8 100644 --- a/.template/mandatory_files.txt +++ b/.template/mandatory_files.txt @@ -23,6 +23,7 @@ lock/requirements-dev.txt lock/requirements.txt Dockerfile +Dockerfile.debian config_schema.json example_config.yaml LICENSE diff --git a/Dockerfile.debian b/Dockerfile.debian new file mode 100644 index 0000000..33dbe10 --- /dev/null +++ b/Dockerfile.debian @@ -0,0 +1,48 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +## creating building container +FROM python:3.12-slim-bookworm AS builder +# update and install dependencies +RUN apt update +RUN apt upgrade -y +RUN pip install build +# copy code +COPY . 
/service +WORKDIR /service +# build wheel +RUN python -m build + +# creating running container +FROM python:3.12-slim-bookworm +# update and install dependencies +RUN apt update +RUN apt upgrade -y +# copy and install requirements and wheel +WORKDIR /service +COPY --from=builder /service/lock/requirements.txt /service +RUN pip install --no-deps -r requirements.txt +RUN rm requirements.txt +COPY --from=builder /service/dist/ /service +RUN pip install --no-deps *.whl +RUN rm *.whl +# create new user and execute as that user +RUN useradd --create-home appuser +WORKDIR /home/appuser +USER appuser +# set environment +ENV PYTHONUNBUFFERED=1 +# Please adapt to package name: +ENTRYPOINT ["my-microservice"] diff --git a/lock/requirements-dev-template.in b/lock/requirements-dev-template.in index dd81066..97ce387 100644 --- a/lock/requirements-dev-template.in +++ b/lock/requirements-dev-template.in @@ -1,7 +1,7 @@ # common requirements for development and testing of services pytest>=8.2 -pytest-asyncio>=0.23.6 +pytest-asyncio>=0.23.7 pytest-cov>=5 snakeviz>=2.2 logot>=1.3 @@ -29,4 +29,4 @@ setuptools>=69.5 # required since switch to pyproject.toml and pip-tools tomli_w>=1.0 -uv>=0.1.44 +uv>=0.2.13 From 95328d3805b8f7bf758f97c483fe1f70737aecbe Mon Sep 17 00:00:00 2001 From: sbilge Date: Fri, 23 Aug 2024 09:56:09 +0000 Subject: [PATCH 08/15] formatting fixes --- .../count_references/model_transform.py | 7 ++++--- .../infer_relations/data_transform.py | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/metldata/builtin_transformations/count_references/model_transform.py b/src/metldata/builtin_transformations/count_references/model_transform.py index 15a2f57..4a45a72 100644 --- a/src/metldata/builtin_transformations/count_references/model_transform.py +++ b/src/metldata/builtin_transformations/count_references/model_transform.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Model transformation logic for the 'count references' transformation""" - from schemapack.spec.schemapack import ( SchemaPack, ) @@ -29,11 +28,13 @@ def add_count_references( - *, model: SchemaPack, instructions_by_class: dict[str, list[AddReferenceCountPropertyInstruction]] + *, + model: SchemaPack, + instructions_by_class: dict[str, list[AddReferenceCountPropertyInstruction]], ) -> SchemaPack: """The content properties are added to the model with the 'add_content_properties step of the workflow. Thus, this function applies no transformation. - It only checks for EvitableTransformationError. + It only checks for EvitableTransformationError. """ for class_name, cls_instructions in instructions_by_class.items(): class_def = model.classes.get(class_name) diff --git a/src/metldata/builtin_transformations/infer_relations/data_transform.py b/src/metldata/builtin_transformations/infer_relations/data_transform.py index 64e307f..27422f9 100644 --- a/src/metldata/builtin_transformations/infer_relations/data_transform.py +++ b/src/metldata/builtin_transformations/infer_relations/data_transform.py @@ -133,8 +133,7 @@ def resolve_passive_path_element( target_resource_ids = set() for candidate_resource_id, candidate_resource in candidate_resources.items(): - relation = candidate_resource.relations.get( - path_element.property, set()) + relation = candidate_resource.relations.get(path_element.property, set()) if ( isinstance(relation, set) and source_resource_id in relation From 55522cd11d6f07f6ebce4bfde0387a51fa54213d Mon Sep 17 00:00:00 2001 From: sbilge Date: Fri, 23 Aug 2024 10:02:25 +0000 Subject: [PATCH 09/15] schemapack vversion update --- .pyproject_generation/pyproject_custom.toml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pyproject_generation/pyproject_custom.toml b/.pyproject_generation/pyproject_custom.toml index 6903989..021fdac 100644 --- a/.pyproject_generation/pyproject_custom.toml +++ 
b/.pyproject_generation/pyproject_custom.toml @@ -4,7 +4,7 @@ name = "metldata" version = "1.0.0" description = "metldata - A framework for handling metadata based on ETL, CQRS, and event sourcing." dependencies = [ - "schemapack == 2.0.0-alpha.3" + "schemapack == 2.0.0-alpha.4" ] [project.urls] diff --git a/pyproject.toml b/pyproject.toml index 40be2f4..d9ac0ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ name = "metldata" version = "1.0.0" description = "metldata - A framework for handling metadata based on ETL, CQRS, and event sourcing." dependencies = [ - "schemapack == 2.0.0-alpha.3", + "schemapack == 2.0.0-alpha.4", ] [project.license] From 51d2e1ef2a02dad823d142f2143c6832657ba7a5 Mon Sep 17 00:00:00 2001 From: sbilge Date: Tue, 27 Aug 2024 12:22:00 +0000 Subject: [PATCH 10/15] passive path support is added --- .../count_references/assumptions.py | 68 +++++++++++++------ .../count_references/data_transform.py | 10 +-- .../count_references/multiple/config.yaml | 30 ++++---- 3 files changed, 65 insertions(+), 43 deletions(-) diff --git a/src/metldata/builtin_transformations/count_references/assumptions.py b/src/metldata/builtin_transformations/count_references/assumptions.py index ed06c57..d8171ec 100644 --- a/src/metldata/builtin_transformations/count_references/assumptions.py +++ b/src/metldata/builtin_transformations/count_references/assumptions.py @@ -30,15 +30,50 @@ from metldata.transform.base import ModelAssumptionError, MultiplicityError -def assert_class_is_source(instruction: AddReferenceCountPropertyInstruction): - """Make sure that the source class is the one being modified with the count property""" - if instruction.class_name != instruction.source_relation_path.source: +def validate_modification_class(path_element, expected_class_name): + """Check whether the class specified to be modified with the reference count + matches the source or target class in the provided `path_element`, depending on the + type of the relation path 
(i.e., active or passive). If the class does not match, + an exception is raised. + """ + modification_class_name = ( + path_element.source + if path_element.type_ == RelationPathElementType.ACTIVE + else path_element.target + ) + if expected_class_name != modification_class_name: + raise ModelAssumptionError( + f"Class { + expected_class_name} does not correspond to the relation source " + f"{modification_class_name}." + ) + + +def check_class_exists(model: SchemaPack, class_name: str) -> None: + """Check if a class exists in the model and raise an error if not""" + if class_name not in model.classes: + raise ModelAssumptionError(f"Class {class_name} not found in model.") + + +def check_relation_exists(model: SchemaPack, class_name: str, relation: str): + """Check if a relation exists in a class and raise an error if not""" + if relation not in model.classes[class_name].relations: raise ModelAssumptionError( - f"Class {instruction.class_name} does not correspond to the relation source { - instruction.source_relation_path.source}." + f"Relation property { + relation} not found in class {class_name}." ) +def assert_class_is_source(instruction: AddReferenceCountPropertyInstruction): + """Ensure that the class being modified with the reference count property is the expected class. + This function iterates over the elements of the relation path in the given instruction + and validates that the class being modified with the reference count property matches + the class specified in the relation path. + """ + for path_element in instruction.source_relation_path.elements: + validate_modification_class(path_element, instruction.class_name) + + def assert_path_classes_and_relations_exist(model: SchemaPack, path: RelationPath): """Make sure that all classes and relations defined in the provided path exist in the provided model. @@ -48,27 +83,18 @@ def assert_path_classes_and_relations_exist(model: SchemaPack, path: RelationPat if the model does not fulfill the assumptions. 
""" for path_element in path.elements: - if path_element.source not in model.classes: - raise ModelAssumptionError( - f"Class {path_element.source} not found in model." - ) + check_class_exists(model, path_element.source) + check_class_exists(model, path_element.target) - if path_element.target not in model.classes: - raise ModelAssumptionError( - f"Class {path_element.target} not found in model." - ) + if path_element.type_ == RelationPathElementType.ACTIVE: + check_relation_exists(model, path_element.source, path_element.property) - if path_element.type_ == RelationPathElementType.ACTIVE and ( - path_element.property not in model.classes[path_element.source].relations - ): - raise ModelAssumptionError( - f"Relation property {path_element.property} not found in class" - f" {path_element.source}." - ) + if path_element.type_ == RelationPathElementType.PASSIVE: + check_relation_exists(model, path_element.target, path_element.property) def assert_multiplicity(model: SchemaPack, path: RelationPath): - """Make sure the target of the relation conributes multiple instances to the relation.""" + """Make sure the target of the relation contributes multiple instances to the relation.""" for path_element in path.elements: if path_element.type_ == RelationPathElementType.ACTIVE: relation = model.classes[path_element.source].relations[ diff --git a/src/metldata/builtin_transformations/count_references/data_transform.py b/src/metldata/builtin_transformations/count_references/data_transform.py index 61091e6..e0c67e5 100644 --- a/src/metldata/builtin_transformations/count_references/data_transform.py +++ b/src/metldata/builtin_transformations/count_references/data_transform.py @@ -17,9 +17,6 @@ from schemapack.spec.datapack import DataPack -from metldata.builtin_transformations.common.path.path_elements import ( - RelationPathElementType, -) from metldata.builtin_transformations.count_references.instruction import ( AddReferenceCountPropertyInstruction, ) @@ -52,13 +49,12 @@ def 
count_references( for instruction in instructions: for path_element in instruction.source_relation_path.elements: - if path_element.type_ == RelationPathElementType.ACTIVE: - relation_slot = path_element.property - else: - raise EvitableTransformationError() + relation_slot = path_element.property for resource in resources.values(): related_to = resource.relations.get(relation_slot) + if not related_to: + raise EvitableTransformationError() count = len(related_to) if related_to else 0 diff --git a/tests/fixtures/example_transformations/count_references/multiple/config.yaml b/tests/fixtures/example_transformations/count_references/multiple/config.yaml index bfdf28f..628aab9 100644 --- a/tests/fixtures/example_transformations/count_references/multiple/config.yaml +++ b/tests/fixtures/example_transformations/count_references/multiple/config.yaml @@ -14,18 +14,18 @@ # limitations under the License. count_references: -- class_name: Dataset - target_content: - object_path: "file_summary" - property_name: "count" - source_relation_path: "Dataset(files)>File" -- class_name: Sample - target_content: - object_path: "file_summary" - property_name: "count" - source_relation_path: "Sample(files)>File" -- class_name: Experiment - target_content: - object_path: "sample_summary" - property_name: "count" - source_relation_path: "Experiment(samples)>Sample" + - class_name: Dataset + target_content: + object_path: "file_summary" + property_name: "count" + source_relation_path: "Dataset(files)>File" + - class_name: Sample + target_content: + object_path: "file_summary" + property_name: "count" + source_relation_path: "File<(files)Sample" + - class_name: Experiment + target_content: + object_path: "sample_summary" + property_name: "count" + source_relation_path: "Experiment(samples)>Sample" From 279d810ffb03dd7f291ff203337ab359c3de475a Mon Sep 17 00:00:00 2001 From: sbilge Date: Tue, 27 Aug 2024 14:24:40 +0200 Subject: [PATCH 11/15] Update 
src/metldata/builtin_transformations/count_references/config.py Co-authored-by: Thomas Zajac --- src/metldata/builtin_transformations/count_references/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metldata/builtin_transformations/count_references/config.py b/src/metldata/builtin_transformations/count_references/config.py index b83beb1..889c6d7 100644 --- a/src/metldata/builtin_transformations/count_references/config.py +++ b/src/metldata/builtin_transformations/count_references/config.py @@ -29,7 +29,7 @@ class CountReferencesConfig(BaseSettings): count_references: list[AddReferenceCountPropertyInstruction] = Field( ..., - description=("Description TODO"), + description=("A list of instructions describing for which class and corresponding relation path references should be counted."), examples=[], ) From e34220ccda69884629b89acc969ac5bdeb5e5b00 Mon Sep 17 00:00:00 2001 From: sbilge Date: Tue, 27 Aug 2024 14:24:55 +0200 Subject: [PATCH 12/15] Update src/metldata/builtin_transformations/count_references/__init__.py Co-authored-by: Thomas Zajac --- .../builtin_transformations/count_references/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metldata/builtin_transformations/count_references/__init__.py b/src/metldata/builtin_transformations/count_references/__init__.py index ab70fc1..efe6907 100644 --- a/src/metldata/builtin_transformations/count_references/__init__.py +++ b/src/metldata/builtin_transformations/count_references/__init__.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""A transformation to count the references.""" +"""A transformation to count how often specific classes are referenced along given relation paths.""" # shortcuts: from metldata.builtin_transformations.count_references.main import ( # noqa: F401 From 6c3d6d7c06ca3ee66b694da7f56516a5d5b2b869 Mon Sep 17 00:00:00 2001 From: sbilge Date: Tue, 27 Aug 2024 13:01:14 +0000 Subject: [PATCH 13/15] doc string updates --- .../count_references/assumptions.py | 4 ++-- .../count_references/config.py | 5 ++++- .../count_references/instruction.py | 6 +++++- src/metldata/transform/base.py | 14 +++++++++++--- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/metldata/builtin_transformations/count_references/assumptions.py b/src/metldata/builtin_transformations/count_references/assumptions.py index d8171ec..5258eb2 100644 --- a/src/metldata/builtin_transformations/count_references/assumptions.py +++ b/src/metldata/builtin_transformations/count_references/assumptions.py @@ -125,12 +125,12 @@ def assert_summary_exists( json_schema=class_def.content.json_schema_dict, path=instruction.target_content.object_path, ) - except KeyError as err: + except KeyError as exc: raise ModelAssumptionError( f"Object path { instruction.target_content.object_path} does not exist" + f" in class {class_name}." 
- ) from err + ) from exc # Check if the propert_name already exists in the model if instruction.target_content.property_name not in target_schema.get( diff --git a/src/metldata/builtin_transformations/count_references/config.py b/src/metldata/builtin_transformations/count_references/config.py index 889c6d7..96f4c50 100644 --- a/src/metldata/builtin_transformations/count_references/config.py +++ b/src/metldata/builtin_transformations/count_references/config.py @@ -29,7 +29,10 @@ class CountReferencesConfig(BaseSettings): count_references: list[AddReferenceCountPropertyInstruction] = Field( ..., - description=("A list of instructions describing for which class and corresponding relation path references should be counted."), + description=( + "A list of instructions describing for which class and" + + " corresponding relation path references should be counted." + ), examples=[], ) diff --git a/src/metldata/builtin_transformations/count_references/instruction.py b/src/metldata/builtin_transformations/count_references/instruction.py index 771f4b8..85d8bf6 100644 --- a/src/metldata/builtin_transformations/count_references/instruction.py +++ b/src/metldata/builtin_transformations/count_references/instruction.py @@ -22,7 +22,11 @@ class AddReferenceCountPropertyInstruction(BaseSettings): - """A model describing an instruction to""" + """A model describing an instruction for adding a reference count property to the + content schema of a class. It defines the class to be modified, the target content + where the property will be added, and the relationship path that describes how the + classes are connected. 
+ """ class_name: str = Field(..., description="The name of the class to modify.") diff --git a/src/metldata/transform/base.py b/src/metldata/transform/base.py index 81a37c1..946867e 100644 --- a/src/metldata/transform/base.py +++ b/src/metldata/transform/base.py @@ -45,7 +45,13 @@ class ModelAssumptionError(RuntimeError): class MultiplicityError(ModelAssumptionError): - """Raised when the relation does not conform the multiplicity.""" + """Raised when a relation in the model does not conform to the required multiplicity + constraints. It occurs when the actual cardinality of a relationship within a model + fails to meet the expected multiplicity criteria of a transformation. E.g., + in 'count references' transformation, the target of a relation is required to + contribute multiple instances(`target=True`) to the relation, and this error is raised + if that condition is not satisfied. + """ class ModelTransformationError(RuntimeError): @@ -212,7 +218,8 @@ def validate_step_references( continue if step.input not in steps: raise ValueError( - f"Step {step.input} referenced in step {step_name} is not defined." + f"Step {step.input} referenced in step { + step_name} is not defined." ) if not step_with_no_input_found: @@ -236,7 +243,8 @@ def validate_artifact_references(cls, values): for artifact_name, step_name in artifacts.items(): if step_name not in steps: raise ValueError( - f"Step {step_name} referenced in artifact {artifact_name} is not defined." + f"Step {step_name} referenced in artifact { + artifact_name} is not defined." 
) return values From 8437a3f0666fbc928aaff40022046375cd2fa54c Mon Sep 17 00:00:00 2001 From: sbilge Date: Tue, 27 Aug 2024 13:07:16 +0000 Subject: [PATCH 14/15] template adjustment --- .template/mandatory_files_ignore.txt | 1 - Dockerfile | 50 ++++++++++++++++++++++++++++ Dockerfile.debian | 2 +- 3 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 Dockerfile diff --git a/.template/mandatory_files_ignore.txt b/.template/mandatory_files_ignore.txt index 156fcd6..5ddc433 100644 --- a/.template/mandatory_files_ignore.txt +++ b/.template/mandatory_files_ignore.txt @@ -8,7 +8,6 @@ scripts/script_utils/fastapi_app_location.py -Dockerfile config_schema.json example_config.yaml diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..0c01c4d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,50 @@ +# Copyright 2021 - 2024 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# BASE: a base image with updated packages +FROM python:3.12-alpine AS base +RUN apk upgrade --no-cache --available + +# BUILDER: a container to build the service wheel +FROM base AS builder +RUN pip install build +COPY . 
/service +WORKDIR /service +RUN python -m build + +# DEP-BUILDER: a container to (build and) install dependencies +FROM base AS dep-builder +RUN apk update +RUN apk add build-base gcc g++ libffi-dev zlib-dev +RUN apk upgrade --available +WORKDIR /service +COPY --from=builder /service/lock/requirements.txt /service +RUN pip install --no-deps -r requirements.txt + +# RUNNER: a container to run the service +FROM base AS runner +WORKDIR /service +RUN rm -rf /usr/local/lib/python3.12 +COPY --from=dep-builder /usr/local/lib/python3.12 /usr/local/lib/python3.12 +COPY --from=builder /service/dist/ /service +RUN pip install --no-deps *.whl +RUN rm *.whl +RUN adduser -D appuser +WORKDIR /home/appuser +USER appuser +ENV PYTHONUNBUFFERED=1 + +# Please adapt to package name: +ENTRYPOINT ["metldata"] diff --git a/Dockerfile.debian b/Dockerfile.debian index 33dbe10..4cd862b 100644 --- a/Dockerfile.debian +++ b/Dockerfile.debian @@ -45,4 +45,4 @@ USER appuser # set environment ENV PYTHONUNBUFFERED=1 # Please adapt to package name: -ENTRYPOINT ["my-microservice"] +ENTRYPOINT ["metldata"] From 93a002e751abbfc8a10afb8701696662017f2fce Mon Sep 17 00:00:00 2001 From: Thomas Zajac Date: Wed, 28 Aug 2024 11:30:03 +0200 Subject: [PATCH 15/15] Assumption reordering (#78) --- .../count_references/assumptions.py | 110 ++++++++++-------- 1 file changed, 64 insertions(+), 46 deletions(-) diff --git a/src/metldata/builtin_transformations/count_references/assumptions.py b/src/metldata/builtin_transformations/count_references/assumptions.py index 5258eb2..b6df458 100644 --- a/src/metldata/builtin_transformations/count_references/assumptions.py +++ b/src/metldata/builtin_transformations/count_references/assumptions.py @@ -22,6 +22,7 @@ ) from metldata.builtin_transformations.common.path.path import RelationPath from metldata.builtin_transformations.common.path.path_elements import ( + RelationPathElement, RelationPathElementType, ) from metldata.builtin_transformations.count_references.instruction 
import ( @@ -30,7 +31,49 @@ from metldata.transform.base import ModelAssumptionError, MultiplicityError -def validate_modification_class(path_element, expected_class_name): +def check_model_assumptions( + schema: SchemaPack, + instructions_by_class: dict[str, list[AddReferenceCountPropertyInstruction]], +) -> None: + """Check the model assumptions for the count references transformation.""" + for _, instructions in instructions_by_class.items(): + for instruction in instructions: + assert_only_direct_relations(instruction) + assert_class_is_source(instruction) + assert_path_classes_and_relations_exist( + schema, instruction.source_relation_path + ) + assert_multiplicity(schema, instruction.source_relation_path) + assert_object_path_exists(schema, instruction) + + +def assert_only_direct_relations(instruction: AddReferenceCountPropertyInstruction): + """Ensure that only direct relations are supported which should be the case if the + relation path only contains one path element. + """ + num_elements = len(instruction.source_relation_path.elements) + if num_elements != 1: + raise ModelAssumptionError( + f"The provided relation path { + instruction.source_relation_path.path_str}" + f"does not describe a direct relation, but contains { + num_elements} different relations" + ) + + +def assert_class_is_source(instruction: AddReferenceCountPropertyInstruction): + """Ensure that the class being modified with the reference count property is the expected class. + This function iterates over the elements of the relation path in the given instruction + and validates that the class being modified with the reference count property matches + the class specified in the relation path. 
+ """ + for path_element in instruction.source_relation_path.elements: + _validate_modification_class(path_element, instruction.class_name) + + +def _validate_modification_class( + path_element: RelationPathElement, expected_class_name: str +): """Check whether the class specified to be modified with the reference count matches the source or target class in the provided `path_element`, depending on the type of the relation path (i.e., active or passive). If the class does not match, @@ -49,31 +92,6 @@ def validate_modification_class(path_element, expected_class_name): ) -def check_class_exists(model: SchemaPack, class_name: str) -> None: - """Check if a class exists in the model and raise an error if not""" - if class_name not in model.classes: - raise ModelAssumptionError(f"Class {class_name} not found in model.") - - -def check_relation_exists(model: SchemaPack, class_name: str, relation: str): - """Check if a relation exists in a class and raise an error if not""" - if relation not in model.classes[class_name].relations: - raise ModelAssumptionError( - f"Relation property { - relation} not found in class {class_name}." - ) - - -def assert_class_is_source(instruction: AddReferenceCountPropertyInstruction): - """Ensure that the class being modified with the reference count property is the expected class. - This function iterates over the elements of the relation path in the given instruction - and validates that the class being modified with the reference count property matches - the class specified in the relation path. - """ - for path_element in instruction.source_relation_path.elements: - validate_modification_class(path_element, instruction.class_name) - - def assert_path_classes_and_relations_exist(model: SchemaPack, path: RelationPath): """Make sure that all classes and relations defined in the provided path exist in the provided model. 
@@ -83,14 +101,29 @@ def assert_path_classes_and_relations_exist(model: SchemaPack, path: RelationPat if the model does not fulfill the assumptions. """ for path_element in path.elements: - check_class_exists(model, path_element.source) - check_class_exists(model, path_element.target) + _check_class_exists(model, path_element.source) + _check_class_exists(model, path_element.target) if path_element.type_ == RelationPathElementType.ACTIVE: - check_relation_exists(model, path_element.source, path_element.property) + _check_relation_exists(model, path_element.source, path_element.property) if path_element.type_ == RelationPathElementType.PASSIVE: - check_relation_exists(model, path_element.target, path_element.property) + _check_relation_exists(model, path_element.target, path_element.property) + + +def _check_class_exists(model: SchemaPack, class_name: str) -> None: + """Check if a class exists in the model and raise an error if not""" + if class_name not in model.classes: + raise ModelAssumptionError(f"Class {class_name} not found in model.") + + +def _check_relation_exists(model: SchemaPack, class_name: str, relation: str): + """Check if a relation exists in a class and raise an error if not""" + if relation not in model.classes[class_name].relations: + raise ModelAssumptionError( + f"Relation property { + relation} not found in class {class_name}." + ) def assert_multiplicity(model: SchemaPack, path: RelationPath): @@ -107,7 +140,7 @@ def assert_multiplicity(model: SchemaPack, path: RelationPath): ) -def assert_summary_exists( +def assert_object_path_exists( model: SchemaPack, instruction: AddReferenceCountPropertyInstruction, ) -> None: @@ -141,18 +174,3 @@ def assert_summary_exists( instruction.target_content.property_name} does not exist" + f" in class {class_name}." 
) - - -def check_model_assumptions( - schema: SchemaPack, - instructions_by_class: dict[str, list[AddReferenceCountPropertyInstruction]], -) -> None: - """Check the model assumptions for the count references transformation.""" - for _, instructions in instructions_by_class.items(): - for instruction in instructions: - assert_class_is_source(instruction) - assert_path_classes_and_relations_exist( - schema, instruction.source_relation_path - ) - assert_multiplicity(schema, instruction.source_relation_path) - assert_summary_exists(schema, instruction)