Skip to content

Commit

Permalink
model and data transformation for count references
Browse files Browse the repository at this point in the history
  • Loading branch information
sbilge committed Aug 23, 2024
1 parent 17d93b7 commit c056370
Show file tree
Hide file tree
Showing 9 changed files with 257 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@

"Assumptions for count references transformation"


from schemapack.spec.schemapack import SchemaPack

from metldata.builtin_transformations.add_content_properties.path import (
Expand All @@ -28,15 +27,10 @@
from metldata.builtin_transformations.count_references.instruction import (
AddReferenceCountPropertyInstruction,
)
from metldata.transform.base import ModelAssumptionError

# TODO one more validation is required: "The transformation shall validate whether the
# target is defined with multiplicity and fail otherwise". Multiplicity is defined in
# the schemapack. Hence, it should be checked in the model assumptions.
from metldata.transform.base import ModelAssumptionError, MultiplicityError


def assert_class_is_source(instruction: AddReferenceCountPropertyInstruction
):
def assert_class_is_source(instruction: AddReferenceCountPropertyInstruction):
"""Make sure that the source class is the one being modified with the count property"""
if instruction.class_name != instruction.source_relation_path.source:
raise ModelAssumptionError(
Expand Down Expand Up @@ -73,18 +67,31 @@ def assert_path_classes_and_relations_exist(model: SchemaPack, path: RelationPat
)


def assert_multiplicity(model: SchemaPack, path: RelationPath) -> None:
    """Make sure the target of each active relation contributes multiple instances.

    Args:
        model: The model whose class relations are looked up.
        path: The relation path whose elements are checked.

    Raises:
        MultiplicityError:
            if the target side of an active relation on the path is not
            flagged as multiple in the model.
    """
    for path_element in path.elements:
        # Only active elements are looked up on their source class; every other
        # element type is skipped without a check.
        if path_element.type_ != RelationPathElementType.ACTIVE:
            continue

        relation = model.classes[path_element.source].relations[path_element.property]
        if not relation.multiple.target:
            # The replacement field is kept on a single line: a line break inside
            # the braces of a single-quoted f-string only parses on Python >= 3.12.
            raise MultiplicityError(
                f"The target of the relation {path_element.property}"
                + " does not contribute multiple instances to the relation."
            )


def assert_summary_exists(
schema: SchemaPack,
model: SchemaPack,
instruction: AddReferenceCountPropertyInstruction,
) -> None:
"""Make sure that the source class (the class being modified) and the object_path exists in the model."""
class_name = instruction.class_name
class_def = schema.classes.get(class_name)
class_def = model.classes.get(class_name)

# Check if the class exists in the model
if not class_def:
raise ModelAssumptionError(
f"Class {class_name} does not exist in the model.")
raise ModelAssumptionError(f"Class {class_name} does not exist in the model.")

# Check if the object_path already exists in the model
try:
Expand All @@ -98,21 +105,28 @@ def assert_summary_exists(
instruction.target_content.object_path} does not exist"
+ f" in class {class_name}."
) from err
if instruction.target_content.property_name in target_schema.get("properties", {}):

# Check if the property_name already exists in the model
if instruction.target_content.property_name not in target_schema.get(
"properties", {}
):
raise ModelAssumptionError(
f"Property {
instruction.target_content.property_name} already exists"
instruction.target_content.property_name} does not exist"
+ f" in class {class_name}."
)


def check_model_assumptions(
schema: SchemaPack, instructions: list[AddReferenceCountPropertyInstruction]
schema: SchemaPack,
instructions_by_class: dict[str, list[AddReferenceCountPropertyInstruction]],
) -> None:
"""Check the model assumptions for the count references transformation."""
for instruction in instructions:
assert_class_is_source(schema, instruction)
assert_path_classes_and_relations_exist(
schema, instruction.source_relation_path
)
assert_summary_exists(schema, instruction)
for _, instructions in instructions_by_class.items():
for instruction in instructions:
assert_class_is_source(instruction)
assert_path_classes_and_relations_exist(
schema, instruction.source_relation_path
)
assert_multiplicity(schema, instruction.source_relation_path)
assert_summary_exists(schema, instruction)
15 changes: 13 additions & 2 deletions src/metldata/builtin_transformations/count_references/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
# limitations under the License.
"""Models used to describe count content properties that shall be calculated and added."""

from typing import Any

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict

Expand All @@ -34,3 +32,16 @@ class CountReferencesConfig(BaseSettings):
description=("Description TODO"),
examples=[],
)

def instructions_by_class(
    self,
) -> dict[str, list[AddReferenceCountPropertyInstruction]]:
    """Group the configured instructions by the name of the class they refer to.

    Returns:
        A dictionary mapping each class name to the list of instructions whose
        `class_name` equals that name, in the order in which they occur in
        `count_references`.
    """
    grouped: dict[str, list[AddReferenceCountPropertyInstruction]] = {}
    for instruction in self.count_references:
        class_name = instruction.class_name
        # Open a new bucket the first time a class name is encountered.
        if class_name not in grouped:
            grouped[class_name] = []
        grouped[class_name].append(instruction)
    return grouped
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,53 @@

from schemapack.spec.datapack import DataPack

# from metldata.transform.base import EvitableTransformationError
from metldata.builtin_transformations.common.path.path_elements import (
RelationPathElementType,
)
from metldata.builtin_transformations.count_references.instruction import (
AddReferenceCountPropertyInstruction,
)
from metldata.transform.base import EvitableTransformationError


def count_references(*, data: DataPack) -> DataPack:
"""Count
def count_references(
    *,
    data: DataPack,
    instructions_by_class: dict[str, list[AddReferenceCountPropertyInstruction]],
) -> DataPack:
    """Count the references of resources and store the counts in their content.

    The counting is done on a deep copy obtained via `data.model_copy(deep=True)`;
    that copy is what gets returned.

    Args:
        data:
            The datapack to add the reference count values to.
        instructions_by_class:
            A dictionary mapping class names to lists of instructions.

    Returns:
        The copied data with the reference counts added.

    Raises:
        EvitableTransformationError:
            if there are no resources for one of the class names, or if a
            relation path contains an element that is not of the active type.
    """
    result = data.model_copy(deep=True)

    for class_name, class_instructions in instructions_by_class.items():
        class_resources = result.resources.get(class_name)
        if not class_resources:
            raise EvitableTransformationError()

        for instruction in class_instructions:
            # NOTE(review): every element of the path is visited, but only the
            # property of the last one is used for the lookup below - confirm
            # that paths are expected to consist of a single element.
            for element in instruction.source_relation_path.elements:
                if element.type_ != RelationPathElementType.ACTIVE:
                    raise EvitableTransformationError()
                relation_slot = element.property

            for resource in class_resources.values():
                referenced = resource.relations.get(relation_slot)
                # A missing or empty relation counts as zero references.
                count = len(referenced) if referenced else 0

                # TODO modifications
                target = instruction.target_content
                resource.content[target.object_path].update(
                    {target.property_name: count}
                )

    return result
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,5 @@ class AddReferenceCountPropertyInstruction(BaseSettings):
)
source_relation_path: RelationPath = Field(
...,
description="The path describing the relation between the classes if a metadata model.",
description="The path describing the relation between the classes of a metadata model.",
)
13 changes: 10 additions & 3 deletions src/metldata/builtin_transformations/count_references/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
from metldata.builtin_transformations.count_references.data_transform import (
count_references,
)
from metldata.builtin_transformations.count_references.model_transform import (
add_count_references,
)
from metldata.transform.base import DataTransformer, TransformationDefinition


Expand All @@ -39,7 +42,9 @@ def transform(self, data: DataPack) -> DataPack:
Args:
data: The data as DataPack to be transformed.
"""
return count_references(data=data)
return count_references(
data=data, instructions_by_class=self._config.instructions_by_class()
)


def check_model_assumptions_wrapper(
Expand All @@ -51,7 +56,9 @@ def check_model_assumptions_wrapper(
ModelAssumptionError:
if the model does not fulfill the assumptions.
"""
check_model_assumptions(schema=model, instructions=config.count_references)
check_model_assumptions(
schema=model, instructions_by_class=config.instructions_by_class()
)


def transform_model(model: SchemaPack, config: CountReferencesConfig) -> SchemaPack:
Expand All @@ -62,7 +69,7 @@ def transform_model(model: SchemaPack, config: CountReferencesConfig) -> SchemaP
if the transformation fails.
"""
return add_count_references(
model=model, instructions_by_class=config.count_references
model=model, instructions_by_class=config.instructions_by_class()
)


Expand Down
4 changes: 4 additions & 0 deletions src/metldata/transform/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ class ModelAssumptionError(RuntimeError):
"""Raised when assumptions made by transformation step about a model are not met."""


class MultiplicityError(ModelAssumptionError):
    """Raised when a relation does not conform to the expected multiplicity."""


class ModelTransformationError(RuntimeError):
"""Raised when a transformation failed when applied to the schemapack-based model.
This exception should only be raised when the error could not have been caught
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
datapack: 0.3.0
resources:
File:
file_a:
content:
filename: file_a.fastq
format: FASTQ
checksum: 1a5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92b8
size: 12321
file_b:
content:
filename: file_b.fastq
format: FASTQ
checksum: 2b5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92c9
size: 12314
file_c:
content:
filename: file_c.fastq
format: FASTQ
checksum: a9c24870071da03f78515e6197048f3a2172e90e597e9250cd01a0cb8f0986ed
size: 12123
Dataset:
dataset_1:
content:
dac_contact: dac@example.org
file_summary: # <-
count: 0
relations:
files:
- file_a
- file_b
- file_c
Sample:
sample_x:
content:
description: Some sample.
file_summary: # <-
count: 0
relations:
files:
- file_a
- file_b
sample_y:
content:
file_summary: # <-
count: 0
relations:
files:
- file_c
Experiment:
experiment_i:
content:
sample_summary: # <-
count: 0
relations:
samples:
- sample_x
- sample_y
Loading

0 comments on commit c056370

Please sign in to comment.