migrated transformation handling
KerstenBreuer committed Dec 7, 2023
1 parent f77a70a commit 9446da6
Showing 13 changed files with 403 additions and 290 deletions.
@@ -19,7 +19,6 @@
 from schemapack.spec.datapack import DataPack
 from schemapack.spec.schemapack import SchemaPack
 
-from metldata.event_handling.models import SubmissionAnnotation
 from metldata.schemapack_.builtin_transformations.null.config import NullConfig
 from metldata.schemapack_.transform.base import (
     DataTransformer,
@@ -41,14 +40,11 @@ class NullTransformer(DataTransformer[NullConfig]):
     """A Null transformer that returns the input model and data unchanged. Useful e.g.
     for testing."""
 
-    def transform(
-        self, *, data: DataPack, annotation: SubmissionAnnotation
-    ) -> DataPack:
+    def transform(self, data: DataPack) -> DataPack:
         """Transforms data.
 
         Args:
             data: The data as DataPack to be transformed.
-            annotation: The annotation on the data.
 
         Raises:
             DataTransformationError:
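With the annotation parameter gone, a null transformation is exercised with data only. A minimal sketch of the new call contract; the no-argument NullConfig() and the model/data values are assumptions for illustration, not part of this commit:

    # Hedged sketch: NullConfig() without arguments and the fixture values
    # (a SchemaPack `model` and a DataPack `data`) are assumed, not shown here.
    transformer = NullTransformer(
        config=NullConfig(),
        original_model=model,
        transformed_model=model,  # the null transformation leaves the model unchanged
    )
    transformed = transformer.transform(data)  # no `annotation` argument anymore
    assert transformed == data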
9 changes: 4 additions & 5 deletions src/metldata/schemapack_/transform/base.py
@@ -33,7 +33,9 @@
 from schemapack.spec.datapack import DataPack
 from schemapack.spec.schemapack import SchemaPack
 
-from metldata.event_handling.models import SubmissionAnnotation
+
+class ModelAssumptionError(RuntimeError):
+    """Raised when assumptions made by transformation step about a model are not met."""
 
 
 class ModelTransformationError(RuntimeError):
@@ -65,14 +67,11 @@ def __init__(
         self._transformed_model = transformed_model
 
     @abstractmethod
-    def transform(
-        self, *, data: DataPack, annotation: SubmissionAnnotation
-    ) -> DataPack:
+    def transform(self, data: DataPack) -> DataPack:
         """Transforms data.
 
         Args:
             data: The data as DataPack to be transformed.
-            annotation: The annotation on the data.
 
         Raises:
             DataTransformationError:
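The abstract base now fixes a data-only contract for every transformer. A sketch of a concrete subclass under the new signature; IdentityConfig and IdentityTransformer are illustrative names, not part of the codebase:

    class IdentityTransformer(DataTransformer[IdentityConfig]):  # IdentityConfig: assumed Config subclass
        """Illustrative transformer that passes data through unchanged."""

        def transform(self, data: DataPack) -> DataPack:
            # A real implementation would derive a new DataPack here and
            # raise DataTransformationError if the transformation fails.
            return data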
79 changes: 34 additions & 45 deletions src/metldata/schemapack_/transform/handling.py
@@ -17,12 +17,11 @@
 """Logic for handling Transformation."""
 
 from pydantic import BaseModel, ConfigDict
+from schemapack.spec.datapack import DataPack
+from schemapack.spec.schemapack import SchemaPack
+from schemapack.validation import SchemaPackValidator
 
-from metldata.custom_types import Json
-from metldata.event_handling.models import SubmissionAnnotation
-from metldata.model_utils.essentials import MetadataModel
-from metldata.model_utils.metadata_validator import MetadataValidator
-from metldata.transform.base import (
+from metldata.schemapack_.transform.base import (
     Config,
     TransformationDefinition,
     WorkflowConfig,
@@ -38,7 +37,7 @@ class WorkflowConfigMismatchError(RuntimeError):
     """
 
     def __init__(
-        self, workflow_definition: WorkflowDefinition, workflow_config: Config
+        self, workflow_definition: WorkflowDefinition, workflow_config: BaseModel
     ):
         """Initialize the error with the workflow definition and the config."""
         message = (
@@ -56,7 +55,7 @@ def __init__(
         self,
         transformation_definition: TransformationDefinition[Config],
         transformation_config: Config,
-        original_model: MetadataModel,
+        original_model: SchemaPack,
     ):
         """Initialize the TransformationHandler by checking the assumptions made on the
         original model and transforming the model as described in the transformation
@@ -75,41 +74,39 @@
         self.transformed_model = self._definition.transform_model(
             self._original_model, self._config
         )
-        self._metadata_transformer = self._definition.metadata_transformer_factory(
+        self._data_transformer = self._definition.data_transformer_factory(
             config=self._config,
             original_model=self._original_model,
             transformed_model=self.transformed_model,
         )
 
-        self._original_metadata_validator = MetadataValidator(
-            model=self._original_model
+        self._original_data_validator = SchemaPackValidator(
+            schemapack=self._original_model
         )
-        self._transformed_metadata_validator = MetadataValidator(
-            model=self.transformed_model
+        self._transformed_data_validator = SchemaPackValidator(
+            schemapack=self.transformed_model
         )
 
-    def transform_metadata(
-        self, metadata: Json, *, annotation: SubmissionAnnotation
-    ) -> Json:
-        """Transforms metadata using the transformation definition. Validates the
-        original metadata against the original model and the transformed metadata
+    def transform_data(self, data: DataPack) -> DataPack:
+        """Transforms data using the transformation definition. Validates the
+        original data against the original model and the transformed data
         against the transformed model.
 
         Args:
-            metadata: The metadata to be transformed.
-            annotation: The annotation on the metadata.
+            data: The data to be transformed.
 
         Raises:
-            MetadataTransformationError:
-                if the transformation fails.
+            schemapack.exceptions.ValidationError:
+                If validation of input data or transformed data fails against the
+                original or transformed model, respectively.
+            DataTransformationError:
+                if the transformation fails.
         """
-        self._original_metadata_validator.validate(metadata)
-        transformed_metadata = self._metadata_transformer.transform(
-            metadata=metadata, annotation=annotation
-        )
-        self._transformed_metadata_validator.validate(transformed_metadata)
+        self._original_data_validator.validate(datapack=data)
+        transformed_data = self._data_transformer.transform(data=data)
+        self._transformed_data_validator.validate(datapack=transformed_data)
 
-        return transformed_metadata
+        return transformed_data
 
 
 class ResolvedWorkflowStep(WorkflowStepBase):
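The handler now sandwiches the transformer call between two schemapack validations. A usage sketch; the definition, config, and fixture names are assumptions for illustration:

    handler = TransformationHandler(
        transformation_definition=NULL_TRANSFORMATION,  # assumed TransformationDefinition
        transformation_config=NullConfig(),             # assumed matching config
        original_model=model,                           # a SchemaPack instance (assumed fixture)
    )
    # Validates `data` against the original model, transforms it, then validates
    # the result against handler.transformed_model; validation failures raise
    # schemapack.exceptions.ValidationError, transformation failures
    # DataTransformationError.
    transformed = handler.transform_data(data)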
@@ -147,7 +144,7 @@ def resolve_workflow_step(
     step_name: str,
     workflow_definition: WorkflowDefinition,
     workflow_config: WorkflowConfig,
-    original_model: MetadataModel,
+    original_model: SchemaPack,
 ) -> ResolvedWorkflowStep:
     """Translates a workflow step given a workflow definition and a workflow config
     into a resolved workflow step.
@@ -171,7 +168,7 @@ def resolve_workflow_step(
 
 def resolve_workflow(
     workflow_definition: WorkflowDefinition,
-    original_model: MetadataModel,
+    original_model: SchemaPack,
     workflow_config: WorkflowConfig,
 ) -> ResolvedWorkflow:
     """Translates a workflow definition given an input model and a workflow config into
@@ -225,10 +222,10 @@ def __init__(
         self,
         workflow_definition: WorkflowDefinition,
         workflow_config: WorkflowConfig,
-        original_model: MetadataModel,
+        original_model: SchemaPack,
     ):
         """Initialize the WorkflowHandler with a workflow deinition, a matching
-        config, and a metadata model. The workflow definition is translated into a
+        config, and a model. The workflow definition is translated into a
         resolved workflow.
         """
         self._resolved_workflow = resolve_workflow(
@@ -241,25 +238,17 @@ def __init__(
             self._resolved_workflow
         )
 
-    def run(
-        self, *, metadata: Json, annotation: SubmissionAnnotation
-    ) -> dict[str, Json]:
-        """Run the workflow definition on metadata and its annotation to generate
-        artifacts.
-        """
-        transformed_metadata: dict[str, Json] = {}
+    def run(self, *, data: DataPack) -> dict[str, DataPack]:
+        """Run the workflow definition on data to generate artifacts."""
+        transformed_data: dict[str, DataPack] = {}
         for step_name in self._resolved_workflow.step_order:
             step = self._resolved_workflow.steps[step_name]
-            input_metadata = (
-                metadata if step.input is None else transformed_metadata[step.input]
-            )
-            transformed_metadata[
-                step_name
-            ] = step.transformation_handler.transform_metadata(
-                input_metadata, annotation=annotation
+            input_data = data if step.input is None else transformed_data[step.input]
+            transformed_data[step_name] = step.transformation_handler.transform_data(
+                input_data
             )
 
         return {
-            artifact_name: transformed_metadata[step_name]
+            artifact_name: transformed_data[step_name]
             for artifact_name, step_name in self._resolved_workflow.artifacts.items()
         }
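Each step now consumes either the workflow input or a predecessor's output DataPack, and the artifact mapping picks the published results. A usage sketch; the workflow definition, config, and fixtures are illustrative assumptions:

    workflow_handler = WorkflowHandler(
        workflow_definition=EXAMPLE_WORKFLOW,  # assumed WorkflowDefinition
        workflow_config=example_config,        # assumed matching WorkflowConfig
        original_model=model,                  # a SchemaPack instance (assumed fixture)
    )
    artifacts = workflow_handler.run(data=input_data)  # dict[str, DataPack]
    for artifact_name in artifacts:
        print(artifact_name)  # one transformed DataPack per published artifact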
95 changes: 0 additions & 95 deletions src/metldata/schemapack_/transform/main.py

This file was deleted.

16 changes: 16 additions & 0 deletions tests/schemapack_/fixtures/__init__.py
@@ -0,0 +1,16 @@
# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
# for the German Human Genome-Phenome Archive (GHGA)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Fixtures that are used in both integration and unit tests"""
34 changes: 34 additions & 0 deletions tests/schemapack_/fixtures/data.py
@@ -0,0 +1,34 @@
# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
# for the German Human Genome-Phenome Archive (GHGA)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Valid and invalid metadata examples using the minimal model."""

from schemapack.load import load_datapack
from schemapack.spec.datapack import DataPack

from tests.schemapack_.fixtures.utils import BASE_DIR

EXAMPLE_DATA_DIR = BASE_DIR / "example_data"


def _get_example_data(name: str) -> DataPack:
"""Get example metadata."""

return load_datapack(EXAMPLE_DATA_DIR / f"{name}.datapack.yaml")


VALID_MINIMAL_DATA = _get_example_data("valid_minimal")
INVALID_MINIMAL_DATA = _get_example_data("invalid_minimal")
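These fixtures slot directly into the validator used by the handler above. A sketch of a test using them; the MINIMAL_MODEL fixture is hypothetical, assumed to load the matching minimal schemapack:

    import pytest
    from schemapack.exceptions import ValidationError
    from schemapack.validation import SchemaPackValidator

    from tests.schemapack_.fixtures.data import INVALID_MINIMAL_DATA, VALID_MINIMAL_DATA


    def test_minimal_examples():
        validator = SchemaPackValidator(schemapack=MINIMAL_MODEL)  # hypothetical model fixture
        validator.validate(datapack=VALID_MINIMAL_DATA)  # passes silently
        with pytest.raises(ValidationError):  # missing `size` makes this one fail
            validator.validate(datapack=INVALID_MINIMAL_DATA)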
@@ -0,0 +1,19 @@
# Misses content property defined in the content schema:
datapack: 0.1.0
resources:
  File:
    example_file_a:
      content:
        alias: example_file_a
        filename: example_file_a.fastq
        format: FASTQ
        checksum: 1a5ac10ab42911dc0224172c118a326d9a4c03969112a2f3eb1ad971e96e92b8
        # missing size property
  Dataset:
    example_dataset:
      content:
        alias: example_dataset
        dac_contact: dac@example.org
      relations:
        files:
          - example_file_a