From 85a27aced11843410e674ed9934fe8a1a0ac9c4b Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 15 Apr 2024 14:05:38 +0200 Subject: [PATCH 01/58] hopsworks_udf first version --- python/hsfs/hopsworks_udf.py | 264 +++++++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 python/hsfs/hopsworks_udf.py diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py new file mode 100644 index 0000000000..5be41a2ed5 --- /dev/null +++ b/python/hsfs/hopsworks_udf.py @@ -0,0 +1,264 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import ast +import inspect +import warnings +from typing import Callable, List, Union + +from hsfs import engine +from hsfs.client.exceptions import FeatureStoreException + + +def hopsworks_udf(return_type: Union[List[type], type]): + def wrapper(func: Callable): + udf = HopsworksUdf(func=func, return_type=return_type) + return udf + + return wrapper + + +class HopsworksUdf: + """ + Metadata class to store information about UDF + """ + + PYTHON_SPARK_TYPE_MAPPING = { + str: "string", + int: "int", + float: "float", + # "timestamp": TimestampType(), + bool: "boolean", + # "date": DateType(), + # "binary": BinaryType(), + } + + def __init__( + self, func: Callable, return_type: Union[List[type], type], name: str = None + ): + self.udf_function: Callable = func + if name is None: + self.function_name: str = func.__name__ + else: + self.function_name: str = name + self.return_type: Union[List[type], type] = return_type + self.function_source: str = self._remove_argument( + HopsworksUdf._extract_source_code(self.udf_function), "statistics" + ) + # TODO : Add a getter functions + self.transformation_features: List[str] = ( + HopsworksUdf._extract_function_arguments(self.function_source) + ) + HopsworksUdf.validate_arguments(self.return_type) + + def get_transformation_features(self): + return self.transformation_features + + @staticmethod + def validate_arguments(return_type): + if isinstance(return_type, list): + for python_type in return_type: + if not isinstance(python_type, type): + raise FeatureStoreException( + f'Return types provided must be a python type or a list of python types. "{python_type}" is not python type' + ) + else: + if not isinstance(return_type, type): + raise FeatureStoreException( + f'Return types provided must be a python type or a list of python types. 
"{return_type}" is not python type or a list' + ) + + @staticmethod + def _get_module_imports(path): + imports = [] + with open(path) as fh: + root = ast.parse(fh.read(), path) + + for node in ast.iter_child_nodes(root): + if isinstance(node, ast.Import): + imported_module = False + elif isinstance(node, ast.ImportFrom): + imported_module = node.module + else: + continue + + for n in node.names: + if imported_module: + import_line = "from " + imported_module + " import " + n.name + elif n.asname: + import_line = "import " + n.name + " as " + n.asname + else: + import_line = "import " + n.name + imports.append(import_line) + return imports + + @staticmethod + def _get_module_path(module_name): + def _get_module_path(module): + return module.__file__ + + module_path = {} + exec( + f'import {module_name}\nmodule_path["path"] = _get_module_path({module_name})' + ) + return module_path["path"] + + @staticmethod + def _extract_source_code(udf_function): + if not callable(udf_function): + # TODO : Think about a better text for the raised error + raise ValueError("transformation function must be callable") + + try: + module_imports = HopsworksUdf._get_module_imports( + HopsworksUdf._get_module_path(udf_function.__module__) + ) + except Exception: + module_imports = "" + # TODO : Check if warning is actually required. + warnings.warn( + "Passed UDF defined in a Jupyter notebook. Cannot extract dependices from a notebook. Please make sure to import all dependcies for the UDF inside the code.", + stacklevel=2, + ) + + function_code = inspect.getsource(udf_function) + source_code = "\n".join(module_imports) + "\n" + function_code + + return source_code + + @staticmethod + def _extract_function_arguments(source_code): + # Get source code of the original function + source_code = source_code.split("\n") + + # Find the line where the function signature is defined + for i, line in enumerate(source_code): + if line.strip().startswith("def "): + signature_line = i + break + + # Parse the function signature to remove the specified argument + signature = source_code[signature_line] + arg_list = signature.split("(")[1].split(")")[0].split(",") + arg_list = [arg.strip() for arg in arg_list] + return arg_list + + def _remove_argument(self, source_code: str, arg_to_remove: str): + """ " + Function to remove statistics arguments from passed udf and type hinting. + Statistics arguments are removed since pandas UDF's do not accept extra arguments. + Statistics parameters are dynamically injected into the function scope. 
+ """ + + # Get source code of the original function + source_code = source_code.split("\n") + + # Find the line where the function signature is defined + for i, line in enumerate(source_code): + if line.strip().startswith("def "): + signature_line = i + break + + # Parse the function signature to remove the specified argument + signature = source_code[signature_line] + arg_list = signature.split("(")[1].split(")")[0].split(",") + arg_list = [ + arg.split(":")[0].strip() + for arg in arg_list + if ( + arg_to_remove not in list(map(str.strip, arg.split(" "))) + and arg_to_remove not in list(map(str.strip, arg.split(":"))) + and arg.strip() != arg_to_remove + ) + ] + + # Reconstruct the function signature + new_signature = ( + signature.split("(")[0] + + "(" + + ", ".join(arg_list) + + ")" + + signature.split(")")[1] + ) + + # Modify the source code to reflect the changes + source_code[signature_line] = new_signature + + # Removing test before function signatre since they are decorators + source_code = source_code[signature_line:] + + # Reconstruct the modified function as a string + modified_source = "\n".join(source_code) + + # Define a new function with the modified source code + return modified_source + + @staticmethod + def get_spark_type(python_type: type): + return HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[python_type] + + def create_pandas_udf_return_schema_from_list(self, return_types: List[type]): + return ", ".join( + [ + f'`{self.function_name}({",".join(self.transformation_features)})_{i}` {HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types[i]]}' + for i in range(len(return_types)) + ] + ) + + def hopsworksUdf_wrapper(self, **statistics): + # TODO : clean this up + if isinstance(self.return_type, List): + self.function_source = "\t".join(self.function_source.splitlines(True)) + self.code = f"""def renaming_wrapper(*args): + import pandas as pd + {self.function_source} + df = {self.function_name}(*args) + #raise Exception({{f'{{df.columns[i]}}':f'{self.function_name}{",".join(self.transformation_features)}_{{i}}' for i in range(len(df.columns))}}) + df = df.rename(columns = {{f'{{df.columns[i]}}':f'{self.function_name}({",".join(self.transformation_features)})_{{i}}' for i in range(len(df.columns))}}) + return df""" + else: + self.code = self.function_source + scope = __import__("__main__").__dict__ + scope.update(**statistics) + exec(self.code, scope) + if isinstance(self.transformation_features, List): + return eval("renaming_wrapper", scope) + else: + return eval(self.function_name, scope) + + def __call__(self, *args: List[str]): + for arg in args: + if not isinstance(arg, str): + raise FeatureStoreException( + f'Feature names provided must be string "{arg}" is not string' + ) + + self.transformation_features = list(args) + return self + + def get_udf(self, statistics): + if engine.get_type() in ["hive", "python", "training"]: + return self.hopsworksUdf_wrapper(statistics=statistics) + else: + from pyspark.sql.functions import pandas_udf + + # TODO : Make this proper + return pandas_udf( + f=self.hopsworksUdf_wrapper(statistics=statistics), + returnType=self.create_pandas_udf_return_schema_from_list( + self.return_type + ), + ) From 9e4478bab90b8ee386819520a826f3c6b8da64e2 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Wed, 17 Apr 2024 15:26:40 +0200 Subject: [PATCH 02/58] working code for running hopsworks udf without saving in backend using python client --- python/hsfs/constructor/query.py | 2 +- python/hsfs/core/feature_view_engine.py | 19 +- 
.../core/transformation_function_engine.py | 6 + python/hsfs/engine/python.py | 89 ++++++--- python/hsfs/engine/spark.py | 90 +++++---- python/hsfs/feature_store.py | 10 +- python/hsfs/feature_view.py | 34 ++-- python/hsfs/hopsworks_udf.py | 6 +- python/hsfs/training_dataset_feature.py | 20 +- python/hsfs/transformation_function.py | 176 +++--------------- 10 files changed, 186 insertions(+), 266 deletions(-) diff --git a/python/hsfs/constructor/query.py b/python/hsfs/constructor/query.py index e305e8ca5a..5e527b6f13 100644 --- a/python/hsfs/constructor/query.py +++ b/python/hsfs/constructor/query.py @@ -59,7 +59,7 @@ def __init__( fg_mod.ExternalFeatureGroup, fg_mod.SpineGroup, ], - left_features: List[Union[str, "Feature"]], + left_features: List[Union[str, "Feature", Dict]], feature_store_name: Optional[str] = None, feature_store_id: Optional[int] = None, left_feature_group_start_time: Optional[Union[str, int, date, datetime]] = None, diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index dd49fa5e21..770a772af6 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -120,9 +120,9 @@ def save(self, feature_view_obj): ) ) - self._transformation_function_engine.attach_transformation_fn(feature_view_obj) + # TODO : Remove this code portion attaches a transfromation function to a feature. This is not possible with the current implementation + # self._transformation_function_engine.attach_transformation_fn(feature_view_obj) updated_fv = self._feature_view_api.post(feature_view_obj) - self.attach_transformation_function(updated_fv) print( "Feature view created successfully, explore it at \n" + self._get_feature_view_url(updated_fv) @@ -136,25 +136,10 @@ def update(self, feature_view_obj): def get(self, name, version=None): if version: fv = self._feature_view_api.get_by_name_version(name, version) - self.attach_transformation_function(fv) else: fv = self._feature_view_api.get_by_name(name) - for _fv in fv: - self.attach_transformation_function(_fv) return fv - def attach_transformation_function(self, fv: "feature_view.FeatureView"): - fv.transformation_functions = ( - self._transformation_function_engine.get_fv_attached_transformation_fn( - fv.name, fv.version - ) - ) - if fv.transformation_functions: - for feature in fv.schema: - feature.transformation_function = fv.transformation_functions.get( - feature.name, None - ) - def delete(self, name, version=None): if version: return self._feature_view_api.delete_by_name_version(name, version) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 4d1db1df04..beeceb2bd8 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -111,6 +111,7 @@ def get_td_transformation_fn(self, training_dataset): @staticmethod def attach_transformation_fn(training_dataset_obj=None, feature_view_obj=None): + # TODO : Remove transformation function attached to training dataset object and features if training_dataset_obj: target_obj = training_dataset_obj # todo why provide td and fv just to convert to target_obj? 
else: @@ -150,6 +151,7 @@ def is_builtin(self, transformation_fn_instance): def populate_builtin_fn_arguments( feature_name, transformation_function_instance, feature_descriptive_stats ): + # TODO : Make this statistics if transformation_function_instance.name == "min_max_scaler": min_value, max_value = BuiltInTransformationFunction.min_max_scaler_stats( feature_descriptive_stats, feature_name @@ -194,6 +196,7 @@ def populate_builtin_fn_arguments( def populate_builtin_attached_fns( self, attached_transformation_fns, feature_descriptive_stats ): + # TODO : Remove for ft_name in attached_transformation_fns: if self.is_builtin(attached_transformation_fns[ft_name]): # check if its built-in transformation function and populated with statistics arguments @@ -207,6 +210,7 @@ def populate_builtin_attached_fns( @staticmethod def infer_spark_type(output_type): + # TODO : Move to hopsworks_udf if not output_type: return "STRING" # STRING is default type for spark udfs @@ -265,6 +269,8 @@ def compute_transformation_fn_statistics( def populate_builtin_transformation_functions( training_dataset, feature_view_obj, dataset ): + return + # TODO : Remove # check if there any transformation functions that require statistics attached to td features builtin_tffn_label_encoder_features = [ ft_name diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 8e64e6ec95..9e256c322d 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -30,7 +30,7 @@ from datetime import datetime, timezone from io import BytesIO from pathlib import Path -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, TYPE_CHECKING import avro import boto3 @@ -48,7 +48,6 @@ feature, feature_store, feature_view, - transformation_function_attached, util, ) from hsfs import storage_connector as sc @@ -81,6 +80,10 @@ from tqdm.auto import tqdm +if TYPE_CHECKING: + from hsfs.transformation_function import TransformationFunction + + # Disable pyhive INFO logging logging.getLogger("pyhive").setLevel(logging.WARNING) @@ -893,6 +896,7 @@ def get_training_data( df = query_obj.read( read_options=read_options, dataframe_type=dataframe_type ) + # TODO : Add statistics transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions( training_dataset_obj, feature_view_obj, df ) @@ -1228,39 +1232,70 @@ def add_file(self, file: Optional[str]) -> Optional[str]: def _apply_transformation_function( self, - transformation_functions: Dict[ - str, transformation_function_attached.TransformationFunctionAttached - ], + transformation_functions: List[TransformationFunction], dataset: Union[pd.DataFrame, pl.DataFrame], ) -> Union[pd.DataFrame, pl.DataFrame]: - for ( - feature_name, - transformation_fn, - ) in transformation_functions.items(): + transformed_features = set() + for transformation_function in transformation_functions: + hopsworks_udf = transformation_function.hopsworks_udf + missing_features = set(hopsworks_udf.transformation_features) - set( + dataset.columns + ) + + # TODO : Add documentation link in exception + if missing_features: + raise FeatureStoreException( + f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly. Refer .." 
+ ) + + transformed_features.update( + transformation_function.hopsworks_udf.transformation_features + ) + if isinstance(dataset, pl.DataFrame) or isinstance( dataset, pl.dataframe.frame.DataFrame ): - dataset = dataset.with_columns( - pl.col(feature_name).map_elements( - transformation_fn.transformation_fn - ) - ) + pass else: - dataset[feature_name] = dataset[feature_name].map( - transformation_fn.transformation_fn - ) - # The below functions is not required for Polars since polars does have object types like pandas - if not ( - isinstance(dataset, pl.DataFrame) - or isinstance(dataset, pl.dataframe.frame.DataFrame) - ): - offline_type = Engine.convert_spark_type_to_offline_type( - transformation_fn.output_type - ) - dataset[feature_name] = Engine._cast_column_to_offline_type( - dataset[feature_name], offline_type + dataset = pd.concat( + [ + dataset, + transformation_function.hopsworks_udf.get_udf(statistics=None)( + *( + [ + dataset[feature] + for feature in transformation_function.hopsworks_udf.transformation_features + ] + ) + ), + ], + axis=1, ) + # TODO : Think about what to do in cases where the output is a polars dataframe..... + # if isinstance(dataset, pl.DataFrame) or isinstance( + # dataset, pl.dataframe.frame.DataFrame + # ): + # dataset = dataset.with_columns( + # pl.col(feature_name).map_elements( + # transformation_fn.transformation_fn + # ) + # ) + # else: + + # TODO : Think if below code is actually required + # The below functions is not required for Polars since polars does have object types like pandas + # if not ( + # isinstance(dataset, pl.DataFrame) + # or isinstance(dataset, pl.dataframe.frame.DataFrame) + # ): + # offline_type = Engine.convert_spark_type_to_offline_type( + # transformation_fn.output_type + # ) + # dataset[feature_name] = Engine._cast_column_to_offline_type( + # dataset[feature_name], offline_type + # ) + dataset = dataset.drop(transformed_features, axis=1) return dataset @staticmethod diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index b9f8621cfc..0b69abecdd 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -23,13 +23,16 @@ import shutil import warnings from datetime import date, datetime, timezone -from typing import Any, List, Optional, TypeVar, Union +from typing import Any, List, Optional, TypeVar, Union, TYPE_CHECKING import avro import numpy as np import pandas as pd import tzlocal +if TYPE_CHECKING: + from hsfs.transformation_function import TransformationFunction + # in case importing in %%local from hsfs.core.vector_db_client import VectorDbClient @@ -586,6 +589,7 @@ def write_training_dataset( feature_view_obj=None, to_df=False, ): + print("[SPARK] write_training_dataset") write_options = self.write_options( training_dataset.data_format, user_write_options ) @@ -810,6 +814,7 @@ def _write_training_dataset_single( path, to_df=False, ): + print("[SPARK] _write_training_dataset_single") # apply transformation functions (they are applied separately to each split) feature_dataframe = self._apply_transformation_function( transformation_functions, dataset=feature_dataframe @@ -1162,23 +1167,42 @@ def add_cols_to_delta_table(self, feature_group, new_features): "spark.databricks.delta.schema.autoMerge.enabled", "true" ).save(feature_group.location) - def _apply_transformation_function(self, transformation_functions, dataset): + def _apply_transformation_function( + self, transformation_functions: List[TransformationFunction], dataset + ): # generate transformation function expressions - 
transformed_feature_names = [] - transformation_fn_expressions = [] - for ( - feature_name, - transformation_fn, - ) in transformation_functions.items(): - fn_registration_name = ( - transformation_fn.name - + "_" - + str(transformation_fn.version) - + "_" - + feature_name + print("[SPARK] _apply_transformation_function") + transformed_features = set() + transformations = [] + transformation_features = [] + explode_name = [] + for transformation_function in transformation_functions: + hopsworks_udf = transformation_function.hopsworks_udf + missing_features = set(hopsworks_udf.transformation_features) - set( + dataset.columns + ) + + # TODO : Add documentation link in exception + if missing_features: + raise FeatureStoreException( + f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly. Refer .." + ) + + transformed_features.update( + transformation_function.hopsworks_udf.transformation_features ) - def timezone_decorator(func, trans_fn=transformation_fn): + # TODO : Add statistics + pandas_udf = hopsworks_udf.get_udf(None) + transformations.append(pandas_udf) + transformation_features.append(hopsworks_udf.transformation_features) + + if isinstance(hopsworks_udf.return_type, List): + explode_name.append( + f'{pandas_udf.__name__}({", ".join(hopsworks_udf.transformation_features)}).*' + ) + + def timezone_decorator(func, trans_fn=hopsworks_udf): if trans_fn.output_type != "TIMESTAMP": return func @@ -1200,29 +1224,27 @@ def decorated_func(x): return decorated_func - self._spark_session.udf.register( - fn_registration_name, - timezone_decorator(transformation_fn.transformation_fn), - transformation_fn.output_type, - ) - transformation_fn_expressions.append( - "{fn_name:}({name:}) AS {name:}".format( - fn_name=fn_registration_name, name=feature_name - ) - ) - transformed_feature_names.append(feature_name) + # TODO : Timezone aware check see if I need to do also. 
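The commented-out `udf.register` call below is the remnant of the old `selectExpr`-based path; its replacement applies each pandas UDF directly as a column expression and, for multi-output UDFs, expands the struct result with `.*` via the `explode_name` entries built above. A minimal, self-contained sketch of that mechanism, with toy column names and a local Spark session assumed:

    import pandas as pd
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import pandas_udf

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ["a", "b"])

    # one pandas UDF producing two output columns as a struct
    @pandas_udf("`f_0` double, `f_1` double")
    def add_and_diff(a: pd.Series, b: pd.Series) -> pd.DataFrame:
        return pd.DataFrame({"f_0": a + b, "f_1": a - b})

    untransformed = [c for c in df.columns if c not in {"a", "b"}]
    result = (
        df.select(*untransformed, add_and_diff("a", "b").alias("out"))
          .select(*untransformed, "out.*")  # the `.*` expansion mirrors explode_name
    )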
+ # self._spark_session.udf.register( + # fn_registration_name, + # timezone_decorator(transformation_fn.transformation_fn), + # transformation_fn.output_type, + # ) # generate non transformation expressions - no_transformation_expr = [ - "{name:} AS {name:}".format(name=col_name) - for col_name in dataset.columns - if col_name not in transformed_feature_names - ] # generate entire expression and execute it - transformation_fn_expressions.extend(no_transformation_expr) - transformed_dataset = dataset.selectExpr(*transformation_fn_expressions) - return transformed_dataset.select(*dataset.columns) + + untransformed_columns = set(dataset.columns) - transformed_features + transformed_dataset = dataset.select( + *untransformed_columns, + *[ + fun(*feature) + for fun, feature in zip(transformations, transformation_features) + ], + ).select(*untransformed_columns, *explode_name) + + return transformed_dataset def _setup_gcp_hadoop_conf(self, storage_connector, path): PROPERTY_ENCRYPTION_KEY = "fs.gs.encryption.key" diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index c8a18dc6c0..24033bf11b 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -18,7 +18,7 @@ import datetime import warnings -from typing import Any, Dict, List, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union import great_expectations as ge import humps @@ -53,6 +53,10 @@ from hsfs.transformation_function import TransformationFunction +if TYPE_CHECKING: + from hsfs.hopsworks_udf import HopsworksUdf + + @typechecked class FeatureStore: DEFAULT_VERSION = 1 @@ -1464,7 +1468,9 @@ def create_feature_view( labels: Optional[List[str]] = None, inference_helper_columns: Optional[List[str]] = None, training_helper_columns: Optional[List[str]] = None, - transformation_functions: Optional[Dict[str, TransformationFunction]] = None, + transformation_functions: Optional[ + List[Union[TransformationFunction, HopsworksUdf]] + ] = None, ) -> feature_view.FeatureView: """Create a feature view metadata object and saved it to hopsworks. 
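For context before the feature_view.py changes: from user code, the new list-based argument is intended to be called roughly as below. `fs` (a connected feature-store handle) and `query` are assumed to exist already, and the column name is invented for illustration:

    from hsfs.hopsworks_udf import hopsworks_udf

    @hopsworks_udf(return_type=float)
    def plus_one(amount):
        return amount + 1

    fv = fs.create_feature_view(
        name="demo_view",
        version=1,
        query=query,
        # a bare HopsworksUdf is accepted too; FeatureView.__init__ below
        # wraps it into a TransformationFunction with version=1
        transformation_functions=[plus_one("amount")],
    )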
diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 4f6a9dbb8e..82e45e4b2c 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -15,7 +15,6 @@ # from __future__ import annotations -import copy import json import logging import warnings @@ -36,7 +35,6 @@ util, ) from hsfs import serving_key as skm -from hsfs import transformation_function as tfm from hsfs.client.exceptions import FeatureStoreException from hsfs.constructor import filter, query from hsfs.constructor.filter import Filter, Logic @@ -59,6 +57,11 @@ from hsfs.statistics import Statistics from hsfs.statistics_config import StatisticsConfig from hsfs.training_dataset_split import TrainingDatasetSplit +from hsfs.transformation_function import TransformationFunction + + +if TYPE_CHECKING: + from hsfs.hopsworks_udf import HopsworksUdf _logger = logging.getLogger(__name__) @@ -98,7 +101,7 @@ def __init__( inference_helper_columns: Optional[List[str]] = None, training_helper_columns: Optional[List[str]] = None, transformation_functions: Optional[ - Dict[str, tfm.TransformationFunction] + List[Union[TransformationFunction, HopsworksUdf]] ] = None, featurestore_name: Optional[str] = None, serving_keys: Optional[List[skm.ServingKey]] = None, @@ -119,14 +122,21 @@ def __init__( self._training_helper_columns = ( training_helper_columns if training_helper_columns else [] ) - self._transformation_functions = ( - { - ft_name: copy.deepcopy(transformation_functions[ft_name]) - for ft_name in transformation_functions - } - if transformation_functions - else {} + + # TODO : Clean this up + if transformation_functions: + for i, transformation_function in enumerate(transformation_functions): + if not isinstance(transformation_function, TransformationFunction): + transformation_functions[i] = TransformationFunction( + self.featurestore_id, + hopsworks_udf=transformation_function, + version=1, + ) + + self._transformation_functions: List[TransformationFunction] = ( + transformation_functions ) + self._features = [] self._feature_view_engine: feature_view_engine.FeatureViewEngine = ( feature_view_engine.FeatureViewEngine(featurestore_id) @@ -3578,14 +3588,14 @@ def query(self, query_obj: "query.Query") -> None: @property def transformation_functions( self, - ) -> Dict[str, tfm.TransformationFunction]: + ) -> List[TransformationFunction]: """Get transformation functions.""" return self._transformation_functions @transformation_functions.setter def transformation_functions( self, - transformation_functions: Dict[str, tfm.TransformationFunction], + transformation_functions: List[TransformationFunction], ) -> None: self._transformation_functions = transformation_functions diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 5be41a2ed5..b2a8bae274 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -213,7 +213,7 @@ def get_spark_type(python_type: type): def create_pandas_udf_return_schema_from_list(self, return_types: List[type]): return ", ".join( [ - f'`{self.function_name}({",".join(self.transformation_features)})_{i}` {HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types[i]]}' + f'`{self.function_name}<{"-".join(self.transformation_features)}>{i}` {HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types[i]]}' for i in range(len(return_types)) ] ) @@ -226,8 +226,7 @@ def hopsworksUdf_wrapper(self, **statistics): import pandas as pd {self.function_source} df = {self.function_name}(*args) - #raise 
Exception({{f'{{df.columns[i]}}':f'{self.function_name}{",".join(self.transformation_features)}_{{i}}' for i in range(len(df.columns))}}) - df = df.rename(columns = {{f'{{df.columns[i]}}':f'{self.function_name}({",".join(self.transformation_features)})_{{i}}' for i in range(len(df.columns))}}) + df = df.rename(columns = {{f'{{df.columns[i]}}':f'{self.function_name}<{"-".join(self.transformation_features)}>{{i}}' for i in range(len(df.columns))}}) return df""" else: self.code = self.function_source @@ -240,6 +239,7 @@ def hopsworksUdf_wrapper(self, **statistics): return eval(self.function_name, scope) def __call__(self, *args: List[str]): + # TODO : Raise an execption if the number of features are incorrect. for arg in args: if not isinstance(arg, str): raise FeatureStoreException( diff --git a/python/hsfs/training_dataset_feature.py b/python/hsfs/training_dataset_feature.py index 6c3a04ea3d..c444e833c7 100644 --- a/python/hsfs/training_dataset_feature.py +++ b/python/hsfs/training_dataset_feature.py @@ -18,10 +18,8 @@ import humps from hsfs import feature as feature_mod from hsfs import feature_group as feature_group_mod -from hsfs import transformation_function as tf_mod from hsfs import util - class TrainingDatasetFeature: def __init__( self, @@ -33,7 +31,6 @@ def __init__( label=False, inference_helper_column=False, training_helper_column=False, - transformation_function=None, **kwargs, ): self._name = util.autofix_feature_name(name) @@ -48,11 +45,6 @@ def __init__( self._label = label self._inference_helper_column = inference_helper_column self._training_helper_column = training_helper_column - self._transformation_function = ( - tf_mod.TransformationFunction.from_response_json(transformation_function) - if isinstance(transformation_function, dict) - else transformation_function - ) def to_dict(self): return { @@ -62,7 +54,6 @@ def to_dict(self): "label": self._label, "inferenceHelperColumn": self._inference_helper_column, "trainingHelperColumn": self._training_helper_column, - "transformationFunction": self._transformation_function, "featureGroupFeatureName": self._feature_group_feature_name, "featuregroup": self._feature_group, } @@ -127,15 +118,6 @@ def training_helper_column(self): def training_helper_column(self, training_helper_column): self._training_helper_column = training_helper_column - @property - def transformation_function(self): - """Set transformation functions.""" - return self._transformation_function - - @transformation_function.setter - def transformation_function(self, transformation_function): - self._transformation_function = transformation_function - @property def feature_group(self): return self._feature_group @@ -145,4 +127,4 @@ def feature_group_feature_name(self): return self._feature_group_feature_name def __repr__(self): - return f"Training Dataset Feature({self._name!r}, {self._type!r}, {self._index!r}, {self._label}, {self._transformation_function}, {self._feature_group_feature_name}, {self._feature_group.id!r})" + return f"Training Dataset Feature({self._name!r}, {self._type!r}, {self._index!r}, {self._label}, {self._feature_group_feature_name}, {self._feature_group.id!r})" diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index ffd88fd502..fee9f1f41e 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -14,26 +14,28 @@ # from __future__ import annotations -import ast -import inspect import json +from typing import TYPE_CHECKING, Optional import humps from hsfs 
import util from hsfs.core import transformation_function_engine +from hsfs.decorators import typechecked +if TYPE_CHECKING: + from hsfs.hopsworks_udf import HopsworksUdf + + +@typechecked class TransformationFunction: def __init__( self, - featurestore_id, - transformation_fn=None, - version=None, - name=None, - source_code_content=None, - builtin_source_code=None, - output_type=None, - id=None, + featurestore_id: int, + hopsworks_udf: HopsworksUdf, + version: Optional[int] = None, + id: Optional[int] = None, + # TODO : Check if the below are actually needed type=None, items=None, count=None, @@ -43,45 +45,16 @@ def __init__( self._id = id self._featurestore_id = featurestore_id self._version = version - self._name = name - self._transformation_fn = transformation_fn - self._source_code_content = source_code_content self._transformation_function_engine = ( transformation_function_engine.TransformationFunctionEngine( self._featurestore_id ) ) - - # set up depending on user initialized - if self._transformation_fn is not None: - # type -> user init coming from user - self._transformer_code = None - self._extract_source_code() - self._output_type = self._transformation_function_engine.infer_spark_type( - output_type - ) - elif builtin_source_code is not None: - # user triggered to register built-in transformation function - self._output_type = self._transformation_function_engine.infer_spark_type( - output_type - ) - self._source_code_content = json.dumps( - { - "module_imports": "", - "transformer_code": builtin_source_code, - } - ) - else: - # load backend response - # load original source code - self._output_type = self._transformation_function_engine.infer_spark_type( - output_type - ) - self._load_source_code(self._source_code_content) - - self._feature_group_feature_name = None - self._feature_group_id = None + self._hopsworks_udf = hopsworks_udf + self._name = hopsworks_udf.function_name + self._feature_group_feature_name: Optional[str] = None + self._feature_group_id: Optional[int] = None def save(self): """Persist transformation function in backend. 
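With the rewritten constructor, a transformation function is now a thin, versioned wrapper around a HopsworksUdf, so persisting one reduces to something like the sketch below (`fs` is an assumed feature-store handle and `plus_one` the UDF from the earlier example):

    from hsfs.transformation_function import TransformationFunction

    tf = TransformationFunction(
        featurestore_id=fs.id,  # assumes the handle exposes its numeric id
        hopsworks_udf=plus_one,
        version=1,
    )
    tf.save()  # still delegates to TransformationFunctionEngine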
@@ -132,77 +105,6 @@ def plus_one(value): """ self._transformation_function_engine.delete(self) - def _extract_source_code(self): - if not callable(self._transformation_fn): - raise ValueError("transformer must be callable") - - self._name = self._transformation_fn.__name__ - - transformer_code = inspect.getsource(self._transformation_fn) - - module_imports = self._get_module_imports( - self._get_module_path(self._transformation_fn.__module__) - ) - - self._transformer_code = "\n".join(module_imports) + "\n" + transformer_code - - # initialise source code dict - # add all imports from module - # add original source code that will be used during offline transformations - self._source_code_content = json.dumps( - { - "module_imports": "\n".join(module_imports), - "transformer_code": transformer_code, - } - ) - - @staticmethod - def _get_module_path(module_name): - def _get_module_path(module): - return module.__file__ - - module_path = {} - exec( - """import %s\nmodule_path["path"] = _get_module_path(%s)""" - % (module_name, module_name) - ) - return module_path["path"] - - @staticmethod - def _get_module_imports(path): - imports = [] - with open(path) as fh: - root = ast.parse(fh.read(), path) - - for node in ast.iter_child_nodes(root): - if isinstance(node, ast.Import): - imported_module = False - elif isinstance(node, ast.ImportFrom): - imported_module = node.module - else: - continue - - for n in node.names: - if imported_module: - import_line = "from " + imported_module + " import " + n.name - elif n.asname: - import_line = "import " + n.name + " as " + n.asname - else: - import_line = "import " + n.name - imports.append(import_line) - return imports - - def _load_source_code(self, source_code_content): - source_code_content = json.loads(source_code_content) - module_imports = source_code_content["module_imports"] - transformer_code = source_code_content["transformer_code"] - self._transformer_code = module_imports + "\n" * 2 + transformer_code - - scope = __import__("__main__").__dict__ - exec(self._transformer_code, scope) - self._transformation_fn = eval(self._name, scope) - self._transformation_fn._code = self._transformer_code - @classmethod def from_response_json(cls, json_dict): json_decamelized = humps.decamelize(json_dict) @@ -232,58 +134,30 @@ def to_dict(self): } @property - def id(self): - """Training dataset id.""" + def id(self) -> id: + """Transformation function id.""" return self._id @id.setter - def id(self, id): + def id(self, id: int): self._id = id @property - def name(self): + def name(self) -> str: return self._name @property - def version(self): + def version(self) -> int: return self._version @property - def transformer_code(self): - return self._transformer_code - - @property - def transformation_fn(self): - return self._transformation_fn - - @property - def source_code_content(self): - return self._source_code_content - - @property - def output_type(self): - return self._output_type + def hopsworks_udf(self) -> HopsworksUdf: + return self._hopsworks_udf @name.setter - def name(self, name): + def name(self, name: str): self._name = name @version.setter - def version(self, version): + def version(self, version: int): self._version = version - - @transformer_code.setter - def transformer_code(self, transformer_code): - self._transformer_code = transformer_code - - @transformation_fn.setter - def transformation_fn(self, transformation_fn): - self._transformation_fn = transformation_fn - - @source_code_content.setter - def source_code_content(self, 
source_code_content): - self._source_code_content = source_code_content - - @output_type.setter - def output_type(self, output_type): - self._output_type = output_type From 75441a3d8d6eac03b4c001d2ea2bdc7382ec7902 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Wed, 17 Apr 2024 15:43:10 +0200 Subject: [PATCH 03/58] removing debugging logs --- python/hsfs/engine/spark.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 0b69abecdd..3e7e40a54a 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -589,7 +589,6 @@ def write_training_dataset( feature_view_obj=None, to_df=False, ): - print("[SPARK] write_training_dataset") write_options = self.write_options( training_dataset.data_format, user_write_options ) @@ -814,7 +813,6 @@ def _write_training_dataset_single( path, to_df=False, ): - print("[SPARK] _write_training_dataset_single") # apply transformation functions (they are applied separately to each split) feature_dataframe = self._apply_transformation_function( transformation_functions, dataset=feature_dataframe @@ -1171,7 +1169,6 @@ def _apply_transformation_function( self, transformation_functions: List[TransformationFunction], dataset ): # generate transformation function expressions - print("[SPARK] _apply_transformation_function") transformed_features = set() transformations = [] transformation_features = [] From 7af03f2cbe3f876713312bdd7e109102214ecc68 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 18 Apr 2024 15:12:42 +0200 Subject: [PATCH 04/58] statistics working with python client --- python/hsfs/core/feature_view_engine.py | 2 +- .../core/transformation_function_engine.py | 110 +++------ python/hsfs/engine/python.py | 6 +- python/hsfs/engine/spark.py | 6 +- python/hsfs/feature_view.py | 8 + python/hsfs/hopsworks_udf.py | 211 +++++++++++++++--- python/hsfs/transformation_function.py | 16 +- 7 files changed, 229 insertions(+), 130 deletions(-) diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index 770a772af6..e954701d8e 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -121,7 +121,7 @@ def save(self, feature_view_obj): ) # TODO : Remove this code portion attaches a transfromation function to a feature. This is not possible with the current implementation - # self._transformation_function_engine.attach_transformation_fn(feature_view_obj) + updated_fv = self._feature_view_api.post(feature_view_obj) print( "Feature view created successfully, explore it at \n" diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index beeceb2bd8..492567e6ec 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -109,38 +109,6 @@ def get_td_transformation_fn(self, training_dataset): ) return transformation_fn_dict - @staticmethod - def attach_transformation_fn(training_dataset_obj=None, feature_view_obj=None): - # TODO : Remove transformation function attached to training dataset object and features - if training_dataset_obj: - target_obj = training_dataset_obj # todo why provide td and fv just to convert to target_obj? 
- else: - target_obj = feature_view_obj - - if target_obj._transformation_functions: - for ( - feature_name, - transformation_fn, - ) in target_obj._transformation_functions.items(): - if feature_name in target_obj.labels: - raise ValueError( - "Online transformations for training dataset labels are not supported." - ) - - feature, prefix, featuregroup = target_obj.query._get_feature_by_name( - feature_name - ) - target_obj._features.append( - training_dataset_feature.TrainingDatasetFeature( - name=feature_name, - feature_group_feature_name=feature.name, - featuregroup=featuregroup, - type=transformation_fn.output_type, - label=False, - transformation_function=transformation_fn, - ) - ) - def is_builtin(self, transformation_fn_instance): return ( transformation_fn_instance.name in self.BUILTIN_FN_NAMES @@ -249,6 +217,7 @@ def infer_spark_type(output_type): else: raise TypeError("Not supported type %s." % output_type) + # TODO : Think about what to do with label encoder features. @staticmethod def compute_transformation_fn_statistics( training_dataset_obj, @@ -266,58 +235,35 @@ def compute_transformation_fn_statistics( ) @staticmethod - def populate_builtin_transformation_functions( - training_dataset, feature_view_obj, dataset - ): - return - # TODO : Remove - # check if there any transformation functions that require statistics attached to td features - builtin_tffn_label_encoder_features = [ - ft_name - for ft_name in training_dataset.transformation_functions - if training_dataset._transformation_function_engine.is_builtin( - training_dataset.transformation_functions[ft_name] - ) - and training_dataset.transformation_functions[ft_name].name - == "label_encoder" - ] - builtin_tffn_features = [ - ft_name - for ft_name in training_dataset.transformation_functions - if training_dataset._transformation_function_engine.is_builtin( - training_dataset.transformation_functions[ft_name] + def add_feature_statistics(training_dataset, feature_view_obj, dataset): + # TODO : Optimize this code portion check which i better computing all transformation feature statistics together or one by one. 
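The loop that follows collects the union of features whose UDFs request statistics. A UDF opts in through the naming convention implemented in hopsworks_udf.py later in this patch: a parameter named `statistics_<arg>` requests descriptive statistics for `<arg>` and is stripped from the generated pandas UDF signature. A sketch — the `min`/`max` attributes on the injected statistics object are assumed here:

    @hopsworks_udf(return_type=float)
    def min_max_scaler(value, statistics_value):
        # statistics_value is injected from train-split statistics at runtime
        return (value - statistics_value.min) / (
            statistics_value.max - statistics_value.min
        )

    min_max_scaler("amount").statistics_features  # -> ["amount"]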
+ statistics_features = set() + for transformation_function in feature_view_obj.transformation_functions: + statistics_features.update( + transformation_function.hopsworks_udf.statistics_features ) - and training_dataset.transformation_functions[ft_name].name - != "label_encoder" - ] - if builtin_tffn_features or builtin_tffn_label_encoder_features: - if training_dataset.splits: - # compute statistics before transformations are applied - stats = ( - TransformationFunctionEngine.compute_transformation_fn_statistics( - training_dataset, - builtin_tffn_features, - builtin_tffn_label_encoder_features, - dataset.get(training_dataset.train_split), - feature_view_obj, - ) - ) - else: - # compute statistics before transformations are applied - stats = ( - TransformationFunctionEngine.compute_transformation_fn_statistics( - training_dataset, - builtin_tffn_features, - builtin_tffn_label_encoder_features, - dataset, - feature_view_obj, - ) - ) - # Populate builtin transformations (if any) with respective arguments - return training_dataset._transformation_function_engine.populate_builtin_attached_fns( - training_dataset.transformation_functions, - stats.feature_descriptive_statistics, + if training_dataset.splits: + # compute statistics before transformations are applied + stats = TransformationFunctionEngine.compute_transformation_fn_statistics( + training_dataset, + list(statistics_features), + [], + dataset.get(training_dataset.train_split), + feature_view_obj, + ) + else: + # compute statistics before transformations are applied + stats = TransformationFunctionEngine.compute_transformation_fn_statistics( + training_dataset, + list(statistics_features), + [], + dataset, + feature_view_obj, + ) + for transformation_function in feature_view_obj.transformation_functions: + transformation_function.hopsworks_udf.transformation_statistics = ( + stats.feature_descriptive_statistics ) def get_ready_to_use_transformation_fns( diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 9e256c322d..9754b96997 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -897,7 +897,7 @@ def get_training_data( read_options=read_options, dataframe_type=dataframe_type ) # TODO : Add statistics - transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions( + transformation_function_engine.TransformationFunctionEngine.add_feature_statistics( training_dataset_obj, feature_view_obj, df ) return self._apply_transformation_function( @@ -972,7 +972,7 @@ def _prepare_transform_split_df( # apply transformations # 1st parametrise transformation functions with dt split stats - transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions( + transformation_function_engine.TransformationFunctionEngine.add_feature_statistics( training_dataset_obj, feature_view_obj, result_dfs ) # and the apply them @@ -1260,7 +1260,7 @@ def _apply_transformation_function( dataset = pd.concat( [ dataset, - transformation_function.hopsworks_udf.get_udf(statistics=None)( + transformation_function.hopsworks_udf.get_udf()( *( [ dataset[feature] diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 3e7e40a54a..74c1a833e9 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -603,7 +603,7 @@ def write_training_dataset( else: raise ValueError("Dataset should be a query.") - transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions( + 
transformation_function_engine.TransformationFunctionEngine.add_feature_statistics( training_dataset, feature_view_obj, dataset ) if training_dataset.coalesce: @@ -629,7 +629,7 @@ def write_training_dataset( split_dataset[key] = split_dataset[key].cache() - transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions( + transformation_function_engine.TransformationFunctionEngine.add_feature_statistics( training_dataset, feature_view_obj, split_dataset ) return self._write_training_dataset_splits( @@ -1190,7 +1190,7 @@ def _apply_transformation_function( ) # TODO : Add statistics - pandas_udf = hopsworks_udf.get_udf(None) + pandas_udf = hopsworks_udf.get_udf() transformations.append(pandas_udf) transformation_features.append(hopsworks_udf.transformation_features) diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 82e45e4b2c..78e2101ed0 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -3412,6 +3412,12 @@ def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView": description=json_decamelized.get("description", None), featurestore_name=json_decamelized.get("featurestore_name", None), serving_keys=serving_keys, + transformation_functions=[ + TransformationFunction.from_response_json(transformation) + for transformation in json_decamelized.get( + "transformation_functions", [] + ) + ], ) features = json_decamelized.get("features", []) if features: @@ -3444,6 +3450,7 @@ def update_from_response_json(self, json_dict: Dict[str, Any]) -> "FeatureView": "labels", "inference_helper_columns", "training_helper_columns", + "transformation_functions", "schema", "serving_keys", ]: @@ -3483,6 +3490,7 @@ def to_dict(self) -> Dict[str, Any]: "description": self._description, "query": self._query, "features": self._features, + "transformation_functions": self._transformation_functions, "type": "featureViewDTO", } diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index b2a8bae274..5abaccedbe 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -16,11 +16,15 @@ import ast import inspect +import json import warnings -from typing import Callable, List, Union +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Union -from hsfs import engine +import humps +from hsfs import engine, util from hsfs.client.exceptions import FeatureStoreException +from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics def hopsworks_udf(return_type: Union[List[type], type]): @@ -31,11 +35,24 @@ def wrapper(func: Callable): return wrapper +@dataclass +class TransformationFeature: + feature_name: str + statistic_argument_name: Optional[str] + + def to_dict(self): + return { + "feature_name": self.feature_name, + "statistic_argument_name": self.statistic_argument_name, + } + + class HopsworksUdf: """ Metadata class to store information about UDF """ + # TODO : Complete this PYTHON_SPARK_TYPE_MAPPING = { str: "string", int: "int", @@ -46,21 +63,40 @@ class HopsworksUdf: # "binary": BinaryType(), } + STRING_PYTHON_TYPES_MAPPING = {"str": str, "int": int, "float": float, "bool": bool} + def __init__( - self, func: Callable, return_type: Union[List[type], type], name: str = None + self, + func: Union[Callable, str], + return_type: Union[List[type], type], + name: str = None, + transformation_features: List[TransformationFeature] = None, ): - self.udf_function: Callable = func if name is None: - self.function_name: str = 
func.__name__ + self._function_name: str = func.__name__ else: - self.function_name: str = name - self.return_type: Union[List[type], type] = return_type - self.function_source: str = self._remove_argument( - HopsworksUdf._extract_source_code(self.udf_function), "statistics" - ) - # TODO : Add a getter functions - self.transformation_features: List[str] = ( - HopsworksUdf._extract_function_arguments(self.function_source) + self._function_name: str = name + + self._statistics: Optional[Dict[str, FeatureDescriptiveStatistics]] = dict() + + self._return_type: Union[List[type], type] = return_type + + if isinstance(func, Callable): + self._function_source: str = HopsworksUdf._extract_source_code(func) + else: + self._function_source: str = func + + if transformation_features: + self._transformation_features: List[TransformationFeature] = ( + transformation_features + ) + else: + self._transformation_features: List[TransformationFeature] = ( + HopsworksUdf._extract_function_arguments(self.function_source) + ) + + self._function_source = self._remove_argument( + self.function_source, "statistics" ) HopsworksUdf.validate_arguments(self.return_type) @@ -118,10 +154,6 @@ def _get_module_path(module): @staticmethod def _extract_source_code(udf_function): - if not callable(udf_function): - # TODO : Think about a better text for the raised error - raise ValueError("transformation function must be callable") - try: module_imports = HopsworksUdf._get_module_imports( HopsworksUdf._get_module_path(udf_function.__module__) @@ -153,8 +185,16 @@ def _extract_function_arguments(source_code): # Parse the function signature to remove the specified argument signature = source_code[signature_line] arg_list = signature.split("(")[1].split(")")[0].split(",") - arg_list = [arg.strip() for arg in arg_list] - return arg_list + + arg_list = [arg.split(":")[0].strip() for arg in arg_list] + + return [ + TransformationFeature( + arg, f"statistics_{arg}" if f"statistics_{arg}" in arg_list else None + ) + for arg in arg_list + if not arg.startswith("statistics") + ] def _remove_argument(self, source_code: str, arg_to_remove: str): """ " @@ -181,6 +221,7 @@ def _remove_argument(self, source_code: str, arg_to_remove: str): if ( arg_to_remove not in list(map(str.strip, arg.split(" "))) and arg_to_remove not in list(map(str.strip, arg.split(":"))) + and arg_to_remove not in list(map(str.strip, arg.split("_"))) and arg.strip() != arg_to_remove ) ] @@ -218,47 +259,149 @@ def create_pandas_udf_return_schema_from_list(self, return_types: List[type]): ] ) - def hopsworksUdf_wrapper(self, **statistics): + def hopsworksUdf_wrapper(self): # TODO : clean this up + function_source = "\t".join(self.function_source.splitlines(True)) if isinstance(self.return_type, List): - self.function_source = "\t".join(self.function_source.splitlines(True)) - self.code = f"""def renaming_wrapper(*args): + code = f"""def renaming_wrapper(*args): import pandas as pd - {self.function_source} + {function_source} df = {self.function_name}(*args) df = df.rename(columns = {{f'{{df.columns[i]}}':f'{self.function_name}<{"-".join(self.transformation_features)}>{{i}}' for i in range(len(df.columns))}}) return df""" else: - self.code = self.function_source + code = f"""def renaming_wrapper(*args): + import pandas as pd + {function_source} + df = {self.function_name}(*args) + df = df.rename(f'{self.function_name}<{"-".join(self.transformation_features)}>') + return df""" scope = __import__("__main__").__dict__ - scope.update(**statistics) - exec(self.code, scope) - 
if isinstance(self.transformation_features, List): - return eval("renaming_wrapper", scope) - else: - return eval(self.function_name, scope) + scope.update(self.transformation_statistics) + exec(code, scope) + return eval("renaming_wrapper", scope) def __call__(self, *args: List[str]): # TODO : Raise an execption if the number of features are incorrect. + if len(args) != len(self.transformation_features): + raise FeatureStoreException( + "Number of features provided does not match the number of features provided in the UDF definition" + ) for arg in args: if not isinstance(arg, str): raise FeatureStoreException( f'Feature names provided must be string "{arg}" is not string' ) - self.transformation_features = list(args) + self._transformation_features = [ + TransformationFeature( + new_feature_name, transformation_feature.statistic_argument_name + ) + for transformation_feature, new_feature_name in zip( + self._transformation_features, args + ) + ] return self - def get_udf(self, statistics): + def get_udf(self): if engine.get_type() in ["hive", "python", "training"]: - return self.hopsworksUdf_wrapper(statistics=statistics) + return self.hopsworksUdf_wrapper() else: from pyspark.sql.functions import pandas_udf # TODO : Make this proper return pandas_udf( - f=self.hopsworksUdf_wrapper(statistics=statistics), + f=self.hopsworksUdf_wrapper(), returnType=self.create_pandas_udf_return_schema_from_list( self.return_type ), ) + + def to_dict(self): + return { + "func": self.function_source, + "name": self.function_name, + "return_type": [python_type.__name__ for python_type in self.return_type] + if isinstance(self.return_type, List) + else self.return_type.__name__, + "transformation_features": self.transformation_features, + } + + def json(self) -> str: + return json.dumps(self, cls=util.FeatureStoreEncoder) + + @classmethod + def from_response_json( + cls: "HopsworksUdf", json_dict: Dict[str, Any] + ) -> "HopsworksUdf": + json_decamelized = humps.decamelize(json_dict) + function_source_code = json_decamelized["func"] + function_name = json_decamelized["name"] + return_type = json_decamelized["return_type"] + transformation_features = json_decamelized["transformation_features"] + + hopsworks_udf = cls( + func=function_source_code, + return_type=[ + cls.STRING_PYTHON_TYPES_MAPPING[python_type] + for python_type in return_type + ] + if isinstance(return_type, List) + else cls.STRING_PYTHON_TYPES_MAPPING[return_type], + name=function_name, + transformation_features=transformation_features, + ) + + return hopsworks_udf + + @property + def return_type(self): + return self._return_type + + @property + def function_name(self): + return self._function_name + + @property + def function_source(self): + return self._function_source + + @property + def statistics_required(self): + return bool(self.statistics_features) + + @property + def transformation_statistics(self): + return self._statistics + + @property + def transformation_features(self): + return [ + transformation_feature.feature_name + for transformation_feature in self._transformation_features + ] + + @property + def statistics_features(self): + return [ + transformation_feature.feature_name + for transformation_feature in self._transformation_features + if transformation_feature.statistic_argument_name is not None + ] + + @property + def statistics_argument_mapping(self): + return { + transformation_feature.feature_name: transformation_feature.statistic_argument_name + for transformation_feature in self._transformation_features + } + + 
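The setter that follows resolves each computed statistic through `statistics_argument_mapping`, keyed by the bound feature name. Roughly — the constructor arguments shown for FeatureDescriptiveStatistics are assumed for illustration:

    from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics

    udf = min_max_scaler("amount")
    udf.statistics_argument_mapping  # {"amount": "statistics_value"}
    udf.transformation_statistics = [
        FeatureDescriptiveStatistics(feature_name="amount", min=0.0, max=100.0)
    ]
    udf.transformation_statistics["statistics_value"].max  # 100.0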
@transformation_statistics.setter + def transformation_statistics(self, statistics: List[FeatureDescriptiveStatistics]): + # TODO : Clean this up + self._statistics = dict() + for stat in statistics: + if stat.feature_name in self.statistics_argument_mapping.keys(): + self._statistics[ + self.statistics_argument_mapping[stat.feature_name] + ] = stat diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index fee9f1f41e..a1549e50b3 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -15,16 +15,13 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING, Optional +from typing import Optional import humps from hsfs import util from hsfs.core import transformation_function_engine from hsfs.decorators import typechecked - - -if TYPE_CHECKING: - from hsfs.hopsworks_udf import HopsworksUdf +from hsfs.hopsworks_udf import HopsworksUdf @typechecked @@ -108,6 +105,12 @@ def plus_one(value): @classmethod def from_response_json(cls, json_dict): json_decamelized = humps.decamelize(json_dict) + + if json_decamelized.get("hopsworks_udf", False): + json_decamelized["hopsworks_udf"] = HopsworksUdf.from_response_json( + json_decamelized["hopsworks_udf"] + ) + if "count" in json_decamelized: if json_decamelized["count"] == 0: return [] @@ -128,9 +131,8 @@ def to_dict(self): "id": self._id, "name": self._name, "version": self._version, - "sourceCodeContent": self._source_code_content, - "outputType": self._output_type, "featurestoreId": self._featurestore_id, + "hopsworks_udf": self._hopsworks_udf, } @property From 3ac5b26302ca7dbb42d50972ad1c058d6f02bc59 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 25 Apr 2024 10:53:02 +0200 Subject: [PATCH 05/58] basic functionality working with backend --- python/hsfs/engine/spark.py | 15 ++++-- python/hsfs/feature_view.py | 2 +- python/hsfs/hopsworks_udf.py | 71 ++++++++++++++------------ python/hsfs/transformation_function.py | 2 +- 4 files changed, 49 insertions(+), 41 deletions(-) diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 74c1a833e9..38867ea81e 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -1172,6 +1172,7 @@ def _apply_transformation_function( transformed_features = set() transformations = [] transformation_features = [] + output_col_names = [] explode_name = [] for transformation_function in transformation_functions: hopsworks_udf = transformation_function.hopsworks_udf @@ -1191,13 +1192,15 @@ def _apply_transformation_function( # TODO : Add statistics pandas_udf = hopsworks_udf.get_udf() + output_col_name = f'{hopsworks_udf.function_name}<{"-".join(hopsworks_udf.transformation_features)}>' transformations.append(pandas_udf) transformation_features.append(hopsworks_udf.transformation_features) + output_col_names.append(output_col_name) if isinstance(hopsworks_udf.return_type, List): - explode_name.append( - f'{pandas_udf.__name__}({", ".join(hopsworks_udf.transformation_features)}).*' - ) + explode_name.append(f"{output_col_name}.*") + else: + explode_name.append(output_col_name) def timezone_decorator(func, trans_fn=hopsworks_udf): if trans_fn.output_type != "TIMESTAMP": @@ -1236,8 +1239,10 @@ def decorated_func(x): transformed_dataset = dataset.select( *untransformed_columns, *[ - fun(*feature) - for fun, feature in zip(transformations, transformation_features) + fun(*feature).alias(output_col_name) + for fun, feature, output_col_name in zip( + transformations, 
transformation_features, output_col_names + ) ], ).select(*untransformed_columns, *explode_name) diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 78e2101ed0..386e3b256f 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -3490,7 +3490,7 @@ def to_dict(self) -> Dict[str, Any]: "description": self._description, "query": self._query, "features": self._features, - "transformation_functions": self._transformation_functions, + "transformationFunctions": self._transformation_functions, "type": "featureViewDTO", } diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 5abaccedbe..d23e1f45d2 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -15,9 +15,9 @@ # import ast +import copy import inspect import json -import warnings from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Union @@ -56,7 +56,7 @@ class HopsworksUdf: PYTHON_SPARK_TYPE_MAPPING = { str: "string", int: "int", - float: "float", + float: "double", # "timestamp": TimestampType(), bool: "boolean", # "date": DateType(), @@ -161,10 +161,10 @@ def _extract_source_code(udf_function): except Exception: module_imports = "" # TODO : Check if warning is actually required. - warnings.warn( - "Passed UDF defined in a Jupyter notebook. Cannot extract dependices from a notebook. Please make sure to import all dependcies for the UDF inside the code.", - stacklevel=2, - ) + # warnings.warn( + # "Passed UDF defined in a Jupyter notebook. Cannot extract dependices from a notebook. Please make sure to import all dependcies for the UDF inside the code.", + # stacklevel=2, + # ) function_code = inspect.getsource(udf_function) source_code = "\n".join(module_imports) + "\n" + function_code @@ -206,14 +206,18 @@ def _remove_argument(self, source_code: str, arg_to_remove: str): # Get source code of the original function source_code = source_code.split("\n") + signature_start_line = None + signature_end_line = None # Find the line where the function signature is defined for i, line in enumerate(source_code): if line.strip().startswith("def "): - signature_line = i + signature_start_line = i + if signature_start_line is not None and ")" in line: + signature_end_line = i break # Parse the function signature to remove the specified argument - signature = source_code[signature_line] + signature = "".join(source_code[signature_start_line : signature_end_line + 1]) arg_list = signature.split("(")[1].split(")")[0].split(",") arg_list = [ arg.split(":")[0].strip() @@ -234,15 +238,10 @@ def _remove_argument(self, source_code: str, arg_to_remove: str): + ")" + signature.split(")")[1] ) - - # Modify the source code to reflect the changes - source_code[signature_line] = new_signature - - # Removing test before function signatre since they are decorators - source_code = source_code[signature_line:] - # Reconstruct the modified function as a string - modified_source = "\n".join(source_code) + modified_source = ( + new_signature + "\n" + "\n".join(source_code[signature_end_line + 1 :]) + ) # Define a new function with the modified source code return modified_source @@ -252,12 +251,15 @@ def get_spark_type(python_type: type): return HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[python_type] def create_pandas_udf_return_schema_from_list(self, return_types: List[type]): - return ", ".join( - [ - f'`{self.function_name}<{"-".join(self.transformation_features)}>{i}` {HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types[i]]}' - for i in 
range(len(return_types)) - ] - ) + if isinstance(return_types, List): + return ", ".join( + [ + f'`{self.function_name}<{"-".join(self.transformation_features)}>{i}` {HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types[i]]}' + for i in range(len(return_types)) + ] + ) + else: + return f"{HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types]}" def hopsworksUdf_wrapper(self): # TODO : clean this up @@ -292,8 +294,10 @@ def __call__(self, *args: List[str]): raise FeatureStoreException( f'Feature names provided must be string "{arg}" is not string' ) - - self._transformation_features = [ + udf = copy.deepcopy( + self + ) # TODO : Clean this copy is needed so that if the uses the same function to multiple feature, if copy not done then all variable would share the same traanformation feature, + udf._transformation_features = [ TransformationFeature( new_feature_name, transformation_feature.statistic_argument_name ) @@ -301,7 +305,7 @@ def __call__(self, *args: List[str]): self._transformation_features, args ) ] - return self + return udf def get_udf(self): if engine.get_type() in ["hive", "python", "training"]: @@ -319,12 +323,12 @@ def get_udf(self): def to_dict(self): return { - "func": self.function_source, - "name": self.function_name, - "return_type": [python_type.__name__ for python_type in self.return_type] + "sourceCode": self.function_source, + "outputTypes": [python_type.__name__ for python_type in self.return_type] if isinstance(self.return_type, List) else self.return_type.__name__, - "transformation_features": self.transformation_features, + "transformationFeatures": self.transformation_features, + "name": self._function_name, } def json(self) -> str: @@ -335,10 +339,10 @@ def from_response_json( cls: "HopsworksUdf", json_dict: Dict[str, Any] ) -> "HopsworksUdf": json_decamelized = humps.decamelize(json_dict) - function_source_code = json_decamelized["func"] + function_source_code = json_decamelized["source_code"] function_name = json_decamelized["name"] - return_type = json_decamelized["return_type"] - transformation_features = json_decamelized["transformation_features"] + return_type = json_decamelized["output_types"] + transformation_features = json_decamelized["transformation_features"].split(",") hopsworks_udf = cls( func=function_source_code, @@ -349,10 +353,9 @@ def from_response_json( if isinstance(return_type, List) else cls.STRING_PYTHON_TYPES_MAPPING[return_type], name=function_name, - transformation_features=transformation_features, ) - return hopsworks_udf + return hopsworks_udf(*transformation_features) @property def return_type(self): diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index a1549e50b3..a731d604e1 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -132,7 +132,7 @@ def to_dict(self): "name": self._name, "version": self._version, "featurestoreId": self._featurestore_id, - "hopsworks_udf": self._hopsworks_udf, + "hopsworksUdf": self._hopsworks_udf, } @property From df5c9695e433f5b56409007f3726541f386a504e Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 25 Apr 2024 12:13:25 +0200 Subject: [PATCH 06/58] code with statistics working and saved to backend --- python/hsfs/hopsworks_udf.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index d23e1f45d2..9ec62200a9 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -86,6 +86,9 @@ def __init__( 
else: self._function_source: str = func + # TODO : Must clean this up : [Store actual source code]. The actual code without any clean up should be stored in the backed and the cleaned source must be dynamically build up when the data is read from the backend. + self._original_code = self._function_source + if transformation_features: self._transformation_features: List[TransformationFeature] = ( transformation_features @@ -264,7 +267,9 @@ def create_pandas_udf_return_schema_from_list(self, return_types: List[type]): def hopsworksUdf_wrapper(self): # TODO : clean this up function_source = "\t".join(self.function_source.splitlines(True)) - if isinstance(self.return_type, List): + if ( + isinstance(self.return_type, List) and len(self.return_type) > 1 + ): # TODO : This check must be cleaned up for sure code = f"""def renaming_wrapper(*args): import pandas as pd {function_source} @@ -323,8 +328,10 @@ def get_udf(self): def to_dict(self): return { - "sourceCode": self.function_source, - "outputTypes": [python_type.__name__ for python_type in self.return_type] + "sourceCode": self._original_code, + "outputTypes": ",".join( + [python_type.__name__ for python_type in self.return_type] + ) if isinstance(self.return_type, List) else self.return_type.__name__, "transformationFeatures": self.transformation_features, @@ -341,7 +348,7 @@ def from_response_json( json_decamelized = humps.decamelize(json_dict) function_source_code = json_decamelized["source_code"] function_name = json_decamelized["name"] - return_type = json_decamelized["output_types"] + return_type = json_decamelized["output_types"].split(",") transformation_features = json_decamelized["transformation_features"].split(",") hopsworks_udf = cls( @@ -354,7 +361,6 @@ def from_response_json( else cls.STRING_PYTHON_TYPES_MAPPING[return_type], name=function_name, ) - return hopsworks_udf(*transformation_features) @property From 2e9aa72a07af9ef25a5b7302088bf19f7bced7be Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 29 Apr 2024 16:43:27 +0200 Subject: [PATCH 07/58] working code for feature vector --- python/hsfs/core/feature_view_api.py | 17 ++-- .../core/transformation_function_engine.py | 82 +------------------ python/hsfs/core/vector_server.py | 37 +++++++-- python/hsfs/hopsworks_udf.py | 16 +++- python/hsfs/transformation_function.py | 38 +++++---- 5 files changed, 75 insertions(+), 115 deletions(-) diff --git a/python/hsfs/core/feature_view_api.py b/python/hsfs/core/feature_view_api.py index ed5a8468c3..6ff621c7db 100644 --- a/python/hsfs/core/feature_view_api.py +++ b/python/hsfs/core/feature_view_api.py @@ -17,12 +17,7 @@ from typing import List, Optional, Union -from hsfs import ( - client, - feature_view, - training_dataset, - transformation_function_attached, -) +from hsfs import client, feature_view, training_dataset, transformation_function from hsfs.client.exceptions import RestAPIError from hsfs.constructor import query, serving_prepared_statement from hsfs.core import explicit_provenance, job, training_dataset_job_conf @@ -102,7 +97,9 @@ def get_by_name_version(self, name: str, version: int) -> feature_view.FeatureVi try: return feature_view.FeatureView.from_response_json( self._client._send_request( - self._GET, path, {"expand": ["query", "features"]} + self._GET, + path, + {"expand": ["query", "features", "transformationfunctions"]}, ) ) except RestAPIError as e: @@ -183,11 +180,11 @@ def get_serving_prepared_statement( def get_attached_transformation_fn( self, name: str, version: int ) -> Union[ - 
"transformation_function_attached.TransformationFunctionAttached", - List["transformation_function_attached.TransformationFunctionAttached"], + "transformation_function.TransformationFunction", + List["transformation_function.TransformationFunction"], ]: path = self._base_path + [name, self._VERSION, version, self._TRANSFORMATION] - return transformation_function_attached.TransformationFunctionAttached.from_response_json( + return transformation_function.TransformationFunction.from_response_json( self._client._send_request("GET", path) ) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 492567e6ec..e99b79672a 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -21,6 +21,7 @@ import hsfs import numpy + from hsfs import ( feature_view, statistics, @@ -37,7 +38,6 @@ ) from hsfs.core.builtin_transformation_function import BuiltInTransformationFunction - class TransformationFunctionEngine: BUILTIN_FN_NAMES = [ "min_max_scaler", @@ -62,16 +62,7 @@ def __init__(self, feature_store_id: int): self._feature_view_api: Optional["feature_view_api.FeatureViewApi"] = None self._statistics_engine: Optional["statistics_engine.StatisticsEngine"] = None - def save(self, transformation_fn_instance): - if self.is_builtin(transformation_fn_instance): - raise ValueError( - "Transformation function name '{name:}' with version 1 is reserved for built-in hsfs " - "functions. Please use other name or version".format( - name=transformation_fn_instance.name - ) - ) - if not callable(transformation_fn_instance.transformation_fn): - raise ValueError("transformer must be callable") + def save(self, transformation_fn_instance: TransformationFunction): self._transformation_function_api.register_transformation_fn( transformation_fn_instance ) @@ -109,73 +100,6 @@ def get_td_transformation_fn(self, training_dataset): ) return transformation_fn_dict - def is_builtin(self, transformation_fn_instance): - return ( - transformation_fn_instance.name in self.BUILTIN_FN_NAMES - and transformation_fn_instance.version == 1 - ) - - @staticmethod - def populate_builtin_fn_arguments( - feature_name, transformation_function_instance, feature_descriptive_stats - ): - # TODO : Make this statistics - if transformation_function_instance.name == "min_max_scaler": - min_value, max_value = BuiltInTransformationFunction.min_max_scaler_stats( - feature_descriptive_stats, feature_name - ) - transformation_function_instance.transformation_fn = partial( - transformation_function_instance.transformation_fn, - min_value=min_value, - max_value=max_value, - ) - elif transformation_function_instance.name == "standard_scaler": - mean, std_dev = BuiltInTransformationFunction.standard_scaler_stats( - feature_descriptive_stats, feature_name - ) - transformation_function_instance.transformation_fn = partial( - transformation_function_instance.transformation_fn, - mean=mean, - std_dev=std_dev, - ) - elif transformation_function_instance.name == "robust_scaler": - robust_scaler_stats = BuiltInTransformationFunction.robust_scaler_stats( - feature_descriptive_stats, feature_name - ) - transformation_function_instance.transformation_fn = partial( - transformation_function_instance.transformation_fn, - p25=robust_scaler_stats[24], - p50=robust_scaler_stats[49], - p75=robust_scaler_stats[74], - ) - elif transformation_function_instance.name == "label_encoder": - value_to_index = BuiltInTransformationFunction.encoder_stats( - 
feature_descriptive_stats, feature_name
-            )
-            transformation_function_instance.transformation_fn = partial(
-                transformation_function_instance.transformation_fn,
-                value_to_index=value_to_index,
-            )
-        else:
-            raise ValueError("Not implemented")
-
-        return transformation_function_instance
-
-    def populate_builtin_attached_fns(
-        self, attached_transformation_fns, feature_descriptive_stats
-    ):
-        # TODO : Remove
-        for ft_name in attached_transformation_fns:
-            if self.is_builtin(attached_transformation_fns[ft_name]):
-                # check if its built-in transformation function and populated with statistics arguments
-                transformation_fn = self.populate_builtin_fn_arguments(
-                    ft_name,
-                    attached_transformation_fns[ft_name],
-                    feature_descriptive_stats,
-                )
-                attached_transformation_fns[ft_name] = transformation_fn
-        return attached_transformation_fns
-
     @staticmethod
     def infer_spark_type(output_type):
         # TODO : Move to hopsworks_udf
@@ -217,6 +141,8 @@ def infer_spark_type(output_type):
         else:
             raise TypeError("Not supported type %s." % output_type)
 
+    # TODO : Think about statistics computation and fetching.
+
     # TODO : Think about what to do with label encoder features.
     @staticmethod
     def compute_transformation_fn_statistics(
diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py
index ae35f326b8..2ed6d8688f 100755
--- a/python/hsfs/core/vector_server.py
+++ b/python/hsfs/core/vector_server.py
@@ -103,6 +103,7 @@ def __init__(
         self._inference_helper_col_name = [
             feat.name for feat in features if feat.inference_helper_column
         ]
+        self._transformed_feature_vector_col_name = None
         self._skip_fg_ids = skip_fg_ids or set()
         self._serving_keys = serving_keys or []
 
@@ -125,7 +126,7 @@ def __init__(
 
     def init_serving(
         self,
-        entity: Union[feature_view.FeatureView, training_dataset.TrainingDataset],
+        entity: Union[feature_view.FeatureView],
         external: Optional[bool] = None,
         inference_helper_columns: bool = False,
         options: Optional[Dict[str, Any]] = None,
@@ -573,6 +574,7 @@ def get_inference_helpers(
             batch_results, batch=True, inference_helper=True, return_type=return_type
         )
 
+
     def which_client_and_ensure_initialised(
         self, force_rest_client: bool, force_sql_client: bool
     ) -> str:
@@ -634,15 +636,23 @@ def _set_default_client(
             self.default_client = self.DEFAULT_SQL_CLIENT
             self._init_sql_client = True
 
-    def apply_transformation(self, row_dict: Dict[str, Any]):
-        matching_keys = set(self.transformation_functions.keys()).intersection(
-            row_dict.keys()
-        )
-        _logger.debug("Applying transformation functions to : %s", matching_keys)
-        for feature_name in matching_keys:
-            row_dict[feature_name] = self.transformation_functions[
-                feature_name
-            ].transformation_fn(row_dict[feature_name])
+    def apply_transformation(self, row_dict: dict):
+        _logger.debug("Applying transformation functions.")
+        for transformation_function in self.transformation_functions:
+            features = [
+                pd.Series(row_dict[feature])
+                for feature in transformation_function.hopsworks_udf.transformation_features
+            ]
+            transformed_result = transformation_function.hopsworks_udf.get_udf()(
+                *features
+            )
+            if isinstance(transformed_result, pd.Series):
+                row_dict[transformed_result.name] = transformed_result.values[0]
+            else:
+                for col in transformed_result:
+                    row_dict[col] = transformed_result[col].values[0]
         return row_dict
 
     def apply_return_value_handlers(
@@ -1064,3 +1074,12 @@ def default_client(self, default_client: Literal["rest", "sql"]):
 
         _logger.debug(f"Default Online Store Client is set to {default_client}.")
         self._default_client = default_client
+
+    def 
transformed_feature_vector_col_name(self): + if self._transformed_feature_vector_col_name is None: + for transformation_function in self._transformation_functions: + self._transformed_feature_vector_col_name = ( + self._feature_vector_col_name + + transformation_function.hopsworks_udf.transformation_feature_names + ) + return self._transformed_feature_vector_col_name \ No newline at end of file diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 9ec62200a9..d69af6c863 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -361,7 +361,11 @@ def from_response_json( else cls.STRING_PYTHON_TYPES_MAPPING[return_type], name=function_name, ) - return hopsworks_udf(*transformation_features) + # TODO : Write proper comments for this use case. If we get a transformation function saved in the feature store then it will not have any specific transformaiton feature other than the ones in the code. + if "" not in transformation_features: + return hopsworks_udf(*transformation_features) + else: + return hopsworks_udf @property def return_type(self): @@ -414,3 +418,13 @@ def transformation_statistics(self, statistics: List[FeatureDescriptiveStatistic self._statistics[ self.statistics_argument_mapping[stat.feature_name] ] = stat + + @property + def transformation_feature_names(self) -> List[str]: + if isinstance(self.return_type, List) and len(self.return_type) > 1: + return [ + f'{self.function_name}<{"-".join(self.transformation_features)}>{{i}}' + for i in range(len(self.return_type)) + ] + else: + return [f'{self.function_name}<{"-".join(self.transformation_features)}>'] diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index a731d604e1..270027a743 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -15,10 +15,11 @@ from __future__ import annotations import json -from typing import Optional +from typing import List, Optional import humps from hsfs import util +from hsfs.client.exceptions import FeatureStoreException from hsfs.core import transformation_function_engine from hsfs.decorators import typechecked from hsfs.hopsworks_udf import HopsworksUdf @@ -48,8 +49,11 @@ def __init__( self._featurestore_id ) ) + if not isinstance(hopsworks_udf, HopsworksUdf): + raise FeatureStoreException( + "Use hopsworks_udf decorator when creating the feature view." + ) self._hopsworks_udf = hopsworks_udf - self._name = hopsworks_udf.function_name self._feature_group_feature_name: Optional[str] = None self._feature_group_id: Optional[int] = None @@ -102,20 +106,29 @@ def plus_one(value): """ self._transformation_function_engine.delete(self) + def __call__(self, *args: List[str]): + self._hopsworks_udf = self._hopsworks_udf(*args) + return self + @classmethod def from_response_json(cls, json_dict): json_decamelized = humps.decamelize(json_dict) - - if json_decamelized.get("hopsworks_udf", False): - json_decamelized["hopsworks_udf"] = HopsworksUdf.from_response_json( - json_decamelized["hopsworks_udf"] - ) - + print(json_decamelized) + # TODO : Clean this up. 
if "count" in json_decamelized: if json_decamelized["count"] == 0: return [] + for tffn_dto in json_decamelized["items"]: + if tffn_dto.get("hopsworks_udf", False): + tffn_dto["hopsworks_udf"] = HopsworksUdf.from_response_json( + tffn_dto["hopsworks_udf"] + ) return [cls(**tffn_dto) for tffn_dto in json_decamelized["items"]] else: + if json_decamelized.get("hopsworks_udf", False): + json_decamelized["hopsworks_udf"] = HopsworksUdf.from_response_json( + json_decamelized["hopsworks_udf"] + ) return cls(**json_decamelized) def update_from_response_json(self, json_dict): @@ -129,7 +142,6 @@ def json(self): def to_dict(self): return { "id": self._id, - "name": self._name, "version": self._version, "featurestoreId": self._featurestore_id, "hopsworksUdf": self._hopsworks_udf, @@ -144,10 +156,6 @@ def id(self) -> id: def id(self, id: int): self._id = id - @property - def name(self) -> str: - return self._name - @property def version(self) -> int: return self._version @@ -156,10 +164,6 @@ def version(self) -> int: def hopsworks_udf(self) -> HopsworksUdf: return self._hopsworks_udf - @name.setter - def name(self, name: str): - self._name = name - @version.setter def version(self, version: int): self._version = version From fceb9b55e9f204eaa2e39048be29b6849244665c Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 2 May 2024 14:37:55 +0200 Subject: [PATCH 08/58] reformatted and documented Hopswork UDF class --- python/hsfs/hopsworks_udf.py | 532 ++++++++++++++++++++++++----------- 1 file changed, 368 insertions(+), 164 deletions(-) diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index d69af6c863..6544e9fdd5 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -18,8 +18,10 @@ import copy import inspect import json +import warnings from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Union +from datetime import date, datetime, time +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import humps from hsfs import engine, util @@ -27,9 +29,37 @@ from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics -def hopsworks_udf(return_type: Union[List[type], type]): +def hopsworks_udf(output_type: Union[List[type], type]): + """ + Create an User Defined Function that can be and used within the Hopsworks Feature Store. + + Hopsworks UDF's are user defined functions that executes as 'pandas_udf' when executing + in spark engine and as pandas functions in the python engine. A Hopsworks udf is defined + using the `hopsworks_udf` decorator. The outputs of the defined UDF must be mentioned in the + decorator as a list of python types. + + + !!! example + ```python + from hsfs.hopsworks_udf import hopsworks_udf + + @hopsworks_udf(float) + def add_one(data1 : pd.Series): + return data1 + 1 + ``` + + # Arguments + output_type: `list`. The output types of the defined UDF + + # Returns + `HopsworksUdf`: The metadata object for hopsworks UDF's. + + # Raises + `hsfs.client.exceptions.FeatureStoreException` : If unable to create UDF. + """ + def wrapper(func: Callable): - udf = HopsworksUdf(func=func, return_type=return_type) + udf = HopsworksUdf(func=func, output_type=output_type) return udf return wrapper @@ -37,6 +67,17 @@ def wrapper(func: Callable): @dataclass class TransformationFeature: + """ + Mapping of feature names to their corresponding statistics argument names in the code. + + The statistic_argument_name for a feature name would be None if the feature does not need statistics. 
+
+    Attributes
+    ----------
+        feature_name (str) : Name of the feature.
+        statistic_argument_name (str) : Name of the statistics argument in the code for the feature specified in the feature name.
+    """
+
     feature_name: str
     statistic_argument_name: Optional[str]
 
     def to_dict(self):
         return {
             "feature_name": self.feature_name,
             "statistic_argument_name": self.statistic_argument_name,
         }
 
 
 class HopsworksUdf:
     """
-    Metadata class to store information about UDF
+    Meta data for user defined functions.
+
+    Stores meta data required to execute the user defined function in both the spark and python engines.
+    The class uses the metadata to dynamically generate user defined functions based on the
+    engine it is executed in.
+
+    Attributes
+    ----------
+        output_type (List[str]) : Output types of the columns returned from the UDF.
+        function_name (str) : Name of the UDF.
+        statistics_required (bool) : True if statistics are required for any of the parameters of the UDF.
+        transformation_statistics (Dict[str, FeatureDescriptiveStatistics]): Dictionary that maps the statistics_argument name in the function to the actual statistics variable.
+        transformation_features (List[str]) : List of feature names to which the transformation function would be applied.
+        statistics_features (List[str]) : List of feature names that require statistics.
     """
 
-    # TODO : Complete this
+    # Mapping for converting python types to spark types - required for creating pandas UDF's.
     PYTHON_SPARK_TYPE_MAPPING = {
         str: "string",
-        int: "int",
+        int: "bigint",
         float: "double",
-        # "timestamp": TimestampType(),
         bool: "boolean",
-        # "date": DateType(),
-        # "binary": BinaryType(),
+        datetime: "timestamp",
+        time: "timestamp",
+        date: "date",
     }
 
-    STRING_PYTHON_TYPES_MAPPING = {"str": str, "int": int, "float": float, "bool": bool}
-
     def __init__(
         self,
         func: Union[Callable, str],
-        return_type: Union[List[type], type],
-        name: str = None,
-        transformation_features: List[TransformationFeature] = None,
+        output_type: Union[List[type], type, List[str], str],
+        name: Optional[str] = None,
+        transformation_features: Optional[List[TransformationFeature]] = None,
     ):
-        if name is None:
-            self._function_name: str = func.__name__
-        else:
-            self._function_name: str = name
-
-        self._statistics: Optional[Dict[str, FeatureDescriptiveStatistics]] = dict()
+        self._output_type: List[str] = HopsworksUdf._validate_and_convert_output_types(
+            output_type
+        )
 
-        self._return_type: Union[List[type], type] = return_type
+        self._function_name: str = func.__name__ if name is None else name
 
-        if isinstance(func, Callable):
-            self._function_source: str = HopsworksUdf._extract_source_code(func)
-        else:
-            self._function_source: str = func
-
-        # TODO : Must clean this up : [Store actual source code]. The actual code without any clean up should be stored in the backed and the cleaned source must be dynamically build up when the data is read from the backend.
- self._original_code = self._function_source + self._function_source: str = ( + HopsworksUdf._extract_source_code(func) + if isinstance(func, Callable) + else func + ) - if transformation_features: - self._transformation_features: List[TransformationFeature] = ( - transformation_features - ) - else: - self._transformation_features: List[TransformationFeature] = ( - HopsworksUdf._extract_function_arguments(self.function_source) - ) + self._transformation_features: List[TransformationFeature] = ( + HopsworksUdf._extract_function_arguments(self._function_source) + if not transformation_features + else transformation_features + ) - self._function_source = self._remove_argument( - self.function_source, "statistics" + self._formatted_function_source = HopsworksUdf._format_source_code( + self._function_source, self._transformation_features ) - HopsworksUdf.validate_arguments(self.return_type) - def get_transformation_features(self): - return self.transformation_features + self._output_column_names: List[str] = self._get_output_column_names() + + self._statistics: Optional[Dict[str, FeatureDescriptiveStatistics]] = None @staticmethod - def validate_arguments(return_type): - if isinstance(return_type, list): - for python_type in return_type: - if not isinstance(python_type, type): - raise FeatureStoreException( - f'Return types provided must be a python type or a list of python types. "{python_type}" is not python type' - ) - else: - if not isinstance(return_type, type): + def _validate_and_convert_output_types( + output_types: Union[List[type], List[str]], + ) -> List[str]: + """ + Function that takes in a type or list of types validates if it is supported and return a list of strings + + # Arguments + output_types: `list`. List of python types. + + # Raises + `hsfs.client.exceptions.FeatureStoreException` : If the any of the output type is invalid + """ + convert_output_types = [] + output_types = ( + output_types if isinstance(output_types, List) else [output_types] + ) + for output_type in output_types: + if ( + output_type not in HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING.keys() + and output_type not in HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING.values() + ): raise FeatureStoreException( - f'Return types provided must be a python type or a list of python types. "{return_type}" is not python type or a list' + f"Output type {output_type} is not supported. Please refer to DOCUMENTATION to get more information on the supported types." ) + convert_output_types.append( + output_type + if isinstance(output_type, str) + else HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[output_type] + ) + return convert_output_types @staticmethod - def _get_module_imports(path): + def _get_module_imports(path: str) -> List[str]: + """Function that extracts the imports used in the python file specified in the path. + + # Arguments + path: `str`. Path to python file from which imports are to be extracted. + + # Returns + `List[str]`: A list of string that contains the import statement using in the file. 
+ """ imports = [] with open(path) as fh: root = ast.parse(fh.read(), path) - for node in ast.iter_child_nodes(root): if isinstance(node, ast.Import): imported_module = False @@ -133,7 +202,6 @@ def _get_module_imports(path): imported_module = node.module else: continue - for n in node.names: if imported_module: import_line = "from " + imported_module + " import " + n.name @@ -145,7 +213,20 @@ def _get_module_imports(path): return imports @staticmethod - def _get_module_path(module_name): + def _get_module_path(module_name: str) -> str: + """ + Function that returns the path to the source code of a python module. + + Cannot extract path if the module is defined in a jupyter notebook since it is currently impossible find the path of a jupyter notebook.(https://github.com/ipython/ipython/issues/10123) + + # Arguments + path: `str`. Path to python file from which imports are to be extracted. + # Raises + AttributeError : If the provided module is defined in a jupyter notebook. + # Returns + `str`: a string that contains the path to the module + """ + def _get_module_path(module): return module.__file__ @@ -156,18 +237,26 @@ def _get_module_path(module): return module_path["path"] @staticmethod - def _extract_source_code(udf_function): + def _extract_source_code(udf_function: Callable) -> str: + """ + Function to extract the source code of the function along with the imports used in the file. + + The module imports cannot be extracted if the function is defined in a jupyter notebook. + + # Arguments + udf_function: `Callable`. Function for which the source code must be extracted. + # Returns + `str`: a string that contains the source code of function along with the extracted module imports. + """ try: module_imports = HopsworksUdf._get_module_imports( HopsworksUdf._get_module_path(udf_function.__module__) ) - except Exception: - module_imports = "" - # TODO : Check if warning is actually required. - # warnings.warn( - # "Passed UDF defined in a Jupyter notebook. Cannot extract dependices from a notebook. Please make sure to import all dependcies for the UDF inside the code.", - # stacklevel=2, - # ) + except AttributeError: + warnings.warn( + "Passed UDF defined in a Jupyter notebook. Cannot extract import dependencies from a notebook. Please make sure to import all dependencies for the UDF inside the function.", + stacklevel=2, + ) function_code = inspect.getsource(udf_function) source_code = "\n".join(module_imports) + "\n" + function_code @@ -175,22 +264,68 @@ def _extract_source_code(udf_function): return source_code @staticmethod - def _extract_function_arguments(source_code): - # Get source code of the original function + def _parse_function_signature(source_code: str) -> Tuple[List[str], str, int, int]: + """ + Function to parse the source code to extract the argument along with the start and end line of the function signature + + # Arguments + source_code: `str`. Source code of a function. 
+ # Returns + `List[str]`: List of function arguments + `str`: function signature + `int`: starting line number of function signature + `int`: ending line number of function signature + + """ source_code = source_code.split("\n") + signature_start_line = None + signature_end_line = None # Find the line where the function signature is defined for i, line in enumerate(source_code): if line.strip().startswith("def "): - signature_line = i + signature_start_line = i + if signature_start_line is not None and ")" in line: + signature_end_line = i break # Parse the function signature to remove the specified argument - signature = source_code[signature_line] + signature = "".join( + [ + code.split("#")[0] + for code in source_code[signature_start_line : signature_end_line + 1] + ] + ) arg_list = signature.split("(")[1].split(")")[0].split(",") + return arg_list, signature, signature_start_line, signature_end_line + + @staticmethod + def _extract_function_arguments(source_code: str) -> List[TransformationFeature]: + """ + Function to extract the argument names from a provided function source code. + + # Arguments + source_code: `str`. Source code of a function. + # Returns + `List[TransformationFeature]`: List of TransformationFeature that provide a mapping from feature names to corresponding statistics parameters if any is present. + """ + # Get source code of the original function + arg_list, _, _, _ = HopsworksUdf._parse_function_signature(source_code) + + if arg_list == [""]: + raise FeatureStoreException( + "No arguments present in the provided user defined function. Please provide at least one argument in the defined user defined function." + ) arg_list = [arg.split(":")[0].strip() for arg in arg_list] + for arg in arg_list: + if arg.startswith("statistics"): + if arg.split("statistics_")[1] not in arg_list: + raise FeatureStoreException( + f"No argument corresponding to statistics parameter '{arg}' present in function definition." + ) + return [ TransformationFeature( arg, f"statistics_{arg}" if f"statistics_{arg}" in arg_list else None @@ -199,152 +334,202 @@ def _extract_function_arguments(source_code): if not arg.startswith("statistics") ] - def _remove_argument(self, source_code: str, arg_to_remove: str): - """ " - Function to remove statistics arguments from passed udf and type hinting. - Statistics arguments are removed since pandas UDF's do not accept extra arguments. - Statistics parameters are dynamically injected into the function scope. + @staticmethod + def _format_source_code( + source_code: str, transformation_features: List[TransformationFeature] + ) -> str: """ + Function that parses the existing source code to remove statistics parameter and remove all decorators and type hints from the function source code. - # Get source code of the original function - source_code = source_code.split("\n") + # Arguments + source_code: `str`. Source code of a function. + transformation_features `List[TransformationFeature]`: List of transformation features provided in the function argument. + # Returns + `str`: Source code that does not contain any decorators, type hints or statistics parameters. 
+ """ - signature_start_line = None - signature_end_line = None - # Find the line where the function signature is defined - for i, line in enumerate(source_code): - if line.strip().startswith("def "): - signature_start_line = i - if signature_start_line is not None and ")" in line: - signature_end_line = i - break + _, signature, _, signature_end_line = HopsworksUdf._parse_function_signature( + source_code + ) - # Parse the function signature to remove the specified argument - signature = "".join(source_code[signature_start_line : signature_end_line + 1]) - arg_list = signature.split("(")[1].split(")")[0].split(",") - arg_list = [ - arg.split(":")[0].strip() - for arg in arg_list - if ( - arg_to_remove not in list(map(str.strip, arg.split(" "))) - and arg_to_remove not in list(map(str.strip, arg.split(":"))) - and arg_to_remove not in list(map(str.strip, arg.split("_"))) - and arg.strip() != arg_to_remove - ) - ] + arg_list = [feature.feature_name for feature in transformation_features] # Reconstruct the function signature new_signature = ( - signature.split("(")[0] - + "(" - + ", ".join(arg_list) - + ")" - + signature.split(")")[1] + signature.split("(")[0].strip() + "(" + ", ".join(arg_list) + "):" ) + source_code = source_code.split("\n") # Reconstruct the modified function as a string modified_source = ( - new_signature + "\n" + "\n".join(source_code[signature_end_line + 1 :]) + new_signature + "\n" + "\n\t".join(source_code[signature_end_line + 1 :]) ) # Define a new function with the modified source code return modified_source - @staticmethod - def get_spark_type(python_type: type): - return HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[python_type] + def _get_output_column_names(self) -> str: + """ + Function that generates feature names for the transformed features + + # Returns + `List[str]`: List of feature names for the transformed columns + """ + if len(self.output_types) > 1: + return [ + f'{self.function_name}_{"-".join(self.transformation_features)}_{i}' + for i in range(len(self.output_types)) + ] + else: + return [f'{self.function_name}_{"-".join(self.transformation_features)}_'] + + def _create_pandas_udf_return_schema_from_list(self) -> str: + """ + Function that creates the return schema required for executing the defined UDF's as pandas UDF's in Spark. - def create_pandas_udf_return_schema_from_list(self, return_types: List[type]): - if isinstance(return_types, List): + # Returns + `str`: DDL-formatted type string that denotes the return types of the user defined function. + """ + if len(self.output_types) > 1: return ", ".join( [ - f'`{self.function_name}<{"-".join(self.transformation_features)}>{i}` {HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types[i]]}' - for i in range(len(return_types)) + f"{self.output_column_names[i]} {self.output_types[i]}" + for i in range(len(self.output_types)) ] ) else: - return f"{HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING[return_types]}" - - def hopsworksUdf_wrapper(self): - # TODO : clean this up - function_source = "\t".join(self.function_source.splitlines(True)) - if ( - isinstance(self.return_type, List) and len(self.return_type) > 1 - ): # TODO : This check must be cleaned up for sure + return self.output_types[0] + + def hopsworksUdf_wrapper(self) -> Callable: + """ + Function that creates a dynamic wrapper function for the defined udf that renames the columns output by the UDF into specified column names. + + The renames is done so that the column names match the schema expected by spark when multiple columns are returned in a pandas udf. 
+ The wrapper function would be available in the main scope of the program. + + # Returns + `Callable`: A wrapper function that renames outputs of the User defined function into specified output column names. + """ + # Defining wrapper function that renames the column names to specific names + if len(self.output_types) > 1: code = f"""def renaming_wrapper(*args): import pandas as pd - {function_source} + {self._formatted_function_source} df = {self.function_name}(*args) - df = df.rename(columns = {{f'{{df.columns[i]}}':f'{self.function_name}<{"-".join(self.transformation_features)}>{{i}}' for i in range(len(df.columns))}}) + df = df.rename(columns = {{df.columns[i]: _output_col_names[i] for i in range(len(df.columns))}}) return df""" else: code = f"""def renaming_wrapper(*args): import pandas as pd - {function_source} + {self._formatted_function_source} df = {self.function_name}(*args) - df = df.rename(f'{self.function_name}<{"-".join(self.transformation_features)}>') + df = df.rename(_output_col_names[0]) return df""" + + # injecting variables into scope used to execute wrapper function. scope = __import__("__main__").__dict__ - scope.update(self.transformation_statistics) + if self.transformation_statistics is not None: + scope.update(self.transformation_statistics) + scope.update({"_output_col_names": self.output_column_names}) + + # executing code exec(code, scope) + + # returning executed function object return eval("renaming_wrapper", scope) - def __call__(self, *args: List[str]): - # TODO : Raise an execption if the number of features are incorrect. - if len(args) != len(self.transformation_features): + def __call__(self, *features: List[str]) -> "HopsworksUdf": + """ + Set features to be passed as arguments to the user defined functions + + # Arguments + features: Name of features to be passed to the User Defined function + # Returns + `HopsworksUdf`: Meta data class for the user defined function. + """ + + if len(features) != len(self.transformation_features): raise FeatureStoreException( "Number of features provided does not match the number of features provided in the UDF definition" ) - for arg in args: + + for arg in features: if not isinstance(arg, str): raise FeatureStoreException( f'Feature names provided must be string "{arg}" is not string' ) - udf = copy.deepcopy( - self - ) # TODO : Clean this copy is needed so that if the uses the same function to multiple feature, if copy not done then all variable would share the same traanformation feature, + # Create a copy of the UDF to associate it with new feature names. + udf = copy.deepcopy(self) + udf._transformation_features = [ TransformationFeature( new_feature_name, transformation_feature.statistic_argument_name ) for transformation_feature, new_feature_name in zip( - self._transformation_features, args + self._transformation_features, features ) ] return udf - def get_udf(self): + def get_udf(self) -> Callable: + """ + Function that checks the current engine type and returns the appropriate UDF. + + In the spark engine the UDF is returned as a pandas UDF. + While in the python engine the UDF is returned as python function. + + # Returns + `Callable`: Pandas UDF in the spark engine otherwise returns a python function for the UDF. 
+ """ if engine.get_type() in ["hive", "python", "training"]: return self.hopsworksUdf_wrapper() else: from pyspark.sql.functions import pandas_udf - # TODO : Make this proper return pandas_udf( f=self.hopsworksUdf_wrapper(), - returnType=self.create_pandas_udf_return_schema_from_list( - self.return_type - ), + returnType=self._create_pandas_udf_return_schema_from_list(), ) - def to_dict(self): + def to_dict(self) -> Dict[str, Any]: + """ + Convert class into a dictionary for json serialization. + + # Returns + `Dict`: Dictionary that contains all data required to json serialize the object. + """ return { "sourceCode": self._original_code, "outputTypes": ",".join( - [python_type.__name__ for python_type in self.return_type] + [python_type.__name__ for python_type in self.output_types] ) - if isinstance(self.return_type, List) - else self.return_type.__name__, + if isinstance(self.output_types, List) + else self.output_types.__name__, "transformationFeatures": self.transformation_features, "name": self._function_name, } def json(self) -> str: + """ + Json serialize object. + + # Returns + `str`: Json serialized object. + """ return json.dumps(self, cls=util.FeatureStoreEncoder) @classmethod def from_response_json( cls: "HopsworksUdf", json_dict: Dict[str, Any] ) -> "HopsworksUdf": + """ + Function that deserializes json obtained from the java backend. + + # Arguments + json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. + # Returns + `HopsworksUdf`: Json deserialized class object. + """ + json_decamelized = humps.decamelize(json_dict) function_source_code = json_decamelized["source_code"] function_name = json_decamelized["name"] @@ -361,41 +546,55 @@ def from_response_json( else cls.STRING_PYTHON_TYPES_MAPPING[return_type], name=function_name, ) - # TODO : Write proper comments for this use case. If we get a transformation function saved in the feature store then it will not have any specific transformaiton feature other than the ones in the code. + + # Set transformation features if already set. if "" not in transformation_features: return hopsworks_udf(*transformation_features) else: return hopsworks_udf @property - def return_type(self): - return self._return_type + def output_types(self) -> List[str]: + """Get the output types of the UDF""" + return self._output_type @property - def function_name(self): + def function_name(self) -> str: + """Get the function name of the UDF""" return self._function_name @property - def function_source(self): - return self._function_source - - @property - def statistics_required(self): + def statistics_required(self) -> bool: + """Get if statistics for any feature is required by the UDF""" return bool(self.statistics_features) @property - def transformation_statistics(self): + def transformation_statistics( + self, + ) -> Optional[Dict[str, FeatureDescriptiveStatistics]]: + """Feature statistics required for the defined UDF""" return self._statistics @property - def transformation_features(self): + def output_column_names(self) -> List[str]: + """Output columns names of the transformation function""" + return self._output_column_names + + @property + def transformation_features(self) -> List[str]: + """ + List of feature names to be used in the User Defined Function. 
+ """ return [ transformation_feature.feature_name for transformation_feature in self._transformation_features ] @property - def statistics_features(self): + def statistics_features(self) -> List[str]: + """ + list of feature names that require statistics + """ return [ transformation_feature.feature_name for transformation_feature in self._transformation_features @@ -403,28 +602,33 @@ def statistics_features(self): ] @property - def statistics_argument_mapping(self): + def _statistics_argument_mapping(self) -> Dict[str, str]: + """ + Dictionary that maps feature names to the statistics arguments names in the User defined function. + """ return { transformation_feature.feature_name: transformation_feature.statistic_argument_name for transformation_feature in self._transformation_features } @transformation_statistics.setter - def transformation_statistics(self, statistics: List[FeatureDescriptiveStatistics]): - # TODO : Clean this up + def transformation_statistics( + self, statistics: List[FeatureDescriptiveStatistics] + ) -> None: self._statistics = dict() for stat in statistics: - if stat.feature_name in self.statistics_argument_mapping.keys(): + if stat.feature_name in self._statistics_argument_mapping.keys(): self._statistics[ - self.statistics_argument_mapping[stat.feature_name] + self._statistics_argument_mapping[stat.feature_name] ] = stat - @property - def transformation_feature_names(self) -> List[str]: - if isinstance(self.return_type, List) and len(self.return_type) > 1: - return [ - f'{self.function_name}<{"-".join(self.transformation_features)}>{{i}}' - for i in range(len(self.return_type)) - ] + @output_column_names.setter + def output_column_names(self, output_col_names: Union[str, List[str]]) -> None: + if not isinstance(output_col_names, List): + output_col_names = [output_col_names] + if len(output_col_names) != len(self.output_types): + raise FeatureStoreException( + f"Provided names for output columns does not match the number of columns returned from the UDF. Please provide {len(self.output_types)} names." + ) else: - return [f'{self.function_name}<{"-".join(self.transformation_features)}>'] + self._output_column_names = output_col_names From 52167f1a930c9ce47d1f19b36dcfb9c34aff75c2 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 3 May 2024 10:02:24 +0200 Subject: [PATCH 09/58] unit tests for transformation functions --- python/hsfs/hopsworks_udf.py | 40 ++-- python/hsfs/transformation_function.py | 105 ++++++++--- .../transformation_function_fixtures.json | 90 ++++++--- python/tests/test_transformation_function.py | 174 +++++++++++++----- 4 files changed, 300 insertions(+), 109 deletions(-) diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 6544e9fdd5..b56efb2c5a 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -27,9 +27,10 @@ from hsfs import engine, util from hsfs.client.exceptions import FeatureStoreException from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics +from hsfs.decorators import typechecked -def hopsworks_udf(output_type: Union[List[type], type]): +def hopsworks_udf(output_type: Union[List[type], type]) -> "HopsworksUdf": """ Create an User Defined Function that can be and used within the Hopsworks Feature Store. @@ -58,8 +59,8 @@ def add_one(data1 : pd.Series): `hsfs.client.exceptions.FeatureStoreException` : If unable to create UDF. 
""" - def wrapper(func: Callable): - udf = HopsworksUdf(func=func, output_type=output_type) + def wrapper(func: Callable) -> HopsworksUdf: + udf = HopsworksUdf(func=func, output_types=output_type) return udf return wrapper @@ -81,13 +82,14 @@ class TransformationFeature: feature_name: str statistic_argument_name: Optional[str] - def to_dict(self): + def to_dict(self) -> Dict[str, Any]: return { "feature_name": self.feature_name, "statistic_argument_name": self.statistic_argument_name, } +@typechecked class HopsworksUdf: """ Meta data for user defined functions. @@ -120,12 +122,12 @@ class HopsworksUdf: def __init__( self, func: Union[Callable, str], - output_type: Union[List[type], type, List[str], str], + output_types: Union[List[type], type, List[str], str], name: Optional[str] = None, transformation_features: Optional[List[TransformationFeature]] = None, ): - self._output_type: List[str] = HopsworksUdf._validate_and_convert_output_types( - output_type + self._output_types: List[str] = HopsworksUdf._validate_and_convert_output_types( + output_types ) self._function_name: str = func.__name__ if name is None else name @@ -253,6 +255,7 @@ def _extract_source_code(udf_function: Callable) -> str: HopsworksUdf._get_module_path(udf_function.__module__) ) except AttributeError: + module_imports = [""] warnings.warn( "Passed UDF defined in a Jupyter notebook. Cannot extract import dependencies from a notebook. Please make sure to import all dependencies for the UDF inside the function.", stacklevel=2, @@ -445,6 +448,8 @@ def __call__(self, *features: List[str]) -> "HopsworksUdf": features: Name of features to be passed to the User Defined function # Returns `HopsworksUdf`: Meta data class for the user defined function. + # Raises + `FeatureStoreException: If the provided number of features do not match the number of arguments in the defined UDF or if the provided feature names are not strings. """ if len(features) != len(self.transformation_features): @@ -533,18 +538,17 @@ def from_response_json( json_decamelized = humps.decamelize(json_dict) function_source_code = json_decamelized["source_code"] function_name = json_decamelized["name"] - return_type = json_decamelized["output_types"].split(",") - transformation_features = json_decamelized["transformation_features"].split(",") + output_types = [ + output_type.strip() + for output_type in json_decamelized["output_types"].split(",") + ] + transformation_features = [ + feature.strip() + for feature in json_decamelized["transformation_features"].split(",") + ] hopsworks_udf = cls( - func=function_source_code, - return_type=[ - cls.STRING_PYTHON_TYPES_MAPPING[python_type] - for python_type in return_type - ] - if isinstance(return_type, List) - else cls.STRING_PYTHON_TYPES_MAPPING[return_type], - name=function_name, + func=function_source_code, output_types=output_types, name=function_name ) # Set transformation features if already set. 
@@ -556,7 +560,7 @@ def from_response_json( @property def output_types(self) -> List[str]: """Get the output types of the UDF""" - return self._output_type + return self._output_types @property def function_name(self) -> str: diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index 270027a743..1ba52dea4e 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -15,7 +15,7 @@ from __future__ import annotations import json -from typing import List, Optional +from typing import Any, Dict, List, Optional import humps from hsfs import util @@ -27,22 +27,31 @@ @typechecked class TransformationFunction: + """ + Main DTO class for transformation functions. + + Attributes + ---------- + id (int) : Id of transformation function. + version (int) : Version of transformation function. + hopsworks_udf (HopsworksUdf): Meta data class for user defined functions. + """ + def __init__( self, featurestore_id: int, hopsworks_udf: HopsworksUdf, version: Optional[int] = None, id: Optional[int] = None, - # TODO : Check if the below are actually needed type=None, items=None, count=None, href=None, **kwargs, ): - self._id = id - self._featurestore_id = featurestore_id - self._version = version + self._id: int = id + self._featurestore_id: int = featurestore_id + self._version: int = version self._transformation_function_engine = ( transformation_function_engine.TransformationFunctionEngine( @@ -51,25 +60,26 @@ def __init__( ) if not isinstance(hopsworks_udf, HopsworksUdf): raise FeatureStoreException( - "Use hopsworks_udf decorator when creating the feature view." + "Please use the hopsworks_udf decorator when defining transformation functions." ) - self._hopsworks_udf = hopsworks_udf - self._feature_group_feature_name: Optional[str] = None - self._feature_group_id: Optional[int] = None - def save(self): + self._hopsworks_udf: HopsworksUdf = hopsworks_udf + + def save(self) -> None: """Persist transformation function in backend. !!! example ```python + # import hopsworks udf decorator + from hsfs.hopsworks_udf import HopsworksUdf # define function + @hopsworks_udf(int) def plus_one(value): return value + 1 # create transformation function plus_one_meta = fs.create_transformation_function( transformation_function=plus_one, - output_type=int, version=1 ) @@ -79,19 +89,21 @@ def plus_one(value): """ self._transformation_function_engine.save(self) - def delete(self): + def delete(self) -> None: """Delete transformation function from backend. !!! example ```python + # import hopsworks udf decorator + from hsfs.hopsworks_udf import HopsworksUdf # define function + @hopsworks_udf(int) def plus_one(value): return value + 1 # create transformation function plus_one_meta = fs.create_transformation_function( transformation_function=plus_one, - output_type=int, version=1 ) # persist transformation function in backend @@ -106,15 +118,32 @@ def plus_one(value): """ self._transformation_function_engine.delete(self) - def __call__(self, *args: List[str]): - self._hopsworks_udf = self._hopsworks_udf(*args) + def __call__(self, *features: List[str]) -> TransformationFunction: + """ + Update the feature to be using in the transformation function + + # Arguments + features: Name of features to be passed to the User Defined function + # Returns + `HopsworksUdf`: Meta data class for the user defined function. 
+ # Raises + `FeatureStoreException: If the provided number of features do not match the number of arguments in the defined UDF or if the provided feature names are not strings. + """ + self._hopsworks_udf = self._hopsworks_udf(*features) return self @classmethod - def from_response_json(cls, json_dict): + def from_response_json(cls, json_dict: Dict[str, Any]) -> TransformationFunction: + """ + Function that deserializes json obtained from the java backend. + + # Arguments + json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. + # Returns + `TransformationFunction`: Json deserialized class object. + """ json_decamelized = humps.decamelize(json_dict) - print(json_decamelized) - # TODO : Clean this up. + if "count" in json_decamelized: if json_decamelized["count"] == 0: return [] @@ -131,15 +160,37 @@ def from_response_json(cls, json_dict): ) return cls(**json_decamelized) - def update_from_response_json(self, json_dict): + def update_from_response_json( + self, json_dict: Dict[str, Any] + ) -> TransformationFunction: + """ + Function that updates class based on the response obtained from the java backend. + + # Arguments + json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. + # Returns + `TransformationFunction`: Json deserialized class object. + """ json_decamelized = humps.decamelize(json_dict) self.__init__(**json_decamelized) return self - def json(self): + def json(self) -> str: + """ + Json serialize object. + + # Returns + `str`: Json serialized object. + """ return json.dumps(self, cls=util.FeatureStoreEncoder) - def to_dict(self): + def to_dict(self) -> Dict[str, Any]: + """ + Convert class into a dictionary for json serialization. + + # Returns + `Dict`: Dictionary that contains all data required to json serialize the object. 
+ """ return { "id": self._id, "version": self._version, @@ -153,17 +204,19 @@ def id(self) -> id: return self._id @id.setter - def id(self, id: int): + def id(self, id: int) -> None: self._id = id @property def version(self) -> int: + """Version of the transformation function.""" return self._version + @version.setter + def version(self, version: int) -> None: + self._version = version + @property def hopsworks_udf(self) -> HopsworksUdf: + """Meta data class for the user defined transformation function.""" return self._hopsworks_udf - - @version.setter - def version(self, version: int): - self._version = version diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 504671dffc..98017a07c5 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ b/python/tests/fixtures/transformation_function_fixtures.json @@ -1,16 +1,54 @@ { - "get": { + "get_one_argument_no_statistics_function": { "response": { - "featurestore_id": 11, - "transformation_fn": null, - "version": 1, - "name": "test_name", - "source_code_content": "test_source_code_content", - "builtin_source_code": "test_builtin_source_code", - "output_type": "float", - "id": 43, - "type": "transformationFunctionTDO", - "href": "test_href" + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":"double", + "transformationFeatures":"col1" + } + } + }, + "get_one_argument_with_statistics_function": { + "response": { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "name": "add_mean_fs", + "outputTypes":"double", + "transformationFeatures":"data" + } + } + }, + "get_multiple_argument_with_statistics_function": { + "response": { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(str)\ndef test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return data1 + statistics_data1.mean\n", + "name": "test_func", + "outputTypes":"string", + "transformationFeatures":"feature1, feature2, feature3" + } + } + }, + "get_multiple_return_type_functions": { + "response": { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(str, float)\ndef test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n", + "name": "test_func", + "outputTypes":"string, double", + "transformationFeatures":"feature1, feature2, feature3" + } } }, "get_basic_info": { @@ -23,16 +61,26 @@ "count": 1, "items": [ { - "featurestore_id": 11, - "transformation_fn": null, + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "name": "add_mean_fs", + "outputTypes":"double", + "transformationFeatures":"data" + } + }, + { + "id" : 2, "version": 1, - "name": "test_name", - "source_code_content": "test_source_code_content", - "builtin_source_code": "test_builtin_source_code", - "output_type": "float", - "id": 43, - "type": "transformationFunctionTDO", - "href": "test_href" + "featurestoreId": 11, + "hopsworksUdf":{ + 
"sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":"double", + "transformationFeatures":"col1" + } } ] } @@ -43,4 +91,4 @@ "items": [] } } -} \ No newline at end of file +} diff --git a/python/tests/test_transformation_function.py b/python/tests/test_transformation_function.py index 41123ff791..0d1f29f346 100644 --- a/python/tests/test_transformation_function.py +++ b/python/tests/test_transformation_function.py @@ -15,85 +15,171 @@ # -from hsfs import transformation_function +from hsfs.transformation_function import TransformationFunction class TestTransformationFunction: - def test_from_response_json(self, backend_fixtures): + def test_from_response_json_one_argument_no_statistics(self, backend_fixtures): # Arrange - json = backend_fixtures["transformation_function"]["get"]["response"] + json = backend_fixtures["transformation_function"][ + "get_one_argument_no_statistics_function" + ]["response"] # Act - tf = transformation_function.TransformationFunction.from_response_json(json) + tf = TransformationFunction.from_response_json(json) # Assert - assert tf.id == 43 + assert tf.id == 1 assert tf._featurestore_id == 11 - assert tf.version == 1 - assert tf.name == "test_name" - assert tf.transformation_fn is None - assert tf.output_type == "FLOAT" + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "add_one_fs" + assert tf.hopsworks_udf.output_types == ["double"] + assert not tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["col1"] + assert tf.hopsworks_udf.statistics_features == [] assert ( - tf.source_code_content - == '{"module_imports": "", "transformer_code": "test_builtin_source_code"}' + tf.hopsworks_udf._function_source + == "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" ) - assert tf._feature_group_feature_name is None - assert tf._feature_group_id is None - def test_from_response_json_basic_info(self, mocker, backend_fixtures): + def test_from_response_json_one_argument_with_statistics(self, backend_fixtures): # Arrange - mocker.patch( - "hsfs.transformation_function.TransformationFunction._load_source_code" + json = backend_fixtures["transformation_function"][ + "get_one_argument_with_statistics_function" + ]["response"] + + # Act + tf = TransformationFunction.from_response_json(json) + + # Assert + assert tf.id == 1 + assert tf._featurestore_id == 11 + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "add_mean_fs" + assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["data"] + assert tf.hopsworks_udf.statistics_features == ["data"] + assert ( + tf.hopsworks_udf._function_source + == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" ) - json = backend_fixtures["transformation_function"]["get_basic_info"]["response"] + + def test_from_response_json_multiple_argument_with_statistics( + self, backend_fixtures + ): + # Arrange + json = backend_fixtures["transformation_function"][ + "get_multiple_argument_with_statistics_function" + ]["response"] # Act - tf = transformation_function.TransformationFunction.from_response_json(json) + tf = TransformationFunction.from_response_json(json) # Assert - assert tf.id is None + assert tf.id == 1 assert tf._featurestore_id == 11 - assert tf.version is None - assert tf.name is None - 
assert tf.transformation_fn is None - assert tf.output_type == "STRING" - assert tf.source_code_content is None - assert tf._feature_group_feature_name is None - assert tf._feature_group_id is None + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "test_func" + assert tf.hopsworks_udf.output_types == ["string"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == [ + "feature1", + "feature2", + "feature3", + ] + assert tf.hopsworks_udf.statistics_features == ["feature1", "feature2"] + assert ( + tf.hopsworks_udf._function_source + == "\n@hopsworks_udf(str)\ndef test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return data1 + statistics_data1.mean\n" + ) + + def test_from_response_json_multiple_return_type_functions(self, backend_fixtures): + # Arrange + json = backend_fixtures["transformation_function"][ + "get_multiple_return_type_functions" + ]["response"] + + # Act + tf = TransformationFunction.from_response_json(json) + + # Assert + assert tf.id == 1 + assert tf._featurestore_id == 11 + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "test_func" + assert tf.hopsworks_udf.output_types == ["string", "double"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == [ + "feature1", + "feature2", + "feature3", + ] + assert tf.hopsworks_udf.statistics_features == ["feature1", "feature2"] + assert ( + tf.hopsworks_udf._function_source + == "\n@hopsworks_udf(str, float)\ndef test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n" + ) + + def test_from_response_json_list_empty(self, backend_fixtures): + # Arrange + json = backend_fixtures["transformation_function"]["get_list_empty"]["response"] + + # Act + tf_list = TransformationFunction.from_response_json(json) + + # Assert + assert len(tf_list) == 0 def test_from_response_json_list(self, backend_fixtures): # Arrange json = backend_fixtures["transformation_function"]["get_list"]["response"] # Act - tf_list = transformation_function.TransformationFunction.from_response_json( - json - ) + tf_list = TransformationFunction.from_response_json(json) # Assert - assert len(tf_list) == 1 + assert len(tf_list) == 2 tf = tf_list[0] - assert tf.id == 43 + assert tf.id == 1 + assert tf._featurestore_id == 11 + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "add_mean_fs" + assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["data"] + assert tf.hopsworks_udf.statistics_features == ["data"] + assert ( + tf.hopsworks_udf._function_source + == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" + ) + + tf = tf_list[1] + assert tf.id == 2 assert tf._featurestore_id == 11 assert tf.version == 1 - assert tf.name == "test_name" - assert tf.transformation_fn is None - assert tf.output_type == "FLOAT" + assert tf.hopsworks_udf.function_name == "add_one_fs" + assert tf.hopsworks_udf.output_types == ["double"] + assert not tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["col1"] + assert tf.hopsworks_udf.statistics_features == [] assert ( - tf.source_code_content - == '{"module_imports": "", "transformer_code": "test_builtin_source_code"}' + tf.hopsworks_udf._function_source + == 
"\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" ) - assert tf._feature_group_feature_name is None - assert tf._feature_group_id is None - def test_from_response_json_list_empty(self, backend_fixtures): + +""" + def test_from_response_json_basic_info(self, mocker, backend_fixtures): # Arrange - json = backend_fixtures["transformation_function"]["get_list_empty"]["response"] + json = backend_fixtures["transformation_function"]["get_basic_info"]["response"] # Act - tf_list = transformation_function.TransformationFunction.from_response_json( - json - ) + tf = TransformationFunction.from_response_json(json) # Assert - assert len(tf_list) == 0 + assert tf.id is None + assert tf._featurestore_id == 11 + assert tf.version is None + assert tf.hopsworks_udf is None +""" From a66f9e3d32c0e65cf597dd367c4f2ec1a7f4f59b Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 3 May 2024 16:06:05 +0200 Subject: [PATCH 10/58] clearning transformations engine and adding unit tests --- .../core/transformation_function_engine.py | 181 +- python/hsfs/transformation_function.py | 10 +- .../test_transformation_function_engine.py | 1461 ++--------------- python/tests/test_transformation_function.py | 38 +- 4 files changed, 257 insertions(+), 1433 deletions(-) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index e99b79672a..0ad86f0c53 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -15,12 +15,19 @@ # from __future__ import annotations -import datetime -from functools import partial -from typing import Dict, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union -import hsfs -import numpy +from hsfs import training_dataset +from hsfs.core import statistics_api, transformation_function_api + + +if TYPE_CHECKING: + import pandas as pd + import polars as pl + import pyspark.sql as ps + from hsfs.feature_view import FeatureView + from hsfs.statistics import Statistics + from hsfs.transformation_function import TransformationFunction from hsfs import ( feature_view, @@ -53,27 +60,58 @@ class TransformationFunctionEngine: def __init__(self, feature_store_id: int): self._feature_store_id = feature_store_id - self._transformation_function_api = ( - transformation_function_api.TransformationFunctionApi(feature_store_id) + self._transformation_function_api: transformation_function_api.TransformationFunctionApi = transformation_function_api.TransformationFunctionApi( + feature_store_id ) - self._statistics_api = statistics_api.StatisticsApi( - feature_store_id, training_dataset.TrainingDataset.ENTITY_TYPE + self._statistics_api: statistics_api.StatisticsApi = ( + statistics_api.StatisticsApi( + feature_store_id, training_dataset.TrainingDataset.ENTITY_TYPE + ) ) self._feature_view_api: Optional["feature_view_api.FeatureViewApi"] = None self._statistics_engine: Optional["statistics_engine.StatisticsEngine"] = None - def save(self, transformation_fn_instance: TransformationFunction): + def save( + self, transformation_fn_instance: TransformationFunction + ) -> TransformationFunction: + """ + Save a transformation function into the feature store. + + # Argument + transformation_fn_instance `TransformationFunction`: The transformation function to be saved into the feature store. 
+ """
 self._transformation_function_api.register_transformation_fn(
 transformation_fn_instance
 )
 
- def get_transformation_fn(self, name, version=None):
+ def get_transformation_fn(
+ self, name: str, version: Optional[int] = None
+ ) -> Union[TransformationFunction, List[TransformationFunction]]:
+ """
+ Retrieve a transformation function from the feature store.
+
+ If only the name of the transformation function is provided then all the versions of the transformation function are returned as a list.
+ If neither name nor version is provided then all transformation functions saved in the feature store are returned.
+
+ # Arguments
+ name `str`: The name of the transformation function to be retrieved.
+ version `Optional[int]`: The version of the transformation function to be retrieved.
+ # Returns
+ `Union[TransformationFunction, List[TransformationFunction]]`: A transformation function if both name and version are provided; a list of transformation functions if only the name is provided.
+ """
+
 transformation_fn_instances = (
 self._transformation_function_api.get_transformation_fn(name, version)
 )
- return transformation_fn_instances[0]
+ return transformation_fn_instances
+
+ def get_transformation_fns(self) -> List[TransformationFunction]:
+ """
+ Get all the transformation functions in the feature store.
 
- def get_transformation_fns(self):
+ # Returns
+ `List[TransformationFunction]`: A list of transformation functions.
+ """
 transformation_fn_instances = (
 self._transformation_function_api.get_transformation_fn(
 name=None, version=None
@@ -86,89 +124,71 @@ def get_transformation_fns(self):
 transformation_fns.append(transformation_fn_instance)
 return transformation_fns
 
- def delete(self, transformation_function_instance):
- self._transformation_function_api.delete(transformation_function_instance)
-
- def get_td_transformation_fn(self, training_dataset):
- attached_transformation_fns = (
- self._transformation_function_api.get_td_transformation_fn(training_dataset)
- )
- transformation_fn_dict = {}
- for attached_transformation_fn in attached_transformation_fns:
- transformation_fn_dict[attached_transformation_fn.name] = (
- attached_transformation_fn.transformation_function
- )
- return transformation_fn_dict
-
- @staticmethod
- def infer_spark_type(output_type):
- # TODO : Move to hopsworks_udf
- if not output_type:
- return "STRING" # STRING is default type for spark udfs
-
- if isinstance(output_type, str):
- if output_type.endswith("Type()"):
- return util.translate_legacy_spark_type(output_type)
- output_type = output_type.lower()
+ def delete(self, transformation_function_instance: TransformationFunction) -> None:
+ """
+ Delete a transformation function from the feature store.
- if output_type in (str, "str", "string"):
- return "STRING"
- elif output_type in (bytes, "binary"):
- return "BINARY"
- elif output_type in (numpy.int8, "int8", "byte", "tinyint"):
- return "BYTE"
- elif output_type in (numpy.int16, "int16", "short", "smallint"):
- return "SHORT"
- elif output_type in (int, "int", "integer", numpy.int32):
- return "INT"
- elif output_type in (numpy.int64, "int64", "long", "bigint"):
- return "LONG"
- elif output_type in (float, "float"):
- return "FLOAT"
- elif output_type in (numpy.float64, "float64", "double"):
- return "DOUBLE"
- elif output_type in (
- datetime.datetime,
- numpy.datetime64,
- "datetime",
- "timestamp",
- ):
- return "TIMESTAMP"
- elif output_type in (datetime.date, "date"):
- return "DATE"
- elif output_type in (bool, "boolean", "bool"):
- return "BOOLEAN"
- else:
- raise TypeError("Not supported type %s." % output_type)
-
- # TODO : about statistics computation and fetching.
+ # Arguments
+ transformation_function_instance `TransformationFunction`: The transformation function to be removed from the feature store.
+ """
+ self._transformation_function_api.delete(transformation_function_instance)
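A rough sketch of the retrieval and deletion paths described by the docstrings above, assuming a feature store id of `99` and a previously saved function named `plus_one` (both placeholders):

```python
from hsfs.core import transformation_function_engine

tf_engine = transformation_function_engine.TransformationFunctionEngine(99)

# All versions of "plus_one" as a list, or one function when a version is given.
all_plus_one = tf_engine.get_transformation_fn(name="plus_one")
plus_one_v1 = tf_engine.get_transformation_fn(name="plus_one", version=1)

# Every transformation function registered in the feature store.
all_fns = tf_engine.get_transformation_fns()

# Remove a transformation function from the feature store.
tf_engine.delete(plus_one_v1)
```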
 
- # TODO : Think about what to do with label encoder features.
 @staticmethod
 def compute_transformation_fn_statistics(
- training_dataset_obj,
- builtin_tffn_features,
- label_encoder_features,
- feature_dataframe,
- feature_view_obj,
- ) -> statistics.Statistics:
+ training_dataset_obj: training_dataset.TrainingDataset,
+ statistics_features: List[str],
+ label_encoder_features: List[str],
+ feature_dataframe: Union[pd.DataFrame, pl.DataFrame, ps.DataFrame],
+ feature_view_obj: FeatureView,
+ ) -> Statistics:
+ """
+ Compute the statistics required for a training dataset object.
+
+ # Arguments
+ training_dataset_obj `TrainingDataset`: The training dataset for which the statistics are to be computed.
+ statistics_features `List[str]`: The list of features for which the statistics should be computed.
+ label_encoder_features `List[str]`: Features used for label encoding.
+ feature_dataframe `Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]`: The dataframe that contains the data for which the statistics must be computed.
+ feature_view_obj `FeatureView`: The feature view in which the training data is being created.
+ # Returns
+ `Statistics`: The statistics object that contains the statistics for each feature.
+ """
 return training_dataset_obj._statistics_engine.compute_transformation_fn_statistics(
 td_metadata_instance=training_dataset_obj,
- columns=builtin_tffn_features, # excluding label encoded features
+ columns=statistics_features,
 label_encoder_features=label_encoder_features, # label encoded features only
 feature_dataframe=feature_dataframe,
 feature_view_obj=feature_view_obj,
 )
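For orientation, a sketch of how this static helper is meant to be invoked; the training dataset `td`, feature view `fv`, and feature names are placeholders, not objects defined in this patch:

```python
import pandas as pd

from hsfs.core.transformation_function_engine import TransformationFunctionEngine

df = pd.DataFrame({"feature1": [1, 2], "feature2": [3.0, 4.0]})  # toy data

# Statistics are computed once on the training data and later injected into
# each UDF by compute_and_set_feature_statistics (defined below).
stats = TransformationFunctionEngine.compute_transformation_fn_statistics(
    training_dataset_obj=td,
    statistics_features=["feature1", "feature2"],
    label_encoder_features=[],
    feature_dataframe=df,
    feature_view_obj=fv,
)
```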
 
 @staticmethod
- def add_feature_statistics(training_dataset, feature_view_obj, dataset):
- # TODO : Optimize this code portion check which i better computing all transformation feature statistics together or one by one.
- statistics_features = set()
+ def compute_and_set_feature_statistics(
+ training_dataset: training_dataset.TrainingDataset,
+ feature_view_obj: FeatureView,
+ dataset: Union[
+ Dict[str, Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]],
+ Union[pd.DataFrame, pl.DataFrame, ps.DataFrame],
+ ],
+ ) -> None:
+ """
+ Function that computes and sets the statistics required by the UDFs used for transformations.
+
+ The function assigns the computed statistics to the Hopsworks UDF objects so that the statistics can be used when the UDFs are executed.
+
+ # Arguments
+ training_dataset `TrainingDataset`: The training dataset for which the statistics are to be computed.
+ feature_view_obj `FeatureView`: The feature view in which the training data is being created.
+ dataset `Union[Dict[str, Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]], Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]]`: A dataframe that contains the training data, or a dictionary that contains both the training and test data.
+ """
+ statistics_features: Set[str] = set()
+
+ # Finding the features for which statistics are required
 for transformation_function in feature_view_obj.transformation_functions:
 statistics_features.update(
 transformation_function.hopsworks_udf.statistics_features
 )
 
+ # compute statistics on training data
 if training_dataset.splits:
 # compute statistics before transformations are applied
 stats = TransformationFunctionEngine.compute_transformation_fn_statistics(
@@ -179,7 +199,6 @@ def add_feature_statistics(training_dataset, feature_view_obj, dataset):
 feature_view_obj,
 )
 else:
- # compute statistics before transformations are applied
 stats = TransformationFunctionEngine.compute_transformation_fn_statistics(
 training_dataset,
 list(statistics_features),
@@ -187,6 +206,8 @@ def add_feature_statistics(training_dataset, feature_view_obj, dataset):
 dataset,
 feature_view_obj,
 )
+
+ # Set the computed statistics in the Hopsworks UDFs
 for transformation_function in feature_view_obj.transformation_functions:
 transformation_function.hopsworks_udf.transformation_statistics = (
 stats.feature_descriptive_statistics
 )
diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py
index 1ba52dea4e..0b209bf5c4 100644
--- a/python/hsfs/transformation_function.py
+++ b/python/hsfs/transformation_function.py
@@ -15,7 +15,7 @@
 from __future__ import annotations
 
 import json
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 import humps
 from hsfs import util
@@ -66,7 +66,7 @@ def __init__(
 self._hopsworks_udf: HopsworksUdf = hopsworks_udf
 
 def save(self) -> None:
- """Persist transformation function in backend.
+ """Save a transformation function into the backend.
 
 !!! example
 ```python
@@ -123,7 +123,7 @@ def __call__(self, *features: List[str]) -> TransformationFunction:
 Update the features to be used in the transformation function.
 
 # Arguments
- features: Names of the features to be passed to the user defined function.
+ features: `List[str]`. Names of the features to be passed to the user defined function.
 # Returns
 `TransformationFunction`: The transformation function with the updated features.
 # Raises
@@ -133,7 +133,9 @@ def __call__(self, *features: List[str]) -> TransformationFunction:
 return self
 
 @classmethod
- def from_response_json(cls, json_dict: Dict[str, Any]) -> TransformationFunction:
+ def from_response_json(
+ cls, json_dict: Dict[str, Any]
+ ) -> Union[TransformationFunction, List[TransformationFunction]]:
 """
 Function that deserializes json obtained from the Java backend.
 
diff --git a/python/tests/core/test_transformation_function_engine.py b/python/tests/core/test_transformation_function_engine.py
index fcbb85ab21..ff3c4f4f85 100644
--- a/python/tests/core/test_transformation_function_engine.py
+++ b/python/tests/core/test_transformation_function_engine.py
@@ -14,10 +14,7 @@
 # limitations under the License.
# -import datetime - -import numpy -import pytest +import pandas as pd from hsfs import ( engine, feature, @@ -25,11 +22,9 @@ feature_view, training_dataset, transformation_function, - transformation_function_attached, ) -from hsfs.client.exceptions import FeatureStoreException -from hsfs.constructor.query import Query from hsfs.core import transformation_function_engine +from hsfs.hopsworks_udf import hopsworks_udf fg1 = feature_group.FeatureGroup( @@ -88,9 +83,6 @@ def test_save(self, mocker): # Arrange feature_store_id = 99 - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) mock_tf_api = mocker.patch( "hsfs.core.transformation_function_api.TransformationFunctionApi" ) @@ -99,61 +91,25 @@ def test_save(self, mocker): feature_store_id ) - tf = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf_name" - ) - - # Act - with pytest.raises(ValueError) as e_info: - tf_engine.save(transformation_fn_instance=tf) - - # Assert - assert mock_tf_api.return_value.register_transformation_fn.call_count == 0 - assert ( - str(e_info.value) - == "Transformation function name 'tf_name' with version 1 is reserved for built-in " - "hsfs functions. Please use other name or version" - ) - - def test_save_is_builtin(self, mocker): - # Arrange - feature_store_id = 99 - - mock_tf_engine_is_builtin = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) - mock_tf_api = mocker.patch( - "hsfs.core.transformation_function_api.TransformationFunctionApi" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) + @hopsworks_udf(int) + def testFunction(col1): + return col1 + 1 tf = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf_name" + feature_store_id, + hopsworks_udf=testFunction, ) - mock_tf_engine_is_builtin.return_value = False - # Act - with pytest.raises(ValueError) as e_info: - tf_engine.save(transformation_fn_instance=tf) + tf_engine.save(transformation_fn_instance=tf) # Assert - assert mock_tf_api.return_value.register_transformation_fn.call_count == 0 - assert str(e_info.value) == "transformer must be callable" + assert mock_tf_api.return_value.register_transformation_fn.call_count == 1 - def test_save_is_builtin_callable(self, mocker): + def test_get_transformation_fn(self, mocker): # Arrange feature_store_id = 99 - mocker.patch( - "hsfs.transformation_function.TransformationFunction._extract_source_code" - ) - mock_tf_engine_is_builtin = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) mock_tf_api = mocker.patch( "hsfs.core.transformation_function_api.TransformationFunctionApi" ) @@ -162,43 +118,25 @@ def test_save_is_builtin_callable(self, mocker): feature_store_id ) - def testFunction(): - print("Test") + @hopsworks_udf(int) + def testFunction1(col1): + return col1 + 1 - tf = transformation_function.TransformationFunction( + tf1 = transformation_function.TransformationFunction( feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", + hopsworks_udf=testFunction1, ) - mock_tf_engine_is_builtin.return_value = False - - # Act - tf_engine.save(transformation_fn_instance=tf) - - # Assert - assert mock_tf_api.return_value.register_transformation_fn.call_count == 1 - - def test_get_transformation_fn(self, mocker): - # Arrange - 
feature_store_id = 99 - - mock_tf_api = mocker.patch( - "hsfs.core.transformation_function_api.TransformationFunctionApi" - ) + @hopsworks_udf(float) + def testFunction2(data2, statistics_data2): + return data2 + 1 - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id + tf2 = transformation_function.TransformationFunction( + feature_store_id, + hopsworks_udf=testFunction2, ) - tf = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf_name" - ) - tf1 = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf1_name" - ) - transformations = [tf, tf1] + transformations = [tf1, tf2] mock_tf_api.return_value.get_transformation_fn.return_value = transformations @@ -207,7 +145,7 @@ def test_get_transformation_fn(self, mocker): # Assert assert mock_tf_api.return_value.get_transformation_fn.call_count == 1 - assert result == tf + assert result == transformations def test_get_transformation_fns(self, mocker): # Arrange @@ -221,13 +159,25 @@ def test_get_transformation_fns(self, mocker): feature_store_id ) - tf = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf_name" - ) + @hopsworks_udf(int) + def testFunction1(col1): + return col1 + 1 + tf1 = transformation_function.TransformationFunction( - feature_store_id, builtin_source_code="", output_type="str", name="tf1_name" + feature_store_id, + hopsworks_udf=testFunction1, + ) + + @hopsworks_udf(float) + def testFunction2(data2, statistics_data2): + return data2 + 1 + + tf2 = transformation_function.TransformationFunction( + feature_store_id, + hopsworks_udf=testFunction2, ) - transformations = [tf, tf1] + + transformations = [tf1, tf2] mock_tf_api.return_value.get_transformation_fn.return_value = transformations @@ -250,1332 +200,169 @@ def test_delete(self, mocker): feature_store_id ) - # Act - tf_engine.delete(transformation_function_instance=None) - - # Assert - assert mock_tf_api.return_value.delete.call_count == 1 - - def test_get_td_transformation_fn(self, mocker): - # Arrange - feature_store_id = 99 - - mock_tf_api = mocker.patch( - "hsfs.core.transformation_function_api.TransformationFunctionApi" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def plus_one(a): - return a + 1 - - tf_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=plus_one - ) - tf1_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf1_name", transformation_function=plus_one - ) - - transformations_attached = [tf_attached, tf1_attached] + @hopsworks_udf(int) + def testFunction1(col1): + return col1 + 1 - mock_tf_api.return_value.get_td_transformation_fn.return_value = ( - transformations_attached + tf1 = transformation_function.TransformationFunction( + feature_store_id, + hopsworks_udf=testFunction1, ) # Act - result = tf_engine.get_td_transformation_fn(training_dataset=None) + tf_engine.delete(transformation_function_instance=tf1) # Assert - assert "tf_name" in result - assert "tf1_name" in result - assert mock_tf_api.return_value.get_td_transformation_fn.call_count == 1 + assert mock_tf_api.return_value.delete.call_count == 1 - def test_attach_transformation_fn_td(self, mocker): + def test_compute_transformation_fn_statistics(self, mocker): # Arrange feature_store_id = 99 
mocker.patch("hsfs.client.get_instance") - mocker.patch("hsfs.constructor.fs_query.FsQuery") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") tf_engine = transformation_function_engine.TransformationFunctionEngine( feature_store_id ) - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["tf1_name"] = tf - td = training_dataset.TrainingDataset( name="test", version=1, data_format="CSV", - featurestore_id=feature_store_id, + featurestore_id=99, splits={}, id=10, - transformation_functions=transformation_fn_dict, ) # Act - with pytest.raises(AttributeError) as e_info: - tf_engine.attach_transformation_fn( - training_dataset_obj=td, feature_view_obj=None - ) + tf_engine.compute_transformation_fn_statistics( + training_dataset_obj=td, + statistics_features=None, + label_encoder_features=None, + feature_dataframe=None, + feature_view_obj=None, + ) # Assert - assert str(e_info.value) == "'TrainingDataset' object has no attribute 'labels'" + assert ( + mock_s_engine.return_value.compute_transformation_fn_statistics.call_count + == 1 + ) - def test_attach_transformation_fn_fv(self, mocker): - # Arrange + def test_compute_and_set_feature_statistics_no_split(self, mocker): feature_store_id = 99 - mocker.patch("hsfs.client.get_instance") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") tf_engine = transformation_function_engine.TransformationFunctionEngine( feature_store_id ) - def testFunction(): - print("Test") + @hopsworks_udf(int) + def testFunction1(col1): + return col1 + 1 - tf = transformation_function.TransformationFunction( + tf1 = transformation_function.TransformationFunction( feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", + hopsworks_udf=testFunction1, ) - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["tf1_name"] = tf - - fv = feature_view.FeatureView( - name="test", - query=query, + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=[], - ) - - # Act - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv - ) - - # Assert - assert len(fv._features) == 2 - assert fv._features[0].name == "tf_name" - assert fv._features[1].name == "tf1_name" - - def test_attach_transformation_fn_fv_self_join(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("label")], + id=11, + stream=False, ) - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["fg1_tf_name"] = tf - - fv = feature_view.FeatureView( + td = training_dataset.TrainingDataset( name="test", - query=query_self_join, + version=1, + data_format="CSV", featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=[], + splits={}, 
+ id=10, ) # Act - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv - ) - - # Assert - assert len(fv._features) == 2 - assert fv._features[0].name == "tf_name" - assert fv._features[1].name == "fg1_tf_name" - - def test_attach_transformation_fn_fv_q_prefix(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["second_tf1_name"] = tf - transformation_fn_dict["third_tf_name"] = tf - transformation_fn_dict["third_tf1_name"] = tf - fv = feature_view.FeatureView( name="test", - query=query_prefix, - featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=[], + featurestore_id=feature_store_id, + query=fg1.select_all(), + transformation_functions=[tf1], ) + dataset = pd.DataFrame() + # Act - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv + tf_engine.compute_and_set_feature_statistics( + training_dataset=td, feature_view_obj=fv, dataset=dataset ) # Assert - assert len(fv._features) == 4 - assert fv._features[0].name == "tf_name" - assert fv._features[1].name == "second_tf1_name" - assert fv._features[2].name == "third_tf_name" - assert fv._features[3].name == "third_tf1_name" + assert ( + mock_s_engine.return_value.compute_transformation_fn_statistics.call_count + == 1 + ) - def test_attach_transformation_fn_fv_q_prefix_fail(self, mocker): - # Arrange + def test_compute_and_set_feature_statistics_train_test_split(self, mocker): feature_store_id = 99 - mocker.patch("hsfs.client.get_instance") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") tf_engine = transformation_function_engine.TransformationFunctionEngine( feature_store_id ) - def testFunction(): - print("Test") - - query_no_prefix = ( - fg1.select_all() - .join(fg2.select(["tf1_name"]), on=["id"]) - .join(fg3.select(["tf_name", "tf1_name"]), on=["id"]) - ) + @hopsworks_udf(int) + def testFunction1(col1): + return col1 + 1 - tf = transformation_function.TransformationFunction( + tf1 = transformation_function.TransformationFunction( feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", + hopsworks_udf=testFunction1, ) - transformation_fn_dict = dict() - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["tf1_name"] = tf - - fv = feature_view.FeatureView( - name="test", - query=query_no_prefix, + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=[], - ) - - # Act - with pytest.raises(FeatureStoreException) as e_info: - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv - ) - - # Assert - assert str(e_info.value) == Query.ERROR_MESSAGE_FEATURE_AMBIGUOUS.format( - "tf_name" - ) - - def test_attach_transformation_fn_fv_labels(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf = 
transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("label")], + id=11, + stream=False, ) - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["tf1_name"] = tf - - fv = feature_view.FeatureView( + td = training_dataset.TrainingDataset( name="test", - query=query, + version=1, + data_format="CSV", featurestore_id=99, - transformation_functions=transformation_fn_dict, - labels=["tf_name"], - ) - - # Act - with pytest.raises(ValueError) as e_info: - tf_engine.attach_transformation_fn( - training_dataset_obj=None, feature_view_obj=fv - ) - - # Assert - assert ( - str(e_info.value) - == "Online transformations for training dataset labels are not supported." + splits={"train": 0.8, "test": 0.2}, + id=10, ) - def test_is_builtin(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id + fv = feature_view.FeatureView( + name="test", + featurestore_id=feature_store_id, + query=fg1.select_all(), + transformation_functions=[tf1], ) - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="tf_name", - version=1, - ) + dataset = pd.DataFrame() # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert - assert result is False - - def test_is_builtin_min_max_scaler(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="min_max_scaler", - version=1, + tf_engine.compute_and_set_feature_statistics( + training_dataset=td, feature_view_obj=fv, dataset=dataset ) - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - # Assert - assert result is True - - def test_is_builtin_min_max_scaler_version(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="min_max_scaler", - version=2, + assert ( + mock_s_engine.return_value.compute_transformation_fn_statistics.call_count + == 1 ) - - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert - assert result is False - - def test_is_builtin_standard_scaler(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="standard_scaler", - version=1, - ) - - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert - assert result is True - - def test_is_builtin_robust_scaler(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="robust_scaler", - version=1, - ) - - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert 
- assert result is True - - def test_is_builtin_label_encoder(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - tf = transformation_function.TransformationFunction( - feature_store_id, - builtin_source_code="", - output_type="str", - name="label_encoder", - version=1, - ) - - # Act - result = tf_engine.is_builtin(transformation_fn_instance=tf) - - # Assert - assert result is True - - def test_populate_builtin_fn_arguments(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def tf_name(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=tf_name, output_type="str" - ) - - # Act - with pytest.raises(ValueError) as e_info: - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert str(e_info.value) == "Not implemented" - - def test_populate_builtin_fn_arguments_min_max_scaler(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.builtin_transformation_function.BuiltInTransformationFunction.min_max_scaler_stats", - return_value=(1, 100), - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def min_max_scaler(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=min_max_scaler, output_type="str" - ) - - # Act - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert tf.transformation_fn.keywords["min_value"] == 1 - assert tf.transformation_fn.keywords["max_value"] == 100 - - def test_populate_builtin_fn_arguments_standard_scaler(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.builtin_transformation_function.BuiltInTransformationFunction.standard_scaler_stats", - return_value=(1, 100), - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def standard_scaler(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=standard_scaler, output_type="str" - ) - - # Act - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert tf.transformation_fn.keywords["mean"] == 1 - assert tf.transformation_fn.keywords["std_dev"] == 100 - - def test_populate_builtin_fn_arguments_robust_scaler(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.builtin_transformation_function.BuiltInTransformationFunction.robust_scaler_stats", - return_value={24: 1, 49: 2, 74: 3}, - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def robust_scaler(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=robust_scaler, output_type="str" - ) - - # Act - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert tf.transformation_fn.keywords["p25"] == 1 - assert tf.transformation_fn.keywords["p50"] == 2 - assert tf.transformation_fn.keywords["p75"] == 3 - 
- def test_populate_builtin_fn_arguments_label_encoder(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.builtin_transformation_function.BuiltInTransformationFunction.encoder_stats", - return_value="test", - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def label_encoder(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, transformation_fn=label_encoder, output_type="str" - ) - - # Act - tf_engine.populate_builtin_fn_arguments( - feature_name=None, - transformation_function_instance=tf, - feature_descriptive_stats=None, - ) - - # Assert - assert tf.transformation_fn.keywords["value_to_index"] == "test" - - def test_populate_builtin_attached_fns(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin", - return_value=False, - ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_fn_arguments" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=testFunction - ) - tf1_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf1_name", transformation_function=testFunction - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf_attached - transformation_fn_dict["tf1_name"] = tf1_attached - - # Act - tf_engine.populate_builtin_attached_fns( - attached_transformation_fns=transformation_fn_dict, - feature_descriptive_stats=None, - ) - - # Assert - assert transformation_fn_dict["tf_name"] == tf_attached - assert transformation_fn_dict["tf1_name"] == tf1_attached - - def test_populate_builtin_attached_fns_is_builtin(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_fn_arguments" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=testFunction - ) - tf1_attached = transformation_function_attached.TransformationFunctionAttached( - name="tf1_name", transformation_function=testFunction - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf_attached - transformation_fn_dict["tf1_name"] = tf1_attached - - # Act - tf_engine.populate_builtin_attached_fns( - attached_transformation_fns=transformation_fn_dict, - feature_descriptive_stats=None, - ) - - # Assert - assert transformation_fn_dict["tf_name"] != tf_attached - assert transformation_fn_dict["tf1_name"] != tf1_attached - - def test_infer_spark_type_string_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(str) - - # Assert - assert result == "STRING" - - def test_infer_spark_type_string_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( 
- feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("str") - - # Assert - assert result == "STRING" - - def test_infer_spark_type_string_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("string") - - # Assert - assert result == "STRING" - - def test_infer_spark_type_byte_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(bytes) - result1 = tf_engine.infer_spark_type("BinaryType()") - - # Assert - assert result == "BINARY" - assert result1 == "BINARY" - - def test_infer_spark_type_int8_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.int8) - - # Assert - assert result == "BYTE" - - def test_infer_spark_type_int8_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("int8") - - # Assert - assert result == "BYTE" - - def test_infer_spark_type_int8_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("byte") - result1 = tf_engine.infer_spark_type("ByteType()") - - # Assert - assert result == "BYTE" - assert result1 == "BYTE" - - def test_infer_spark_type_int16_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.int16) - - # Assert - assert result == "SHORT" - - def test_infer_spark_type_int16_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("int16") - - # Assert - assert result == "SHORT" - - def test_infer_spark_type_int16_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("short") - result1 = tf_engine.infer_spark_type("ShortType()") - - # Assert - assert result == "SHORT" - assert result1 == "SHORT" - - def test_infer_spark_type_int_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(int) - - # Assert - assert result == "INT" - - def test_infer_spark_type_int_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("int") - - # Assert - assert result == "INT" - - def test_infer_spark_type_int_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.int32) - result1 = tf_engine.infer_spark_type("IntegerType()") - - # Assert - assert result == "INT" - assert result1 == "INT" - - def 
test_infer_spark_type_int64_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.int64) - - # Assert - assert result == "LONG" - - def test_infer_spark_type_int64_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("int64") - - # Assert - assert result == "LONG" - - def test_infer_spark_type_int64_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("long") - - # Assert - assert result == "LONG" - - def test_infer_spark_type_int64_type_4(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("bigint") - result1 = tf_engine.infer_spark_type("LongType()") - - # Assert - assert result == "LONG" - assert result1 == "LONG" - - def test_infer_spark_type_float_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(float) - - # Assert - assert result == "FLOAT" - - def test_infer_spark_type_float_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("float") - result1 = tf_engine.infer_spark_type("FloatType()") - - # Assert - assert result == "FLOAT" - assert result1 == "FLOAT" - - def test_infer_spark_type_double_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.float64) - - # Assert - assert result == "DOUBLE" - - def test_infer_spark_type_double_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("float64") - - # Assert - assert result == "DOUBLE" - - def test_infer_spark_type_double_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("double") - result1 = tf_engine.infer_spark_type("DoubleType()") - - # Assert - assert result == "DOUBLE" - assert result1 == "DOUBLE" - - def test_infer_spark_type_timestamp_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(datetime.datetime) - - # Assert - assert result == "TIMESTAMP" - - def test_infer_spark_type_timestamp_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(numpy.datetime64) - result1 = tf_engine.infer_spark_type("TimestampType()") - - # Assert - assert result == "TIMESTAMP" - assert result1 == "TIMESTAMP" - - def test_infer_spark_type_date_type_1(self): - # Arrange - 
feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(datetime.date) - result1 = tf_engine.infer_spark_type("DateType()") - - # Assert - assert result == "DATE" - assert result1 == "DATE" - - def test_infer_spark_type_bool_type_1(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type(bool) - - # Assert - assert result == "BOOLEAN" - - def test_infer_spark_type_bool_type_2(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("boolean") - - # Assert - assert result == "BOOLEAN" - - def test_infer_spark_type_bool_type_3(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - result = tf_engine.infer_spark_type("bool") - result1 = tf_engine.infer_spark_type("BooleanType()") - - # Assert - assert result == "BOOLEAN" - assert result1 == "BOOLEAN" - - def test_infer_spark_type_wrong_type(self): - # Arrange - feature_store_id = 99 - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - # Act - with pytest.raises(TypeError) as e_info: - tf_engine.infer_spark_type("wrong") - - # Assert - assert str(e_info.value) == "Not supported type wrong." - - def test_compute_transformation_fn_statistics(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", - featurestore_id=99, - splits={}, - id=10, - ) - - # Act - tf_engine.compute_transformation_fn_statistics( - training_dataset_obj=td, - builtin_tffn_features=None, - label_encoder_features=None, - feature_dataframe=None, - feature_view_obj=None, - ) - - # Assert - assert ( - mock_s_engine.return_value.compute_transformation_fn_statistics.call_count - == 1 - ) - - def test_populate_builtin_transformation_functions(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) - mock_tf_engine_compute_transformation_fn_statistics = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.compute_transformation_fn_statistics" - ) - mock_tf_engine_populate_builtin_attached_fns = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_attached_fns" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", - ) - - def label_encoder(): - print("Test") - - tf_label_encoder = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=label_encoder, - builtin_source_code="", - output_type="str", - ) - - transformation_fn_dict = dict() - - 
transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["label_encoder"] = tf_label_encoder - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", - featurestore_id=feature_store_id, - splits={}, - id=10, - transformation_functions=transformation_fn_dict, - ) - - dataset = mocker.Mock() - - # Act - tf_engine.populate_builtin_transformation_functions( - training_dataset=td, feature_view_obj=None, dataset=dataset - ) - - # Assert - assert mock_tf_engine_compute_transformation_fn_statistics.call_count == 1 - assert mock_tf_engine_populate_builtin_attached_fns.call_count == 1 - assert dataset.get.call_count == 0 - - def test_populate_builtin_transformation_functions_splits(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.is_builtin" - ) - mock_tf_engine_compute_transformation_fn_statistics = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.compute_transformation_fn_statistics" - ) - mock_tf_engine_populate_builtin_attached_fns = mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_attached_fns" - ) - - tf_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=testFunction, - builtin_source_code="", - output_type="str", - ) - - def label_encoder(): - print("Test") - - tf_label_encoder = transformation_function.TransformationFunction( - feature_store_id, - transformation_fn=label_encoder, - builtin_source_code="", - output_type="str", - ) - - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - transformation_fn_dict["label_encoder"] = tf_label_encoder - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", - featurestore_id=feature_store_id, - splits={"key": "value"}, - id=10, - transformation_functions=transformation_fn_dict, - ) - - dataset = mocker.Mock() - - # Act - tf_engine.populate_builtin_transformation_functions( - training_dataset=td, feature_view_obj=None, dataset=dataset - ) - - # Assert - assert mock_tf_engine_compute_transformation_fn_statistics.call_count == 1 - assert mock_tf_engine_populate_builtin_attached_fns.call_count == 1 - assert dataset.get.call_count == 1 - - # Previously in test_feature_view_engine - def test_get_fv_attached_transformation_fn(self, mocker): - # Arrange - feature_store_id = 99 - mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - td_engine = transformation_function_engine.TransformationFunctionEngine( - feature_store_id=feature_store_id - ) - - def testFunction(): - print("Test") - - tf = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=testFunction - ) - - mock_fv_api.return_value.get_attached_transformation_fn.return_value = tf - - # Act - result = td_engine.get_fv_attached_transformation_fn( - fv_name="fv_name", fv_version=1 - ) - - # Assert - assert "tf_name" in result - assert mock_fv_api.return_value.get_attached_transformation_fn.call_count == 1 - - def test_get_fv_attached_transformation_fn_multiple(self, mocker): - # Arrange - feature_store_id = 99 - - mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - - td_engine = 
transformation_function_engine.TransformationFunctionEngine(
-            feature_store_id=feature_store_id
-        )
-
-        def testFunction():
-            print("Test")
-
-        tf = transformation_function_attached.TransformationFunctionAttached(
-            name="tf_name", transformation_function=testFunction
-        )
-        tf1 = transformation_function_attached.TransformationFunctionAttached(
-            name="tf1_name", transformation_function=testFunction
-        )
-
-        mock_fv_api.return_value.get_attached_transformation_fn.return_value = [tf, tf1]
-
-        # Act
-        result = td_engine.get_fv_attached_transformation_fn(
-            fv_name="fv_name", fv_version=1
-        )
-
-        # Assert
-        assert "tf_name" in result
-        assert "tf1_name" in result
-        assert mock_fv_api.return_value.get_attached_transformation_fn.call_count == 1
diff --git a/python/tests/test_transformation_function.py b/python/tests/test_transformation_function.py
index 0d1f29f346..5fdea2987f 100644
--- a/python/tests/test_transformation_function.py
+++ b/python/tests/test_transformation_function.py
@@ -15,6 +15,9 @@
 #

+import pytest
+from hsfs.client.exceptions import FeatureStoreException
+from hsfs.hopsworks_udf import hopsworks_udf
 from hsfs.transformation_function import TransformationFunction

@@ -168,18 +171,29 @@ def test_from_response_json_list(self, backend_fixtures):
             == "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n"
         )

+    def test_transformation_function_definition_no_hopworks_udf(self):
+        def test(col1):
+            return col1 + 1

-"""
-    def test_from_response_json_basic_info(self, mocker, backend_fixtures):
-        # Arrange
-        json = backend_fixtures["transformation_function"]["get_basic_info"]["response"]
+        with pytest.raises(FeatureStoreException) as exception:
+            TransformationFunction(
+                featurestore_id=10,
+                hopsworks_udf=test,
+            )

-        # Act
-        tf = TransformationFunction.from_response_json(json)
+        assert (
+            str(exception.value)
+            == "Please use the hopsworks_udf decorator when defining transformation functions."
+        )

-        # Assert
-        assert tf.id is None
-        assert tf._featurestore_id == 11
-        assert tf.version is None
-        assert tf.hopsworks_udf is None
-"""
+    def test_transformation_function_definition_with_hopworks_udf(self):
+        @hopsworks_udf(int)
+        def test2(col1):
+            return col1 + 1
+
+        tf = TransformationFunction(
+            featurestore_id=10,
+            hopsworks_udf=test2,
+        )
+
+        assert tf.hopsworks_udf == test2

From 853995a5936df14f5585f0a11fe5bb6e127fff39 Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Fri, 3 May 2024 17:33:29 +0200
Subject: [PATCH 11/58] feature view api formatted

---
 python/hsfs/core/feature_view_api.py | 52 ++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/python/hsfs/core/feature_view_api.py b/python/hsfs/core/feature_view_api.py
index 6ff621c7db..1bc6b46115 100644
--- a/python/hsfs/core/feature_view_api.py
+++ b/python/hsfs/core/feature_view_api.py
@@ -73,13 +73,28 @@ def update(self, feature_view_obj: feature_view.FeatureView) -> None:
             data=feature_view_obj.json(),
         )

-    def get_by_name(self, name: str) -> feature_view.FeatureView:
+    def get_by_name(self, name: str) -> List[feature_view.FeatureView]:
+        """
+        Get a feature view from the backend using its name.
+
+        # Arguments
+            name `str`: Name of the feature view.
+
+        # Returns
+            `List[FeatureView]`: A list that contains all versions of the feature view.
+
+        # Raises
+            `RestAPIError`: If the feature view cannot be found from the backend.
+            `ValueError`: If the feature group associated with the feature view cannot be found.
+ """ path = self._base_path + [name] try: return [ feature_view.FeatureView.from_response_json(fv) for fv in self._client._send_request( - self._GET, path, {"expand": ["query", "features"]} + self._GET, + path, + {"expand": ["query", "features", "transformationfunctions"]}, )["items"] ] except RestAPIError as e: @@ -93,6 +108,20 @@ def get_by_name(self, name: str) -> feature_view.FeatureView: raise e def get_by_name_version(self, name: str, version: int) -> feature_view.FeatureView: + """ + Get a feature view form the backend using both name and version + + # Arguments + name `str`: Name of feature view. + version `version`: Version of the feature view. + + # Returns + `FeatureView` + + # Raises + `RestAPIError`: If the feature view cannot be found from the backend. + `ValueError`: If the feature group associated with the feature view cannot be found. + """ path = self._base_path + [name, self._VERSION, version] try: return feature_view.FeatureView.from_response_json( @@ -179,10 +208,21 @@ def get_serving_prepared_statement( def get_attached_transformation_fn( self, name: str, version: int - ) -> Union[ - "transformation_function.TransformationFunction", - List["transformation_function.TransformationFunction"], - ]: + ) -> List["transformation_function.TransformationFunction"]: + """ + Get transformation functions attached to a feature view form the backend + + # Arguments + name `str`: Name of feature view. + version `ìnt`: Version of feature view. + + # Returns + `List[TransformationFunction]` : List of transformation functions attached to the feature view. + + # Raises + `RestAPIError`: If the feature view cannot be found from the backend. + `ValueError`: If the feature group associated with the feature view cannot be found. + """ path = self._base_path + [name, self._VERSION, version, self._TRANSFORMATION] return transformation_function.TransformationFunction.from_response_json( self._client._send_request("GET", path) From b4a37afe2201a7f0959835507450d92d38af8c23 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Sat, 4 May 2024 13:40:04 +0200 Subject: [PATCH 12/58] reformatting and fixing feature_view_engine --- python/hsfs/core/feature_view_engine.py | 79 ++++++++++++--- python/tests/core/test_feature_view_engine.py | 96 ++++++++++++------- 2 files changed, 126 insertions(+), 49 deletions(-) diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index e954701d8e..3305c0e209 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -17,7 +17,7 @@ import datetime import warnings -from typing import Optional +from typing import TYPE_CHECKING, List, Optional, Union from hsfs import ( client, @@ -37,11 +37,15 @@ statistics_engine, tags_api, training_dataset_engine, - transformation_function_engine, ) from hsfs.training_dataset_split import TrainingDatasetSplit +if TYPE_CHECKING: + from hsfs.feature_view import FeatureView + from hsfs.transformation_function import TransformationFunction + + class FeatureViewEngine: ENTITY_TYPE = "featureview" _TRAINING_DATA_API_PATH = "trainingdatasets" @@ -53,11 +57,6 @@ def __init__(self, feature_store_id): self._feature_view_api = feature_view_api.FeatureViewApi(feature_store_id) self._tags_api = tags_api.TagsApi(feature_store_id, self.ENTITY_TYPE) - self._transformation_function_engine = ( - transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - ) self._td_code_engine = code_engine.CodeEngine( feature_store_id, self._TRAINING_DATA_API_PATH ) 
@@ -69,7 +68,16 @@ def __init__(self, feature_store_id):
         )
         self._query_constructor_api = query_constructor_api.QueryConstructorApi()

-    def save(self, feature_view_obj):
+    def save(self, feature_view_obj: FeatureView) -> FeatureView:
+        """
+        Save a feature view to the backend.
+
+        # Arguments
+            feature_view_obj `FeatureView` : The feature view object to be saved.
+
+        # Returns
+            `FeatureView` : Updated feature view that has the ID used to save in the backend.
+        """
         if feature_view_obj.query.is_time_travel():
             warnings.warn(
                 "`as_of` argument in the `Query` will be ignored because"
@@ -120,8 +128,6 @@ def save(self, feature_view_obj):
                 )
             )

-        # TODO : Remove this code portion attaches a transfromation function to a feature. This is not possible with the current implementation
-
         updated_fv = self._feature_view_api.post(feature_view_obj)
         print(
             "Feature view created successfully, explore it at \n"
@@ -129,11 +135,38 @@ def save(self, feature_view_obj):
         )
         return updated_fv

-    def update(self, feature_view_obj):
+    def update(self, feature_view_obj: FeatureView) -> FeatureView:
+        """
+        Update the feature view object saved in the backend.
+
+        # Arguments
+            feature_view_obj `FeatureView` : The feature view object to be updated.
+
+        # Returns
+            `FeatureView` : Updated feature view that has the ID used to save in the backend.
+        """
         self._feature_view_api.update(feature_view_obj)
         return feature_view_obj

-    def get(self, name, version=None):
+    def get(
+        self, name: str, version: int = None
+    ) -> Union[FeatureView, List[FeatureView]]:
+        """
+        Get a feature view from the backend using name or using name and version.
+
+        If version is not provided then a list of feature views containing all of its versions is returned.
+
+        # Arguments
+            name `str`: Name of feature view.
+            version `int`: Version of the feature view.
+
+        # Returns
+            `Union[FeatureView, List[FeatureView]]`
+
+        # Raises
+            `RestAPIError`: If the feature view cannot be found from the backend.
+            `ValueError`: If the feature group associated with the feature view cannot be found.
+        """
         if version:
             fv = self._feature_view_api.get_by_name_version(name, version)
         else:
             fv = self._feature_view_api.get_by_name(name)
@@ -232,6 +265,28 @@ def get_batch_query_string(
             return fs_query.pit_query
         return fs_query.query

+    def get_attached_transformation_fn(
+        self, name: str, version: int
+    ) -> List[TransformationFunction]:
+        """
+        Get transformation functions attached to a feature view from the backend.
+
+        # Arguments
+            name `str`: Name of feature view.
+            version `int`: Version of feature view.
+
+        # Returns
+            `List[TransformationFunction]` : List of transformation functions attached to the feature view.
+
+        # Raises
+            `RestAPIError`: If the feature view cannot be found from the backend.
+            `ValueError`: If the feature group associated with the feature view cannot be found.
+ """ + transformation_functions = ( + self._feature_view_api.get_attached_transformation_fn(name, version) + ) + return transformation_functions + def create_training_dataset( self, feature_view_obj, diff --git a/python/tests/core/test_feature_view_engine.py b/python/tests/core/test_feature_view_engine.py index d8410aa21e..e50868285d 100644 --- a/python/tests/core/test_feature_view_engine.py +++ b/python/tests/core/test_feature_view_engine.py @@ -23,14 +23,15 @@ feature_view, split_statistics, training_dataset, - transformation_function_attached, ) from hsfs.client.exceptions import FeatureStoreException from hsfs.constructor import fs_query from hsfs.constructor.query import Query from hsfs.core import arrow_flight_client, feature_view_engine from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics +from hsfs.hopsworks_udf import hopsworks_udf from hsfs.storage_connector import BigQueryConnector, StorageConnector +from hsfs.transformation_function import TransformationFunction engine.init("python") @@ -95,9 +96,6 @@ def test_save(self, mocker): "hsfs.core.feature_view_engine.FeatureViewEngine._get_feature_view_url", return_value=feature_view_url, ) - mock_attach_transformation = mocker.patch( - "hsfs.core.feature_view_engine.FeatureViewEngine.attach_transformation_function", - ) mock_print = mocker.patch("builtins.print") fv_engine = feature_view_engine.FeatureViewEngine( @@ -113,7 +111,6 @@ def test_save(self, mocker): # Assert assert mock_fv_api.return_value.post.call_count == 1 - assert mock_attach_transformation.call_count == 1 assert mock_print.call_count == 1 assert mock_print.call_args[0][ 0 @@ -353,10 +350,7 @@ def test_get_name(self, mocker): mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.get_fv_attached_transformation_fn" - ) - mock_attach_transformation = mocker.patch( - "hsfs.core.feature_view_engine.FeatureViewEngine.attach_transformation_function", + "hsfs.core.feature_view_engine.FeatureViewEngine.get_attached_transformation_fn" ) fv_engine = feature_view_engine.FeatureViewEngine( @@ -385,7 +379,6 @@ def test_get_name(self, mocker): # Assert assert mock_fv_api.return_value.get_by_name_version.call_count == 0 - assert mock_attach_transformation.call_count == 2 assert mock_fv_api.return_value.get_by_name.call_count == 1 assert len(result) == 2 @@ -395,10 +388,7 @@ def test_get_name_version(self, mocker): mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.get_fv_attached_transformation_fn" - ) - mock_attach_transformation = mocker.patch( - "hsfs.core.feature_view_engine.FeatureViewEngine.attach_transformation_function", + "hsfs.core.feature_view_engine.FeatureViewEngine.get_attached_transformation_fn" ) fv_engine = feature_view_engine.FeatureViewEngine( @@ -420,7 +410,6 @@ def test_get_name_version(self, mocker): # Assert assert mock_fv_api.return_value.get_by_name_version.call_count == 1 - assert mock_attach_transformation.call_count == 1 assert mock_fv_api.return_value.get_by_name.call_count == 0 def test_delete_name(self, mocker): @@ -566,40 +555,73 @@ def test_get_batch_query_string_pit_query(self, mocker): assert mock_fv_api.return_value.get_batch_query.call_count == 1 assert mock_qc_api.return_value.construct_query.call_count == 1 - def test_attach_transformation_function(self, mocker): - def testFunction(): - print("Test") + def 
test_get_attached_transformation_fn(self, mocker): + # Arrange + feature_store_id = 99 - tf = transformation_function_attached.TransformationFunctionAttached( - name="tf_name", transformation_function=testFunction + mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") + + fv_engine = feature_view_engine.FeatureViewEngine( + feature_store_id=feature_store_id ) - mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.get_fv_attached_transformation_fn", - return_value={"label": tf}, + + @hopsworks_udf(int) + def test2(col1): + return col1 + 1 + + tf = TransformationFunction( + featurestore_id=10, + hopsworks_udf=test2, ) + + mock_fv_api.return_value.get_attached_transformation_fn.return_value = [tf] + + # Act + result = fv_engine.get_attached_transformation_fn(name="fv_name", version=1) + + # Assert + assert result == [tf] + assert mock_fv_api.return_value.get_attached_transformation_fn.call_count == 1 + + def test_get_attached_transformation_fn_multiple(self, mocker): + # Arrange feature_store_id = 99 + + mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") + fv_engine = feature_view_engine.FeatureViewEngine( feature_store_id=feature_store_id ) - fv = feature_view.FeatureView( - name="fv_name", - version=1, - query=query, - featurestore_id=feature_store_id, + + @hopsworks_udf(int) + def test1(col1): + return col1 + 1 + + tf1 = TransformationFunction( + featurestore_id=10, + hopsworks_udf=test1, + ) + + @hopsworks_udf(int) + def test2(col1): + return col1 + 2 + + tf2 = TransformationFunction( + featurestore_id=10, + hopsworks_udf=test2, ) - fv.schema = query.features + + mock_fv_api.return_value.get_attached_transformation_fn.return_value = [ + tf1, + tf2, + ] # Act - fv_engine.attach_transformation_function(fv) + result = fv_engine.get_attached_transformation_fn(name="fv_name", version=1) # Assert - id_feature = fv.schema[0] - label_feature = fv.schema[1] - assert id_feature.name == "id" - assert id_feature.transformation_function is None - assert label_feature.name == "label" - assert label_feature.transformation_function == tf + assert result == [tf1, tf2] + assert mock_fv_api.return_value.get_attached_transformation_fn.call_count == 1 def test_create_training_dataset(self, mocker): # Arrange From 2a6250074f8befa8f686f9f17b49607215c8411a Mon Sep 17 00:00:00 2001 From: manu-sj Date: Sat, 4 May 2024 14:43:49 +0200 Subject: [PATCH 13/58] reformatted and added unit tests for feature view --- python/hsfs/feature_view.py | 61 ++++++++++----- .../tests/fixtures/feature_view_fixtures.json | 4 +- .../transformation_function_fixtures.json | 7 +- python/tests/test_feature_view.py | 75 +++++++++++++------ 4 files changed, 95 insertions(+), 52 deletions(-) diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 386e3b256f..837bc168c2 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -123,19 +123,14 @@ def __init__( training_helper_columns if training_helper_columns else [] ) - # TODO : Clean this up - if transformation_functions: - for i, transformation_function in enumerate(transformation_functions): - if not isinstance(transformation_function, TransformationFunction): - transformation_functions[i] = TransformationFunction( - self.featurestore_id, - hopsworks_udf=transformation_function, - version=1, - ) - - self._transformation_functions: List[TransformationFunction] = ( - transformation_functions - ) + 
self._transformation_functions: List[TransformationFunction] = [
+            TransformationFunction(
+                self.featurestore_id, hopsworks_udf=transformation_function, version=1
+            )
+            if not isinstance(transformation_function, TransformationFunction)
+            else transformation_function
+            for transformation_function in transformation_functions
+        ]

         self._features = []
         self._feature_view_engine: feature_view_engine.FeatureViewEngine = (
@@ -3396,6 +3391,14 @@ def create_feature_monitoring(

     @classmethod
     def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView":
+        """
+        Function that constructs the class object from its json serialization.
+
+        # Arguments
+            json_dict: `Dict[str, Any]`. Json serialized dictionary for the class.
+        # Returns
+            `FeatureView`: Json deserialized class object.
+        """
         json_decamelized = humps.decamelize(json_dict)

         serving_keys = json_decamelized.get("serving_keys", None)
@@ -3403,6 +3406,7 @@ def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView":
             serving_keys = [
                 skm.ServingKey.from_response_json(sk) for sk in serving_keys
             ]
+        transformation_functions = json_decamelized.get("transformation_functions", {})
         fv = cls(
             id=json_decamelized.get("id", None),
             name=json_decamelized["name"],
             query=Query.from_response_json(json_decamelized["query"]),
             featurestore_id=json_decamelized["featurestore_id"],
             version=json_decamelized.get("version", None),
             description=json_decamelized.get("description", None),
             featurestore_name=json_decamelized.get("featurestore_name", None),
             serving_keys=serving_keys,
-            transformation_functions=[
-                TransformationFunction.from_response_json(transformation)
-                for transformation in json_decamelized.get(
-                    "transformation_functions", []
-                )
-            ],
+            transformation_functions=TransformationFunction.from_response_json(
+                transformation_functions
+            )
+            if transformation_functions
+            else [],
         )
         features = json_decamelized.get("features", [])
         if features:
@@ -3439,6 +3442,14 @@
         return fv

     def update_from_response_json(self, json_dict: Dict[str, Any]) -> "FeatureView":
+        """
+        Function that updates the class object from its json serialization.
+
+        # Arguments
+            json_dict: `Dict[str, Any]`. Json serialized dictionary for the class.
+        # Returns
+            `FeatureView`: Json deserialized class object.
+        """
         other = self.from_response_json(json_dict)
         for key in [
             "name",
@@ -3480,9 +3491,21 @@ def _init_feature_monitoring_engine(self) -> None:
         )

     def json(self) -> str:
+        """
+        Convert class into its json serialized form.
+
+        # Returns
+            `str`: Json serialized object.
+        """
         return json.dumps(self, cls=util.FeatureStoreEncoder)

     def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert class into a dictionary.
+
+        # Returns
+            `Dict`: Dictionary that contains all data required to json serialize the object.
+ """ return { "featurestoreId": self._featurestore_id, "name": self._name, diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index aabf2bf9f6..92601b46da 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -159,9 +159,7 @@ "id": 11, "version": 1, "description": "test_description", - "transformation_functions": { - "featurestore_id": 5 - }, + "transformation_functions": {}, "features": [ { "name": "intt", diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 98017a07c5..5b8e753508 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ b/python/tests/fixtures/transformation_function_fixtures.json @@ -51,14 +51,9 @@ } } }, - "get_basic_info": { - "response": { - "featurestore_id": 11 - } - }, "get_list": { "response": { - "count": 1, + "count": 2, "items": [ { "id" : 1, diff --git a/python/tests/test_feature_view.py b/python/tests/test_feature_view.py index 25a1cc6fbe..e8e36c0f1e 100644 --- a/python/tests/test_feature_view.py +++ b/python/tests/test_feature_view.py @@ -15,9 +15,10 @@ # import warnings -from hsfs import feature_view, training_dataset_feature, transformation_function +from hsfs import feature_view, training_dataset_feature from hsfs.constructor import fs_query, query from hsfs.feature_store import FeatureStore +from hsfs.hopsworks_udf import hopsworks_udf class TestFeatureView: @@ -32,7 +33,6 @@ def test_from_response_json(self, mocker, backend_fixtures): mocker.patch("hsfs.engine.get_type") mocker.patch("hsfs.core.feature_store_api.FeatureStoreApi.get") json = backend_fixtures["feature_view"]["get"]["response"] - # Act fv = feature_view.FeatureView.from_response_json(json) @@ -44,7 +44,7 @@ def test_from_response_json(self, mocker, backend_fixtures): assert fv.version == 1 assert fv.description == "test_description" assert fv.labels == ["intt"] - assert fv.transformation_functions == {} + assert fv.transformation_functions == [] assert len(fv.schema) == 2 assert isinstance(fv.schema[0], training_dataset_feature.TrainingDatasetFeature) @@ -65,10 +65,50 @@ def test_from_response_json_basic_info(self, mocker, backend_fixtures): assert fv.version is None assert fv.description is None assert fv.labels == [] - assert fv.transformation_functions == {} + assert fv.transformation_functions == [] assert len(fv.schema) == 0 assert fv.query._left_feature_group.deprecated is False + def test_from_response_json_transformation_function(self, mocker, backend_fixtures): + # Arrange + mocker.patch.object( + FeatureStore, + "project_id", + return_value=99, + ) + mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.engine.get_type") + mocker.patch("hsfs.core.feature_store_api.FeatureStoreApi.get") + json = backend_fixtures["feature_view"]["get_transformations"]["response"] + # Act + fv = feature_view.FeatureView.from_response_json(json) + + # Assert + assert fv.name == "test_name" + assert fv.id == 11 + assert isinstance(fv.query, query.Query) + assert fv.featurestore_id == 5 + assert fv.version == 1 + assert fv.description == "test_description" + assert fv.labels == ["intt"] + assert len(fv.transformation_functions) == 2 + assert ( + fv.transformation_functions[0].hopsworks_udf.function_name == "add_mean_fs" + ) + assert ( + fv.transformation_functions[1].hopsworks_udf.function_name == "add_one_fs" + ) + assert ( + 
fv.transformation_functions[0].hopsworks_udf._function_source
+            == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n"
+        )
+        assert (
+            fv.transformation_functions[1].hopsworks_udf._function_source
+            == "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n"
+        )
+        assert len(fv.schema) == 2
+        assert isinstance(fv.schema[0], training_dataset_feature.TrainingDatasetFeature)

     def test_from_response_json_basic_info_deprecated(self, mocker, backend_fixtures):
         # Arrange
         mocker.patch("hsfs.engine.get_type")
@@ -87,7 +127,7 @@ def test_from_response_json_basic_info_deprecated(self, mocker, backend_fixtures
         assert fv.version is None
         assert fv.description is None
         assert fv.labels == []
-        assert fv.transformation_functions == {}
+        assert fv.transformation_functions == []
         assert len(fv.schema) == 0
         assert fv.query._left_feature_group.deprecated is True
         assert len(warning_record) == 1
@@ -104,31 +144,18 @@ def test_transformation_function_instances(self, mocker, backend_fixtures):
         # Act
         q = fs_query.FsQuery.from_response_json(json)

-        def testFunction():
-            print("Test")
-
-        tf = transformation_function.TransformationFunction(
-            feature_store_id,
-            transformation_fn=testFunction,
-            builtin_source_code="",
-            output_type="str",
-        )
-
-        transformation_fn_dict = dict()
-        transformation_fn_dict["tf_name"] = tf
-        transformation_fn_dict["tf1_name"] = tf
+        @hopsworks_udf(int)
+        def test(col1):
+            return col1 + 1

         fv = feature_view.FeatureView(
             featurestore_id=feature_store_id,
             name="test_fv",
             version=1,
             query=q,
-            transformation_functions=transformation_fn_dict,
+            transformation_functions=[test("data1"), test("data2")],
         )

-        updated_transformation_fn_dict = fv.transformation_functions
+        transformation_functions = fv.transformation_functions

-        assert (
-            updated_transformation_fn_dict["tf_name"]
-            != updated_transformation_fn_dict["tf1_name"]
-        )
+        assert transformation_functions[0] != transformation_functions[1]

From 35d72dc298c1d6816bdec74f87d279f1f852ee0f Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Sat, 4 May 2024 15:09:24 +0200
Subject: [PATCH 14/58] updating documentation for feature store

---
 python/hsfs/feature_store.py | 64 +++++++++++-------------------------
 1 file changed, 20 insertions(+), 44 deletions(-)

diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py
index 24033bf11b..e2ee0f9cc9 100644
--- a/python/hsfs/feature_store.py
+++ b/python/hsfs/feature_store.py
@@ -22,7 +22,6 @@

 import great_expectations as ge
 import humps
-import numpy
 import numpy as np
 import pandas as pd
 import polars as pl
@@ -1283,35 +1282,20 @@ def create_training_dataset(
     def create_transformation_function(
         self,
         transformation_function: callable,
-        output_type: Union[
-            str,
-            bytes,
-            int,
-            numpy.int8,
-            numpy.int16,
-            numpy.int32,
-            numpy.int64,
-            float,
-            numpy.float64,
-            datetime.datetime,
-            numpy.datetime64,
-            datetime.date,
-            bool,
-        ],
         version: Optional[int] = None,
     ) -> "TransformationFunction":
         """Create a transformation function metadata object.

         !!! example
             ```python
-            # define function
+            # define the transformation function as a Hopsworks UDF
+            @hopsworks_udf(int)
             def plus_one(value):
                 return value + 1

             # create transformation function
             plus_one_meta = fs.create_transformation_function(
                 transformation_function=plus_one,
-                output_type=int,
                 version=1
             )

             # persist transformation function in backend
             plus_one_meta.save()
             ```
@@ -1325,8 +1309,7 @@ def plus_one(value):
             call the `save()` method of the transformation function metadata object.
        # Arguments
-            transformation_function: callable object.
-            output_type: python or numpy output type that will be inferred as pyspark.sql.types type.
+            transformation_function: Hopsworks UDF.

        # Returns:
            `TransformationFunction`: The TransformationFunction metadata object.
@@ -1334,7 +1317,6 @@ def plus_one(value):
         return TransformationFunction(
             featurestore_id=self._id,
             transformation_fn=transformation_function,
-            output_type=output_type,
             version=version,
         )

@@ -1392,9 +1374,7 @@ def get_transformation_function(
                 name='feature_view_name',
                 query=query,
                 labels=["target_column"],
-                transformation_functions={
-                    "column_to_transform": min_max_scaler
-                }
+                transformation_functions=[min_max_scaler("feature1")]
             )
             ```
@@ -1421,12 +1401,12 @@ def get_transformation_function(
                 name='transactions_view',
                 query=query,
                 labels=["fraud_label"],
-                transformation_functions = {
-                    "category_column": label_encoder,
-                    "weight": robust_scaler,
-                    "age": min_max_scaler,
-                    "salary": standard_scaler
-                }
+                transformation_functions = [
+                    label_encoder("category_column"),
+                    robust_scaler("weight"),
+                    min_max_scaler("age"),
+                    standard_scaler("salary")
+                ]
             )
             ```
@@ -1486,11 +1466,13 @@ def create_feature_view(
             # construct the query
             query = fg1.select_all().join(fg2.select_all())

-            # get the transformation functions
-            standard_scaler = fs.get_transformation_function(name='standard_scaler')
+            # define the transformation function as a Hopsworks UDF
+            @hopsworks_udf(int)
+            def plus_one(value):
+                return value + 1

-            # construct dictionary of "feature - transformation function" pairs
-            transformation_functions = {col_name: standard_scaler for col_name in df.columns}
+            # construct list of "transformation functions" on features
+            transformation_functions = {plus_one("feature1"), plus_one("feature2")}

             feature_view = fs.create_feature_view(
                 name='air_quality_fv',
@@ -1508,7 +1490,7 @@ def create_feature_view(
             # define query object
             query = ...

-            # define dictionary with column names and transformation functions pairs
+            # define list of transformation functions
             mapping_transformers = ...

             # create feature view
@@ -1554,10 +1536,7 @@ def create_feature_view(
                 Training helper columns can be optionally fetched with training data. For more details see
                 documentation for feature view's get training data methods. Defaults to `[]`, no training helper columns.
-            transformation_functions: A dictionary mapping tansformation functions to
-                to the features they should be applied to before writing out the
-                vector and at inference time. Defaults to `{}`, no
-                transformations.
+            transformation_functions: A list of Hopsworks UDFs. Defaults to `None`, no transformations.

        # Returns:
            `FeatureView`: The feature view metadata object.
@@ -1632,10 +1611,7 @@ def get_or_create_feature_view(
                 Training helper columns can be optionally fetched with training data. For more details see
                 documentation for feature view's get training data methods. Defaults to `[]`, no training helper columns.
-            transformation_functions: A dictionary mapping tansformation functions to
-                to the features they should be applied to before writing out the
-                vector and at inference time. Defaults to `{}`, no
-                transformations.
+            transformation_functions: A list of Hopsworks UDFs. Defaults to `None`, no transformations.

        # Returns:
            `FeatureView`: The feature view metadata object.
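[Editor's note] The docstring changes above describe the new decorator-based flow in pieces; the following minimal end-to-end sketch ties them together. It is illustrative only and not part of the patch series: `fs` is assumed to be an existing feature store handle, `query` an existing `Query` object, and the feature name "pm25" is a placeholder.

```python
# Illustrative sketch (not part of the patch), assuming `fs` and `query` exist.
import pandas as pd
from hsfs.hopsworks_udf import hopsworks_udf

# A UDF is declared once with its return type...
@hopsworks_udf(float)
def add_one(data1: pd.Series):
    return data1 + 1

# ...and bound to concrete features by calling it. A list of bound UDFs
# replaces the old {feature_name: transformation_function} dictionary.
feature_view = fs.create_feature_view(
    name="air_quality_fv",
    version=1,
    query=query,
    transformation_functions=[add_one("pm25")],  # "pm25" is a placeholder
)
```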
@@ -1655,7 +1631,7 @@ def get_or_create_feature_view(
                 labels=labels or [],
                 inference_helper_columns=inference_helper_columns or [],
                 training_helper_columns=training_helper_columns or [],
-                transformation_functions=transformation_functions or {},
+                transformation_functions=transformation_functions or [],
             )
         else:
             raise e

From 7ca35fda44cb1b4c9a80943d5cf95421d9338671 Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Sat, 4 May 2024 15:11:10 +0200
Subject: [PATCH 15/58] updating documentation for feature store

---
 python/hsfs/feature_store.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py
index e2ee0f9cc9..10f6a269bc 100644
--- a/python/hsfs/feature_store.py
+++ b/python/hsfs/feature_store.py
@@ -1472,7 +1472,7 @@ def plus_one(value):
                 return value + 1

             # construct list of "transformation functions" on features
-            transformation_functions = {plus_one("feature1"), plus_one("feature2")}
+            transformation_functions = [plus_one("feature1"), plus_one("feature2")]

             feature_view = fs.create_feature_view(
                 name='air_quality_fv',

From 5e377e6ddcdc74947eadfae6cce876479005f560 Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Sun, 5 May 2024 17:02:24 +0200
Subject: [PATCH 16/58] fixed tests for training dataset features

---
 .../training_dataset_feature_fixtures.json    | 21 +------------------
 python/tests/test_training_dataset_feature.py |  8 +------
 2 files changed, 2 insertions(+), 27 deletions(-)

diff --git a/python/tests/fixtures/training_dataset_feature_fixtures.json b/python/tests/fixtures/training_dataset_feature_fixtures.json
index 19a958b970..f48fd0fabd 100644
--- a/python/tests/fixtures/training_dataset_feature_fixtures.json
+++ b/python/tests/fixtures/training_dataset_feature_fixtures.json
@@ -62,26 +62,7 @@
       "timeTravelFormat": "HUDI"
     },
     "feature_group_feature_name": "test_feature_group_feature_name",
-    "label": "test_label",
-    "transformation_function": {
-      "count": 1,
-      "items": [
-        {
-          "featurestore_id": 11,
-          "transformation_fn": null,
-          "version": 1,
-          "name": "test_name",
-          "source_code_content": "test_source_code_content",
-          "builtin_source_code": "test_builtin_source_code",
-          "output_type": "float",
-          "id": 43,
-          "type": "transformationFunctionTDO",
-          "items": [],
-          "count": 0,
-          "href": "test_href"
-        }
-      ]
-    }
+    "label": "test_label"
   }
 },
 "get_fraud_online_training_dataset_features": {
diff --git a/python/tests/test_training_dataset_feature.py b/python/tests/test_training_dataset_feature.py
index 62a30aca5a..dc5af26112 100644
--- a/python/tests/test_training_dataset_feature.py
+++ b/python/tests/test_training_dataset_feature.py
@@ -15,7 +15,7 @@
 #

-from hsfs import feature_group, training_dataset_feature, transformation_function
+from hsfs import feature_group, training_dataset_feature


 class TestTrainingDatasetFeature:
@@ -37,11 +37,6 @@ def test_from_response_json(self, backend_fixtures):
             td_feature._feature_group_feature_name == "test_feature_group_feature_name"
         )
         assert td_feature.label == "test_label"
-        assert len(td_feature.transformation_function) == 1
-        assert isinstance(
-            td_feature.transformation_function[0],
-            transformation_function.TransformationFunction,
-        )

     def test_from_response_json_basic_info(self, backend_fixtures):
         # Arrange
@@ -61,4 +56,3 @@ def test_from_response_json_basic_info(self, backend_fixtures):
         assert td_feature._feature_group is None
         assert td_feature._feature_group_feature_name is None
         assert td_feature.label is False
-        assert td_feature.transformation_function is None

From 
fa1203224b105a31df1c68034e0ac4ab7ebe9b9d Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 6 May 2024 09:14:10 +0200 Subject: [PATCH 17/58] reformatted and added unit tests for python engine --- python/hsfs/__init__.py | 9 ++ python/hsfs/core/feature_view_engine.py | 9 +- .../core/transformation_function_engine.py | 129 +++++---------- python/hsfs/engine/python.py | 153 +++++++++++------- python/hsfs/feature_view.py | 22 ++- python/hsfs/hopsworks_udf.py | 7 +- python/hsfs/transformation_function.py | 8 +- .../test_transformation_function_engine.py | 106 ++++++++++++ python/tests/engine/test_python.py | 92 +++++------ 9 files changed, 323 insertions(+), 212 deletions(-) diff --git a/python/hsfs/__init__.py b/python/hsfs/__init__.py index 31efe17c56..d0297cb25e 100644 --- a/python/hsfs/__init__.py +++ b/python/hsfs/__init__.py @@ -19,8 +19,17 @@ import warnings import nest_asyncio +from packaging.version import Version +try: + import pandas as pd + + if Version(pd.__version__) > Version(2.0): + os.environ["USE_PYARROW_EXTENSION"] = "1" +except ImportError: + pass # Empty except block because environment variable "USE_PYARROW_EXTENSION" need not be set if pyarrow cannot be imported or if pandas version is less than 2.0 + # Setting polars skip cpu flag to suppress CPU false positive warning messages printed while importing hsfs os.environ["POLARS_SKIP_CPU_CHECK"] = "1" diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index 3305c0e209..491be2c95e 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -395,7 +395,12 @@ def get_training_data( spine=spine, ) split_df = engine.get_instance().get_training_data( - td_updated, feature_view_obj, query, read_options, dataframe_type + td_updated, + feature_view_obj, + query, + read_options, + dataframe_type, + training_dataset_version, ) self.compute_training_dataset_statistics( feature_view_obj, td_updated, split_df @@ -720,7 +725,6 @@ def _get_training_dataset_metadata( ) # schema and transformation functions need to be set for writing training data or feature serving td.schema = feature_view_obj.schema - td.transformation_functions = feature_view_obj.transformation_functions return td def _get_training_datasets_metadata(self, feature_view_obj): @@ -730,7 +734,6 @@ def _get_training_datasets_metadata(self, feature_view_obj): # schema and transformation functions need to be set for writing training data or feature serving for td in tds: td.schema = feature_view_obj.schema - td.transformation_functions = feature_view_obj.transformation_functions return tds def get_training_datasets(self, feature_view_obj): diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 0ad86f0c53..89808b3db1 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union from hsfs import training_dataset -from hsfs.core import statistics_api, transformation_function_api +from hsfs.core import transformation_function_api if TYPE_CHECKING: @@ -63,13 +63,6 @@ def __init__(self, feature_store_id: int): self._transformation_function_api: transformation_function_api.TransformationFunctionApi = transformation_function_api.TransformationFunctionApi( feature_store_id ) - self._statistics_api: statistics_api.StatisticsApi = ( - statistics_api.StatisticsApi( - feature_store_id, 
training_dataset.TrainingDataset.ENTITY_TYPE
-            )
-        )
-        self._feature_view_api: Optional["feature_view_api.FeatureViewApi"] = None
-        self._statistics_engine: Optional["statistics_engine.StatisticsEngine"] = None

     def save(
         self, transformation_fn_instance: TransformationFunction
@@ -213,92 +206,46 @@ def compute_and_set_feature_statistics(
             stats.feature_descriptive_statistics
         )

-    def get_ready_to_use_transformation_fns(
-        self,
-        entity: Union[hsfs.feature_view.FeatureView, training_dataset.TrainingDataset],
-        training_dataset_version: Optional[int] = None,
-    ) -> Dict[
-        str, hsfs.transformation_function_attached.TransformationFunctionAttached
-    ]:
-        is_feat_view = isinstance(entity, feature_view.FeatureView)
-        if self._feature_view_api is None:
-            self._feature_view_api = feature_view_api.FeatureViewApi(
-                self._feature_store_id
-            )
-        if self._statistics_engine is None:
-            self._statistics_engine = statistics_engine.StatisticsEngine(
-                self._feature_store_id,
-                entity_type="featureview" if is_feat_view else "trainingdataset",
-            )
-        # get attached transformation functions
-        transformation_functions = (
-            self.get_td_transformation_fn(entity)
-            if isinstance(entity, training_dataset.TrainingDataset)
-            else (self.get_fv_attached_transformation_fn(entity.name, entity.version))
-        )
-        is_stat_required = (
-            len(
-                set(self.BUILTIN_FN_NAMES).intersection(
-                    set([tf.name for tf in transformation_functions.values()])
-                )
-            )
-            > 0
+    @staticmethod
+    def get_and_set_feature_statistics(
+        training_dataset: training_dataset.TrainingDataset,
+        feature_view_obj: FeatureView,
+        training_dataset_version: int = None,
+    ) -> None:
+        """
+        Function that retrieves from the backend the transformation statistics computed while creating the training dataset and assigns them to the hopsworks UDF objects.
+
+        The function assigns the retrieved statistics to the hopsworks UDF object so that they can be used when the UDF is executed.
+
+        # Arguments
+            training_dataset `TrainingDataset`: The training dataset for which the statistics are to be retrieved.
+            feature_view_obj `FeatureView`: The feature view in which the training data is being created.
+            training_dataset_version `int`: The version of the training dataset for which the statistics are to be retrieved.
+
+        # Raises
+            `ValueError` : If the statistics are not present in the backend.
+        """
+
+        is_stat_required = any(
+            [
+                tf.hopsworks_udf.statistics_required
+                for tf in feature_view_obj.transformation_functions
+            ]
         )
-        if not is_stat_required:
-            td_tffn_stats = None
-        else:
-            # if there are any built-in transformation functions get related statistics and
-            # populate with relevant arguments
-            # there should be only one statistics object with before_transformation=true
-            if is_feat_view and training_dataset_version is None:
-                raise ValueError(
-                    "Training data version is required for transformation. Call `feature_view.init_serving(version)` "
-                    "or `feature_view.init_batch_scoring(version)` to pass the training dataset version."
-                    "Training data can be created by `feature_view.create_training_data` or `feature_view.training_data`."
-                )
-            td_tffn_stats = self._statistics_engine.get(
-                entity,
+
+        if is_stat_required:
+            td_tffn_stats = training_dataset._statistics_engine.get(
+                feature_view_obj,
                 before_transformation=True,
                 training_dataset_version=training_dataset_version,
             )
-        if is_stat_required and td_tffn_stats is None:
-            raise ValueError(
-                "No statistics available for initializing transformation functions."
- + "Training data can be created by `feature_view.create_training_data` or `feature_view.training_data`." - ) - - transformation_fns = self.populate_builtin_attached_fns( - transformation_functions, - td_tffn_stats.feature_descriptive_statistics - if td_tffn_stats is not None - else None, - ) - return transformation_fns + if td_tffn_stats is None: + raise ValueError( + "No statistics available for initializing transformation functions." + ) - def get_fv_attached_transformation_fn( - self, fv_name: str, fv_version: int - ) -> Dict[str, "transformation_function_attached.TransformationFunctionAttached"]: - if self._feature_view_api is None: - self._feature_view_api = feature_view_api.FeatureViewApi( - self._feature_store_id - ) - self._statistics_engine = statistics_engine.StatisticsEngine( - self._feature_store_id, - entity_type="featureview", - ) - transformation_functions = ( - self._feature_view_api.get_attached_transformation_fn(fv_name, fv_version) - ) - if isinstance(transformation_functions, list): - transformation_functions_dict = dict( - [ - (tf.name, tf.transformation_function) - for tf in transformation_functions - ] - ) - else: - transformation_functions_dict = { - transformation_functions.name: transformation_functions.transformation_function - } - return transformation_functions_dict + for transformation_function in feature_view_obj.transformation_functions: + transformation_function.hopsworks_udf.transformation_statistics = ( + td_tffn_stats.feature_descriptive_statistics + ) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 9754b96997..42814ab079 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -878,7 +878,22 @@ def get_training_data( query_obj: query.Query, read_options: Dict[str, Any], dataframe_type: str, + training_dataset_version: int = None, ) -> Union[pd.DataFrame, pl.DataFrame]: + """ + Function that creates or retrieves already created the training dataset. + + # Arguments + training_dataset_obj `TrainingDataset`: The training dataset metadata object. + feature_view_obj `FeatureView`: The feature view object for the which the training data is being created. + query_obj `Query`: The query object that contains the query used to create the feature view. + read_options `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for reading data. + dataframe_type `str`: The type of dataframe returned. + training_dataset_version `int`: Version of training data to be retrieved. + # Raises + `ValueError`: If the training dataset statistics could not be retrieved. + """ + # dataframe_type of list and numpy are prevented here because statistics needs to be computed from the returned dataframe. 
+        # The dataframe is converted into the required type in the function split_labels
         if dataframe_type.lower() not in ["default", "polars", "pandas"]:
@@ -891,15 +906,20 @@ def get_training_data(
                 feature_view_obj,
                 read_options,
                 dataframe_type,
+                training_dataset_version,
             )
         else:
             df = query_obj.read(
                 read_options=read_options, dataframe_type=dataframe_type
             )
-            # TODO : Add statistics
-            transformation_function_engine.TransformationFunctionEngine.add_feature_statistics(
-                training_dataset_obj, feature_view_obj, df
-            )
+            if training_dataset_version is None:
+                transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics(
+                    training_dataset_obj, feature_view_obj, df
+                )
+            else:
+                transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics(
+                    training_dataset_obj, feature_view_obj, training_dataset_version
+                )
             return self._apply_transformation_function(
                 training_dataset_obj.transformation_functions, df
             )
@@ -934,10 +954,21 @@ def _prepare_transform_split_df(
         feature_view_obj: feature_view.FeatureView,
         read_option: Dict[str, Any],
         dataframe_type: str,
+        training_dataset_version: int = None,
     ) -> Dict[str, Union[pd.DataFrame, pl.DataFrame]]:
         """
         Split a df into slices defined by `splits`. `splits` is a `dict(str, int)` whose keys are the split names and whose values are the split ratios.
+
+        # Arguments
+            query_obj `Query`: The query object that contains the query used to create the feature view.
+            training_dataset_obj `TrainingDataset`: The training dataset metadata object.
+            feature_view_obj `FeatureView`: The feature view object for which the training data is being created.
+            read_option `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for reading data.
+            dataframe_type `str`: The type of dataframe returned.
+            training_dataset_version `int`: Version of training data to be retrieved.
+        # Raises
+            `ValueError`: If the training dataset statistics could not be retrieved.
         """
         if (
             training_dataset_obj.splits[0].split_type
@@ -970,11 +1001,14 @@ def _prepare_transform_split_df(
                 training_dataset_obj,
             )

-        # apply transformations
-        # 1st parametrise transformation functions with dt split stats
-        transformation_function_engine.TransformationFunctionEngine.add_feature_statistics(
-            training_dataset_obj, feature_view_obj, result_dfs
-        )
+        if training_dataset_version is None:
+            transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics(
+                training_dataset_obj, feature_view_obj, result_dfs
+            )
+        else:
+            transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics(
+                training_dataset_obj, feature_view_obj, training_dataset_version
+            )
         # and then apply them
         for split_name in result_dfs:
             result_dfs[split_name] = self._apply_transformation_function(
@@ -1153,8 +1187,24 @@ def _create_hive_connection(

     def _return_dataframe_type(
         self, dataframe: Union[pd.DataFrame, pl.DataFrame], dataframe_type: str
     ) -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[List[Any]]]:
-        if dataframe_type.lower() in ["default", "pandas", "polars"]:
+        """
+        Returns a dataframe of a particular type.
+
+        # Arguments
+            dataframe `Union[pd.DataFrame, pl.DataFrame]`: Input dataframe.
+            dataframe_type `str`: Type of dataframe to be returned.
+        # Returns
+            `Union[pd.DataFrame, pl.DataFrame, np.array, list]`: DataFrame of the required type.
+ """ + if dataframe_type.lower() in ["default", "pandas"]: return dataframe + if dataframe_type.lower() == "polars": + if not ( + isinstance(dataframe, pl.DataFrame) or isinstance(dataframe, pl.Series) + ): + return pl.from_pandas(dataframe) + else: + return dataframe if dataframe_type.lower() == "numpy": return dataframe.values if dataframe_type.lower() == "python": @@ -1235,66 +1285,55 @@ def _apply_transformation_function( transformation_functions: List[TransformationFunction], dataset: Union[pd.DataFrame, pl.DataFrame], ) -> Union[pd.DataFrame, pl.DataFrame]: + """ + Apply transformation function to the dataframe. + + # Arguments + transformation_functions `List[TransformationFunction]` : List of transformation functions. + dataset `Union[pd.DataFrame, pl.DataFrame]`: A pandas or polars dataframe. + # Raises + `FeatureStoreException`: If any of the features mentioned in the transformation function is not present in the Feature View. + """ transformed_features = set() + + if isinstance(dataset, pl.DataFrame) or isinstance( + dataset, pl.dataframe.frame.DataFrame + ): + # Converting polars dataframe to pandas because currently we support only pandas UDF's as transformation functions. + if os.getenv("USE_PYARROW_EXTENSION", False): + dataset = dataset.to_pandas( + use_pyarrow_extension_array=True + ) # Zero copy if pyarrow extension can be used. + else: + dataset = dataset.to_pandas(use_pyarrow_extension_array=False) + for transformation_function in transformation_functions: hopsworks_udf = transformation_function.hopsworks_udf missing_features = set(hopsworks_udf.transformation_features) - set( dataset.columns ) - - # TODO : Add documentation link in exception if missing_features: raise FeatureStoreException( - f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly. Refer .." + f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." ) transformed_features.update( transformation_function.hopsworks_udf.transformation_features ) - - if isinstance(dataset, pl.DataFrame) or isinstance( - dataset, pl.dataframe.frame.DataFrame - ): - pass - else: - dataset = pd.concat( - [ - dataset, - transformation_function.hopsworks_udf.get_udf()( - *( - [ - dataset[feature] - for feature in transformation_function.hopsworks_udf.transformation_features - ] - ) - ), - ], - axis=1, - ) - # TODO : Think about what to do in cases where the output is a polars dataframe..... 
- # if isinstance(dataset, pl.DataFrame) or isinstance( - # dataset, pl.dataframe.frame.DataFrame - # ): - # dataset = dataset.with_columns( - # pl.col(feature_name).map_elements( - # transformation_fn.transformation_fn - # ) - # ) - # else: - - # TODO : Think if below code is actually required - - # The below functions is not required for Polars since polars does have object types like pandas - # if not ( - # isinstance(dataset, pl.DataFrame) - # or isinstance(dataset, pl.dataframe.frame.DataFrame) - # ): - # offline_type = Engine.convert_spark_type_to_offline_type( - # transformation_fn.output_type - # ) - # dataset[feature_name] = Engine._cast_column_to_offline_type( - # dataset[feature_name], offline_type - # ) + dataset = pd.concat( + [ + dataset, + transformation_function.hopsworks_udf.get_udf()( + *( + [ + dataset[feature] + for feature in transformation_function.hopsworks_udf.transformation_features + ] + ) + ), + ], + axis=1, + ) dataset = dataset.drop(transformed_features, axis=1) return dataset diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 837bc168c2..7c8a914dd4 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -123,14 +123,20 @@ def __init__( training_helper_columns if training_helper_columns else [] ) - self._transformation_functions: List[TransformationFunction] = [ - TransformationFunction( - self.featurestore_id, hopsworks_udf=transformation_function, version=1 - ) - if not isinstance(transformation_function, TransformationFunction) - else transformation_function - for transformation_function in transformation_functions - ] + self._transformation_functions: List[TransformationFunction] = ( + [ + TransformationFunction( + self.featurestore_id, + hopsworks_udf=transformation_function, + version=1, + ) + if not isinstance(transformation_function, TransformationFunction) + else transformation_function + for transformation_function in transformation_functions + ] + if transformation_functions + else [] + ) self._features = [] self._feature_view_engine: feature_view_engine.FeatureViewEngine = ( diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index b56efb2c5a..34edaf4a64 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -473,6 +473,7 @@ def __call__(self, *features: List[str]) -> "HopsworksUdf": self._transformation_features, features ) ] + udf.output_column_names = udf._get_output_column_names() return udf def get_udf(self) -> Callable: @@ -497,7 +498,7 @@ def get_udf(self) -> Callable: def to_dict(self) -> Dict[str, Any]: """ - Convert class into a dictionary for json serialization. + Convert class into a dictionary. # Returns `Dict`: Dictionary that contains all data required to json serialize the object. @@ -515,7 +516,7 @@ def to_dict(self) -> Dict[str, Any]: def json(self) -> str: """ - Json serialize object. + Convert class into its json serialized form. # Returns `str`: Json serialized object. @@ -527,7 +528,7 @@ def from_response_json( cls: "HopsworksUdf", json_dict: Dict[str, Any] ) -> "HopsworksUdf": """ - Function that deserializes json obtained from the java backend. + Function that constructs the class object from its json serialization. # Arguments json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. 
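[Editor's note] The `__call__` and `get_udf` changes above are easier to follow with a small, concrete sketch of how they interact in the python engine. This is illustrative only, not part of the patch; the output column name shown follows the pattern asserted in the updated engine tests.

```python
# Illustrative sketch (not part of the patch) of the __call__ / get_udf pair.
import pandas as pd
from hsfs import engine
from hsfs.hopsworks_udf import hopsworks_udf

engine.init("python")  # get_udf() returns a plain callable on this engine

@hopsworks_udf(int)
def plus_one(col1):
    return col1 + 1

# Calling the UDF returns a copy bound to the given feature and, per the
# __call__ change above, refreshes its output column names.
bound = plus_one("tf_name")

df = pd.DataFrame({"tf_name": [1, 2]})
transformed = bound.get_udf()(df["tf_name"])
# Per the updated engine tests, the result column is named "plus_one_tf_name_".
```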
diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index 0b209bf5c4..4e23853c73 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -137,7 +137,7 @@ def from_response_json( cls, json_dict: Dict[str, Any] ) -> Union[TransformationFunction, List[TransformationFunction]]: """ - Function that deserializes json obtained from the java backend. + Function that constructs the class object from its json serialization. # Arguments json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. @@ -166,7 +166,7 @@ def update_from_response_json( self, json_dict: Dict[str, Any] ) -> TransformationFunction: """ - Function that updates class based on the response obtained from the java backend. + Function that updates the class object from its json serialization. # Arguments json_dict: `Dict[str, Any]`. Json serialized dictionary for the class. @@ -179,7 +179,7 @@ def update_from_response_json( def json(self) -> str: """ - Json serialize object. + Convert class into its json serialized form. # Returns `str`: Json serialized object. @@ -188,7 +188,7 @@ def json(self) -> str: def to_dict(self) -> Dict[str, Any]: """ - Convert class into a dictionary for json serialization. + Convert class into a dictionary. # Returns `Dict`: Dictionary that contains all data required to json serialize the object. diff --git a/python/tests/core/test_transformation_function_engine.py b/python/tests/core/test_transformation_function_engine.py index ff3c4f4f85..29e20f3cac 100644 --- a/python/tests/core/test_transformation_function_engine.py +++ b/python/tests/core/test_transformation_function_engine.py @@ -366,3 +366,109 @@ def testFunction1(col1): mock_s_engine.return_value.compute_transformation_fn_statistics.call_count == 1 ) + + def test_get_and_set_feature_statistics_no_statistics_required(self, mocker): + feature_store_id = 99 + mocker.patch("hsfs.client.get_instance") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") + + tf_engine = transformation_function_engine.TransformationFunctionEngine( + feature_store_id + ) + + @hopsworks_udf(int) + def testFunction1(col1): + return col1 + 1 + + tf1 = transformation_function.TransformationFunction( + feature_store_id, + hopsworks_udf=testFunction1, + ) + + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("label")], + id=11, + stream=False, + ) + + td = training_dataset.TrainingDataset( + name="test", + version=1, + data_format="CSV", + featurestore_id=99, + splits={"train": 0.8, "test": 0.2}, + id=10, + ) + + fv = feature_view.FeatureView( + name="test", + featurestore_id=feature_store_id, + query=fg1.select_all(), + transformation_functions=[tf1], + ) + + # Act + tf_engine.get_and_set_feature_statistics( + training_dataset=td, feature_view_obj=fv, training_dataset_version=1 + ) + + # Assert + assert mock_s_engine.return_value.get.call_count == 0 + + def test_get_and_set_feature_statistics_statistics_required(self, mocker): + feature_store_id = 99 + mocker.patch("hsfs.client.get_instance") + mock_s_engine = mocker.patch("hsfs.core.statistics_engine.StatisticsEngine") + + tf_engine = transformation_function_engine.TransformationFunctionEngine( + feature_store_id + ) + + @hopsworks_udf(int) + def testFunction1(col1, statistics_col1): + return col1 + statistics_col1.mean + + tf1 = transformation_function.TransformationFunction( + 
feature_store_id, + hopsworks_udf=testFunction1, + ) + + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("label")], + id=11, + stream=False, + ) + + td = training_dataset.TrainingDataset( + name="test", + version=1, + data_format="CSV", + featurestore_id=99, + splits={"train": 0.8, "test": 0.2}, + id=10, + ) + + fv = feature_view.FeatureView( + name="test", + featurestore_id=feature_store_id, + query=fg1.select_all(), + transformation_functions=[tf1], + ) + + # Act + tf_engine.get_and_set_feature_statistics( + training_dataset=td, feature_view_obj=fv, training_dataset_version=1 + ) + + # Assert + assert mock_s_engine.return_value.get.call_count == 1 diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index 08bc8d52a7..88ff95a34b 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -23,12 +23,12 @@ import pytest from confluent_kafka.admin import PartitionMetadata, TopicMetadata from hsfs import ( + engine, feature, feature_group, feature_view, storage_connector, training_dataset, - transformation_function, util, ) from hsfs.client import exceptions @@ -36,10 +36,14 @@ from hsfs.constructor.hudi_feature_group_alias import HudiFeatureGroupAlias from hsfs.core import inode, job from hsfs.engine import python +from hsfs.hopsworks_udf import hopsworks_udf from hsfs.training_dataset_feature import TrainingDatasetFeature from polars.testing import assert_frame_equal as polars_assert_frame_equal +engine._engine_type = "python" + + class TestPython: def test_sql(self, mocker): # Arrange @@ -2423,7 +2427,7 @@ def test_split_labels_labels_dataframe_type_polars(self): result_df, result_df_split = python_engine.split_labels( df=df, dataframe_type="polars", labels="col1" ) - print(type(result_df_split)) + # Assert assert isinstance(result_df, pl.DataFrame) or isinstance( result_df, pl.dataframe.frame.DataFrame @@ -3233,41 +3237,39 @@ def test_apply_transformation_function_pandas(self, mocker): python_engine = python.Engine() - def plus_one(a): - return a + 1 + @hopsworks_udf(int) + def plus_one(col1): + return col1 + 1 - tf = transformation_function.TransformationFunction( - 99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="int", + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, ) - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), featurestore_id=99, - splits={}, - id=10, - transformation_functions=transformation_fn_dict, + transformation_functions=[plus_one("tf_name")], ) df = pd.DataFrame(data={"tf_name": [1, 2]}) # Act result = python_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, dataset=df + transformation_functions=fv.transformation_functions, dataset=df ) # Assert - assert len(result["tf_name"]) == 2 - assert result["tf_name"][0] == 2 - assert result["tf_name"][1] == 3 + assert len(result["plus_one_tf_name_"]) == 2 + assert result["plus_one_tf_name_"][0] == 2 + assert result["plus_one_tf_name_"][1] == 3 def test_apply_transformation_function_polars(self, mocker): # Arrange @@ 
-3275,41 +3277,39 @@ def test_apply_transformation_function_polars(self, mocker): python_engine = python.Engine() - def plus_one(a): - return a + 1 + @hopsworks_udf(int) + def plus_one(col1): + return col1 + 1 - tf = transformation_function.TransformationFunction( - 99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="int", + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, ) - transformation_fn_dict = dict() - - transformation_fn_dict["tf_name"] = tf - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), featurestore_id=99, - splits={}, - id=10, - transformation_functions=transformation_fn_dict, + transformation_functions=[plus_one("tf_name")], ) df = pl.DataFrame(data={"tf_name": [1, 2]}) # Act result = python_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, dataset=df + transformation_functions=fv.transformation_functions, dataset=df ) # Assert - assert len(result["tf_name"]) == 2 - assert result["tf_name"][0] == 2 - assert result["tf_name"][1] == 3 + assert len(result["plus_one_tf_name_"]) == 2 + assert result["plus_one_tf_name_"][0] == 2 + assert result["plus_one_tf_name_"][1] == 3 def test_get_unique_values(self): # Arrange From f79a3492010df0f475b97eae3732cca1cb8f3811 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 6 May 2024 22:48:14 +0200 Subject: [PATCH 18/58] most unit tests fixed --- python/hsfs/__init__.py | 2 +- python/hsfs/core/feature_view_engine.py | 4 +- .../hsfs/core/transformation_function_api.py | 26 -- .../core/transformation_function_engine.py | 52 +-- python/hsfs/core/vector_server.py | 8 +- python/hsfs/engine/python.py | 18 +- python/hsfs/engine/spark.py | 156 +++++---- python/hsfs/hopsworks_udf.py | 44 ++- .../hsfs/transformation_function_attached.py | 71 ---- python/pyproject.toml | 1 + python/tests/core/test_arrow_flight_client.py | 3 - python/tests/core/test_feature_view_engine.py | 1 - .../core/test_training_dataset_engine.py | 7 +- .../test_transformation_function_engine.py | 4 +- python/tests/engine/test_python.py | 90 ++++- python/tests/engine/test_spark.py | 324 ++++++++++++++---- python/tests/fixtures/backend_fixtures.py | 1 - python/tests/pyproject.toml | 8 + .../test_transformation_function_attached.py | 88 ----- 19 files changed, 539 insertions(+), 369 deletions(-) delete mode 100644 python/hsfs/transformation_function_attached.py delete mode 100644 python/tests/test_transformation_function_attached.py diff --git a/python/hsfs/__init__.py b/python/hsfs/__init__.py index d0297cb25e..82d368d243 100644 --- a/python/hsfs/__init__.py +++ b/python/hsfs/__init__.py @@ -25,7 +25,7 @@ try: import pandas as pd - if Version(pd.__version__) > Version(2.0): + if Version(pd.__version__) > Version("2.0"): os.environ["USE_PYARROW_EXTENSION"] = "1" except ImportError: pass # Empty except block because environment variable "USE_PYARROW_EXTENSION" need not be set if pyarrow cannot be imported or if pandas version is less than 2.0 diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index 491be2c95e..19ea348b97 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -723,7 +723,7 @@ def _get_training_dataset_metadata( td = 
self._feature_view_api.get_training_dataset_by_version( feature_view_obj.name, feature_view_obj.version, training_dataset_version ) - # schema and transformation functions need to be set for writing training data or feature serving + # schema needs to be set for writing training data or feature serving td.schema = feature_view_obj.schema return td @@ -731,7 +731,7 @@ def _get_training_datasets_metadata(self, feature_view_obj): tds = self._feature_view_api.get_training_datasets( feature_view_obj.name, feature_view_obj.version ) - # schema and transformation functions need to be set for writing training data or feature serving + # schema needs to be set for writing training data or feature serving for td in tds: td.schema = feature_view_obj.schema return tds diff --git a/python/hsfs/core/transformation_function_api.py b/python/hsfs/core/transformation_function_api.py index a0f21f0097..f6692f8f62 100644 --- a/python/hsfs/core/transformation_function_api.py +++ b/python/hsfs/core/transformation_function_api.py @@ -19,9 +19,7 @@ from hsfs import ( client, - training_dataset, transformation_function, - transformation_function_attached, ) @@ -112,27 +110,3 @@ def delete( ] headers = {"content-type": "application/json"} _client._send_request("DELETE", path_params, headers=headers) - - def get_td_transformation_fn( - self, training_dataset_instance: training_dataset.TrainingDataset - ) -> transformation_function_attached.TransformationFunctionAttached: - """ - Retrieve TransformationFunctionAttached instance - Args: - training_dataset_instance: TrainingDataset, required - training dataset metadata object. - """ - _client = client.get_instance() - path_params = [ - "project", - _client._project_id, - "featurestores", - self._feature_store_id, - "trainingdatasets", - training_dataset_instance.id, - "transformationfunctions", - ] - - return transformation_function_attached.TransformationFunctionAttached.from_response_json( - _client._send_request("GET", path_params) - ) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 89808b3db1..2396cb1a03 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -180,31 +180,35 @@ def compute_and_set_feature_statistics( statistics_features.update( transformation_function.hopsworks_udf.statistics_features ) + if statistics_features: + # compute statistics on training data + if training_dataset.splits: + # compute statistics before transformations are applied + stats = ( + TransformationFunctionEngine.compute_transformation_fn_statistics( + training_dataset, + list(statistics_features), + [], + dataset.get(training_dataset.train_split), + feature_view_obj, + ) + ) + else: + stats = ( + TransformationFunctionEngine.compute_transformation_fn_statistics( + training_dataset, + list(statistics_features), + [], + dataset, + feature_view_obj, + ) + ) - # compute statistics on training data - if training_dataset.splits: - # compute statistics before transformations are applied - stats = TransformationFunctionEngine.compute_transformation_fn_statistics( - training_dataset, - list(statistics_features), - [], - dataset.get(training_dataset.train_split), - feature_view_obj, - ) - else: - stats = TransformationFunctionEngine.compute_transformation_fn_statistics( - training_dataset, - list(statistics_features), - [], - dataset, - feature_view_obj, - ) - - # Set statistics computed in the hopsworks UDF - for transformation_function in 
feature_view_obj.transformation_functions: - transformation_function.hopsworks_udf.transformation_statistics = ( - stats.feature_descriptive_statistics - ) + # Set statistics computed in the hopsworks UDF + for transformation_function in feature_view_obj.transformation_functions: + transformation_function.hopsworks_udf.transformation_statistics = ( + stats.feature_descriptive_statistics + ) @staticmethod def get_and_set_feature_statistics( diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 2ed6d8688f..c6cd5959bd 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -103,7 +103,7 @@ def __init__( self._inference_helper_col_name = [ feat.name for feat in features if feat.inference_helper_column ] - self._transformed_feature_vector_col_name = None + self._transformed_feature_vector_col_name: List[str] = None self._skip_fg_ids = skip_fg_ids or set() self._serving_keys = serving_keys or [] @@ -1077,9 +1077,9 @@ def default_client(self, default_client: Literal["rest", "sql"]): def transformed_feature_vector_col_name(self): if self._transformed_feature_vector_col_name is None: + self._transformed_feature_vector_col_name = self._feature_vector_col_name for transformation_function in self._transformation_functions: - self._transformed_feature_vector_col_name = ( - self._feature_vector_col_name - + transformation_function.hopsworks_udf.transformation_feature_names + self._transformed_feature_vector_col_name += ( + transformation_function.hopsworks_udf.transformation_features ) return self._transformed_feature_vector_col_name \ No newline at end of file diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 42814ab079..6d213f7778 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -912,14 +912,14 @@ def get_training_data( df = query_obj.read( read_options=read_options, dataframe_type=dataframe_type ) - if training_dataset_version is None: - transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( - training_dataset_obj, feature_view_obj, df - ) - else: - transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( - training_dataset_obj, feature_view_obj, training_dataset_version - ) + # if training_dataset_version is None: + transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( + training_dataset_obj, feature_view_obj, df + ) + # else: + # transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( + # training_dataset_obj, feature_view_obj, training_dataset_version + # ) return self._apply_transformation_function( training_dataset_obj.transformation_functions, df ) @@ -1291,6 +1291,8 @@ def _apply_transformation_function( # Arguments transformation_functions `List[TransformationFunction]` : List of transformation functions. dataset `Union[pd.DataFrame, pl.DataFrame]`: A pandas or polars dataframe. + # Returns + `DataFrame`: A pandas dataframe with the transformed data. # Raises `FeatureStoreException`: If any of the features mentioned in the transformation function is not present in the Feature View. 
""" diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 38867ea81e..f1f6fcb69a 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -23,7 +23,7 @@ import shutil import warnings from datetime import date, datetime, timezone -from typing import Any, List, Optional, TypeVar, Union, TYPE_CHECKING +from typing import Any, List, Optional, TypeVar, Union, TYPE_CHECKING, Dict import avro import numpy as np @@ -31,6 +31,9 @@ import tzlocal if TYPE_CHECKING: + from hsfs.constructor.query import Query + from hsfs.feature_view import FeatureView + from hsfs.training_dataset import TrainingDataset from hsfs.transformation_function import TransformationFunction # in case importing in %%local @@ -545,12 +548,26 @@ def _online_fg_to_avro(self, feature_group, dataframe): def get_training_data( self, - training_dataset, - feature_view_obj, - query_obj, - read_options, - dataframe_type, + training_dataset: TrainingDataset, + feature_view_obj: FeatureView, + query_obj: Query, + read_options: Dict[str, Any], + dataframe_type: str, + training_dataset_version: int = None, ): + """ + Function that creates or retrieves already created the training dataset. + + # Arguments + training_dataset_obj `TrainingDataset`: The training dataset metadata object. + feature_view_obj `FeatureView`: The feature view object for the which the training data is being created. + query_obj `Query`: The query object that contains the query used to create the feature view. + read_options `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for reading data. + dataframe_type `str`: The type of dataframe returned. + training_dataset_version `int`: Version of training data to be retrieved. + # Raises + `ValueError`: If the training dataset statistics could not be retrieved. + """ return self.write_training_dataset( training_dataset, query_obj, @@ -559,6 +576,7 @@ def get_training_data( read_options=read_options, to_df=True, feature_view_obj=feature_view_obj, + training_dataset_version=training_dataset_version, ) def split_labels(self, df, labels, dataframe_type): @@ -581,14 +599,30 @@ def drop_columns(self, df, drop_cols): def write_training_dataset( self, - training_dataset, - query_obj, - user_write_options, - save_mode, - read_options=None, - feature_view_obj=None, - to_df=False, + training_dataset: TrainingDataset, + query_obj: Query, + user_write_options: Dict[str, Any], + save_mode: str, + read_options: Dict[str, Any] = None, + feature_view_obj: FeatureView = None, + to_df: bool = False, + training_dataset_version: Optional[int] = None, ): + """ + Function that creates or retrieves already created the training dataset. + + # Arguments + training_dataset `TrainingDataset`: The training dataset metadata object. + query_obj `Query`: The query object that contains the query used to create the feature view. + user_write_options `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for writing data using spark. + save_mode `str`: Spark save mode to be used while writing data. + read_options `Dict[str, Any]`: Dictionary that can be used to specify extra parameters for reading data. + feature_view_obj `FeatureView`: The feature view object for the which the training data is being created. + to_df `bool`: Return dataframe instead of writing the data. + training_dataset_version `Optional[int]`: Version of training data to be retrieved. + # Raises + `ValueError`: If the training dataset statistics could not be retrieved. 
+ """ write_options = self.write_options( training_dataset.data_format, user_write_options ) @@ -603,14 +637,20 @@ def write_training_dataset( else: raise ValueError("Dataset should be a query.") - transformation_function_engine.TransformationFunctionEngine.add_feature_statistics( + # if training_dataset_version is None: + transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( training_dataset, feature_view_obj, dataset ) + # else: + # transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( + # training_dataset, feature_view_obj, training_dataset_version + # ) + if training_dataset.coalesce: dataset = dataset.coalesce(1) path = training_dataset.location + "/" + training_dataset.name return self._write_training_dataset_single( - training_dataset.transformation_functions, + feature_view_obj.transformation_functions, dataset, training_dataset.storage_connector, training_dataset.data_format, @@ -629,11 +669,22 @@ def write_training_dataset( split_dataset[key] = split_dataset[key].cache() - transformation_function_engine.TransformationFunctionEngine.add_feature_statistics( - training_dataset, feature_view_obj, split_dataset - ) + if training_dataset_version is None: + transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( + training_dataset, feature_view_obj, split_dataset + ) + else: + transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( + training_dataset, feature_view_obj, training_dataset_version + ) + return self._write_training_dataset_splits( - training_dataset, split_dataset, write_options, save_mode, to_df=to_df + training_dataset, + split_dataset, + write_options, + save_mode, + to_df=to_df, + transformation_functions=feature_view_obj.transformation_functions, ) def _split_df(self, query_obj, training_dataset, read_options=None): @@ -785,11 +836,12 @@ def _write_training_dataset_splits( write_options, save_mode, to_df=False, + transformation_functions: List[TransformationFunction] = None, ): for split_name, feature_dataframe in feature_dataframes.items(): split_path = training_dataset.location + "/" + str(split_name) feature_dataframes[split_name] = self._write_training_dataset_single( - training_dataset.transformation_functions, + transformation_functions, feature_dataframe, training_dataset.storage_connector, training_dataset.data_format, @@ -1166,9 +1218,19 @@ def add_cols_to_delta_table(self, feature_group, new_features): ).save(feature_group.location) def _apply_transformation_function( - self, transformation_functions: List[TransformationFunction], dataset + self, transformation_functions: List[TransformationFunction], dataset: DataFrame ): - # generate transformation function expressions + """ + Apply transformation function to the dataframe. + + # Arguments + transformation_functions `List[TransformationFunction]` : List of transformation functions. + dataset `Union[DataFrame]`: A spark dataframe. + # Returns + `DataFrame`: A spark dataframe with the transformed data. + # Raises + `FeatureStoreException`: If any of the features mentioned in the transformation function is not present in the Feature View. 
+ """ transformed_features = set() transformations = [] transformation_features = [] @@ -1180,62 +1242,32 @@ def _apply_transformation_function( dataset.columns ) - # TODO : Add documentation link in exception if missing_features: raise FeatureStoreException( - f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly. Refer .." + f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." ) transformed_features.update( transformation_function.hopsworks_udf.transformation_features ) - # TODO : Add statistics pandas_udf = hopsworks_udf.get_udf() - output_col_name = f'{hopsworks_udf.function_name}<{"-".join(hopsworks_udf.transformation_features)}>' + output_col_name = hopsworks_udf.output_column_names[0] + transformations.append(pandas_udf) - transformation_features.append(hopsworks_udf.transformation_features) output_col_names.append(output_col_name) + transformation_features.append(hopsworks_udf.transformation_features) - if isinstance(hopsworks_udf.return_type, List): + if len(hopsworks_udf.output_types) > 1: explode_name.append(f"{output_col_name}.*") else: explode_name.append(output_col_name) - def timezone_decorator(func, trans_fn=hopsworks_udf): - if trans_fn.output_type != "TIMESTAMP": - return func - - current_timezone = tzlocal.get_localzone() - - def decorated_func(x): - result = func(x) - if isinstance(result, datetime): - if result.tzinfo is None: - # if timestamp is timezone unaware, make sure it's localized to the system's timezone. - # otherwise, spark will implicitly convert it to the system's timezone. - return result.replace(tzinfo=current_timezone) - else: - # convert to utc, then localize to system's timezone - return result.astimezone(timezone.utc).replace( - tzinfo=current_timezone - ) - return result - - return decorated_func - - # TODO : Timezone aware check see if I need to do also. - # self._spark_session.udf.register( - # fn_registration_name, - # timezone_decorator(transformation_fn.transformation_fn), - # transformation_fn.output_type, - # ) - - # generate non transformation expressions - - # generate entire expression and execute it - - untransformed_columns = set(dataset.columns) - transformed_features + untransformed_columns = [] # Untransformed column maintained as a list since order is imported while selecting features. + for column in dataset.columns: + if column not in transformed_features: + untransformed_columns.append(column) + # Applying transformations transformed_dataset = dataset.select( *untransformed_columns, *[ diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 34edaf4a64..554a3de9fd 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -175,7 +175,7 @@ def _validate_and_convert_output_types( and output_type not in HopsworksUdf.PYTHON_SPARK_TYPE_MAPPING.values() ): raise FeatureStoreException( - f"Output type {output_type} is not supported. Please refer to DOCUMENTATION to get more information on the supported types." + f"Output type {output_type} is not supported. Please refer to the documentation to get more information on the supported types." 
                )
                convert_output_types.append(
                    output_type
@@ -364,7 +364,7 @@ def _format_source_code(
         source_code = source_code.split("\n")
         # Reconstruct the modified function as a string
         modified_source = (
-            new_signature + "\n" + "\n\t".join(source_code[signature_end_line + 1 :])
+            new_signature + "\n\t" + "\n\t".join(source_code[signature_end_line + 1 :])
         )

         # Define a new function with the modified source code
@@ -377,13 +377,13 @@ def _get_output_column_names(self) -> str:
         # Returns
             `List[str]`: List of feature names for the transformed columns
         """
+        _BASE_COLUMN_NAME = (
+            f'{self.function_name}_{"-".join(self.transformation_features)}_'
+        )
         if len(self.output_types) > 1:
-            return [
-                f'{self.function_name}_{"-".join(self.transformation_features)}_{i}'
-                for i in range(len(self.output_types))
-            ]
+            return [f"{_BASE_COLUMN_NAME}{i}" for i in range(len(self.output_types))]
         else:
-            return [f'{self.function_name}_{"-".join(self.transformation_features)}_']
+            return [f"{_BASE_COLUMN_NAME}"]

     def _create_pandas_udf_return_schema_from_list(self) -> str:
         """
@@ -395,7 +395,7 @@
         if len(self.output_types) > 1:
             return ", ".join(
                 [
-                    f"{self.output_column_names[i]} {self.output_types[i]}"
+                    f"`{self.output_column_names[i]}` {self.output_types[i]}"
                     for i in range(len(self.output_types))
                 ]
             )
@@ -412,20 +412,40 @@ def hopsworksUdf_wrapper(self) -> Callable:
         # Returns
             `Callable`: A wrapper function that renames outputs of the User defined function into specified output column names.
         """
+
+        # Function to make the transformation function timezone safe. Defined as a string because it has to be dynamically injected into scope to be executed by spark.
+        convert_timestamp_function = """def convert_timezone(date_time_col : pd.Series):
+    import tzlocal
+    current_timezone = tzlocal.get_localzone()
+    if date_time_col.dt.tz is None:
+        # if timestamp is timezone unaware, make sure it's localized to the system's timezone.
+        # otherwise, spark will implicitly convert it to the system's timezone.
+        return date_time_col.dt.tz_localize(str(current_timezone))
+    else:
+        # convert to utc, then localize to system's timezone
+        return date_time_col.dt.tz_convert('UTC').dt.tz_localize(None).dt.tz_localize(str(current_timezone))"""
+
         # Defining wrapper function that renames the column names to specific names
         if len(self.output_types) > 1:
-            code = f"""def renaming_wrapper(*args):
-    import pandas as pd
+            code = f"""import pandas as pd
+{convert_timestamp_function}
+def renaming_wrapper(*args):
    {self._formatted_function_source}
    df = {self.function_name}(*args)
    df = df.rename(columns = {{df.columns[i]: _output_col_names[i] for i in range(len(df.columns))}})
+    for col in df:
+        if pd.api.types.is_datetime64_any_dtype(df[col]):
+            df[col] = convert_timezone(df[col])
    return df"""
         else:
-            code = f"""def renaming_wrapper(*args):
-    import pandas as pd
+            code = f"""import pandas as pd
+{convert_timestamp_function}
+def renaming_wrapper(*args):
    {self._formatted_function_source}
    df = {self.function_name}(*args)
    df = df.rename(_output_col_names[0])
+    if pd.api.types.is_datetime64_any_dtype(df):
+        df = convert_timezone(df)
    return df"""

         # injecting variables into scope used to execute wrapper function.

diff --git a/python/hsfs/transformation_function_attached.py b/python/hsfs/transformation_function_attached.py
deleted file mode 100644
index ca4deceddb..0000000000
--- a/python/hsfs/transformation_function_attached.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright 2021.
Logical Clocks AB
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from __future__ import annotations
-
-import humps
-from hsfs import transformation_function as transformation_fn
-
-
-class TransformationFunctionAttached:
-    def __init__(
-        self,
-        name,
-        transformation_function,
-        type=None,
-        items=None,
-        count=None,
-        href=None,
-        **kwargs,
-    ):
-        self._name = name
-        self._transformation_function = (
-            transformation_fn.TransformationFunction.from_response_json(
-                transformation_function
-            )
-            if isinstance(transformation_function, dict)
-            else transformation_function
-        )
-
-    @classmethod
-    def from_response_json(cls, json_dict):
-        json_decamelized = humps.decamelize(json_dict)
-        if "count" in json_decamelized:
-            if json_decamelized["count"] == 0:
-                return []
-            return [cls(**tffn_dto) for tffn_dto in json_decamelized["items"]]
-        else:
-            return cls(**json_decamelized)
-
-    def update_from_response_json(self, json_dict):
-        json_decamelized = humps.decamelize(json_dict)
-        self.__init__(**json_decamelized)
-        return self
-
-    @property
-    def name(self):
-        """Set feature name."""
-        return self._name
-
-    @name.setter
-    def name(self, name):
-        self._name = name
-
-    @property
-    def transformation_function(self):
-        """Set transformation functions."""
-        return self._transformation_function
-
-    @transformation_function.setter
-    def transformation_function(self, transformation_function):
-        self._transformation_function = transformation_function

diff --git a/python/pyproject.toml b/python/pyproject.toml
index 2b3d69db4d..1ad6c8c5f4 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -143,6 +143,7 @@ exclude = [
     "site-packages",
     "venv",
     "java",
+    "python/tests/transformations_test_helper/" # transformations_test_helper is excluded from formatting and linting because its exact formatting is required for the test cases
 ]

 # Same as Black.
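As context for the `_apply_transformation_function` changes in the spark engine above, a hedged standalone sketch of the Spark mechanism they rely on: a pandas UDF declared with a struct schema yields one nested column, which is aliased and then expanded with "<name>.*", mirroring the `explode_name` bookkeeping for multi-output UDFs. The column and function names here are illustrative, not taken from the patch:

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(pd.DataFrame({"col1": [1, 2]}))


# A pandas UDF with a struct return schema returns a pd.DataFrame per batch ...
@pandas_udf("`plus_two_col1_0` bigint, `plus_two_col1_1` bigint")
def plus_two(col1: pd.Series) -> pd.DataFrame:
    return pd.DataFrame({"plus_two_col1_0": col1 + 1, "plus_two_col1_1": col1 + 2})


# ... and selecting "<alias>.*" flattens the struct into separate output columns.
result = df.select(plus_two("col1").alias("out")).select("out.*")
result.show()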
diff --git a/python/tests/core/test_arrow_flight_client.py b/python/tests/core/test_arrow_flight_client.py index 0b647aedf1..faa480c6ad 100644 --- a/python/tests/core/test_arrow_flight_client.py +++ b/python/tests/core/test_arrow_flight_client.py @@ -77,9 +77,6 @@ def _arrange_featureview_mocks(self, mocker, backend_fixtures): "hsfs.core.feature_view_engine.FeatureViewEngine.get_batch_query", return_value=fg.select_all(), ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mocker.patch("hsfs.engine.python.Engine._apply_transformation_function") # required for batch query diff --git a/python/tests/core/test_feature_view_engine.py b/python/tests/core/test_feature_view_engine.py index e50868285d..b1fb7ee08a 100644 --- a/python/tests/core/test_feature_view_engine.py +++ b/python/tests/core/test_feature_view_engine.py @@ -1641,7 +1641,6 @@ def test_get_training_dataset_metadata(self, mocker): # Assert assert mock_fv_api.return_value.get_training_dataset_by_version.call_count == 1 assert result.schema == fv.schema - assert result.transformation_functions == fv.transformation_functions def test_create_training_data_metadata(self, mocker): # Arrange diff --git a/python/tests/core/test_training_dataset_engine.py b/python/tests/core/test_training_dataset_engine.py index a1e28c49ae..5e77445971 100644 --- a/python/tests/core/test_training_dataset_engine.py +++ b/python/tests/core/test_training_dataset_engine.py @@ -23,6 +23,7 @@ ) from hsfs.constructor import query from hsfs.core import training_dataset_engine +from hsfs.hopsworks_udf import hopsworks_udf class TestTrainingDatasetEngine: @@ -111,20 +112,18 @@ def test_save_transformation_functions(self, mocker): feature_store_id = 99 mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.transformation_function.TransformationFunction._extract_source_code" - ) mocker.patch( "hsfs.core.transformation_function_engine.TransformationFunctionEngine" ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") + @hopsworks_udf(int) def plus_one(a): return a + 1 tf = transformation_function.TransformationFunction( - 1, plus_one, 1, "plus_one", output_type=str + hopsworks_udf=plus_one, featurestore_id=99 ) td = training_dataset.TrainingDataset( diff --git a/python/tests/core/test_transformation_function_engine.py b/python/tests/core/test_transformation_function_engine.py index 29e20f3cac..51dd623ef1 100644 --- a/python/tests/core/test_transformation_function_engine.py +++ b/python/tests/core/test_transformation_function_engine.py @@ -306,7 +306,7 @@ def testFunction1(col1): # Assert assert ( mock_s_engine.return_value.compute_transformation_fn_statistics.call_count - == 1 + == 0 ) def test_compute_and_set_feature_statistics_train_test_split(self, mocker): @@ -364,7 +364,7 @@ def testFunction1(col1): # Assert assert ( mock_s_engine.return_value.compute_transformation_fn_statistics.call_count - == 1 + == 0 ) def test_get_and_set_feature_statistics_no_statistics_required(self, mocker): diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index 88ff95a34b..55267cc7ce 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -3234,7 +3234,7 @@ def test_add_file(self): def test_apply_transformation_function_pandas(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") - + engine._engine_type = "python" 
python_engine = python.Engine() @hopsworks_udf(int) @@ -3271,10 +3271,98 @@ def plus_one(col1): assert result["plus_one_tf_name_"][0] == 2 assert result["plus_one_tf_name_"][1] == 3 + def test_apply_transformation_function_multiple_output(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "python" + python_engine = python.Engine() + + @hopsworks_udf([int, int]) + def plus_two(col1): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col1 + 2}) + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[plus_two], + ) + + df = pd.DataFrame(data={"col1": [1, 2], "col2": [10, 11]}) + + # Act + result = python_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=df + ) + + # Assert + assert all(result.columns == ["col2", "plus_two_col1_0", "plus_two_col1_1"]) + assert len(result) == 2 + assert result["plus_two_col1_0"][0] == 2 + assert result["plus_two_col1_0"][1] == 3 + assert result["plus_two_col1_1"][0] == 3 + assert result["plus_two_col1_1"][1] == 4 + + def test_apply_transformation_function_multiple_input_output(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + + engine._engine_type = "python" + python_engine = python.Engine() + + @hopsworks_udf([int, int]) + def plus_two(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[plus_two], + ) + + df = pd.DataFrame(data={"col1": [1, 2], "col2": [10, 11]}) + + # Act + result = python_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=df + ) + + # Assert + assert all(result.columns == ["plus_two_col1-col2_0", "plus_two_col1-col2_1"]) + assert len(result) == 2 + assert result["plus_two_col1-col2_0"][0] == 2 + assert result["plus_two_col1-col2_0"][1] == 3 + assert result["plus_two_col1-col2_1"][0] == 12 + assert result["plus_two_col1-col2_1"][1] == 13 + def test_apply_transformation_function_polars(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + engine._engine_type = "python" python_engine = python.Engine() @hopsworks_udf(int) diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 5c7d76add0..09300059f3 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -23,6 +23,7 @@ expectation_suite, feature, feature_group, + feature_view, storage_connector, training_dataset, training_dataset_feature, @@ -33,6 +34,7 @@ from hsfs.constructor import hudi_feature_group_alias, query from hsfs.core import training_dataset_engine from hsfs.engine import spark +from hsfs.hopsworks_udf import hopsworks_udf from hsfs.training_dataset_feature import TrainingDatasetFeature from pyspark.sql import DataFrame from pyspark.sql.types import ( @@ -1729,9 +1731,6 @@ def test_write_training_dataset(self, mocker): mock_spark_engine_convert_to_default_dataframe = 
mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -1806,7 +1805,24 @@ def test_write_training_dataset_to_df(self, mocker, backend_fixtures): statistics_config=None, training_dataset_type=training_dataset.TrainingDataset.IN_MEMORY, extra_filter=None, - transformation_functions={}, + ) + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], ) # Act @@ -1816,7 +1832,7 @@ def test_write_training_dataset_to_df(self, mocker, backend_fixtures): user_write_options={}, save_mode=training_dataset_engine.TrainingDatasetEngine.OVERWRITE, read_options={}, - feature_view_obj=None, + feature_view_obj=fv, to_df=True, ) @@ -1846,6 +1862,24 @@ def test_write_training_dataset_split_to_df(self, mocker, backend_fixtures): query_df = spark_engine._spark_session.createDataFrame(df) mock_query_read.side_effect = [query_df] + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=None, @@ -1865,7 +1899,6 @@ def test_write_training_dataset_split_to_df(self, mocker, backend_fixtures): training_dataset_type=training_dataset.TrainingDataset.IN_MEMORY, extra_filter=None, seed=1, - transformation_functions={}, ) # Act @@ -1875,7 +1908,7 @@ def test_write_training_dataset_split_to_df(self, mocker, backend_fixtures): user_write_options={}, save_mode=training_dataset_engine.TrainingDatasetEngine.OVERWRITE, read_options={}, - feature_view_obj=None, + feature_view_obj=fv, to_df=True, ) @@ -1897,9 +1930,6 @@ def test_write_training_dataset_query(self, mocker): mock_spark_engine_convert_to_default_dataframe = mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -1910,6 +1940,24 @@ def test_write_training_dataset_query(self, mocker): spark_engine = spark.Engine() + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=1, @@ -1927,7 +1975,7 @@ def test_write_training_dataset_query(self, mocker): user_write_options=None, save_mode=None, read_options=None, - feature_view_obj=None, + feature_view_obj=fv, to_df=None, ) @@ -1948,9 +1996,6 @@ def 
test_write_training_dataset_query_coalesce(self, mocker): mock_spark_engine_convert_to_default_dataframe = mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -1961,6 +2006,24 @@ def test_write_training_dataset_query_coalesce(self, mocker): spark_engine = spark.Engine() + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=1, @@ -1979,7 +2042,7 @@ def test_write_training_dataset_query_coalesce(self, mocker): user_write_options=None, save_mode=None, read_options=None, - feature_view_obj=None, + feature_view_obj=fv, to_df=None, ) @@ -2000,9 +2063,6 @@ def test_write_training_dataset_td_splits(self, mocker): mock_spark_engine_convert_to_default_dataframe = mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -2013,6 +2073,24 @@ def test_write_training_dataset_td_splits(self, mocker): spark_engine = spark.Engine() + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=1, @@ -2034,7 +2112,7 @@ def test_write_training_dataset_td_splits(self, mocker): user_write_options=None, save_mode=None, read_options=None, - feature_view_obj=None, + feature_view_obj=fv, to_df=None, ) @@ -2056,9 +2134,6 @@ def test_write_training_dataset_td_splits_coalesce(self, mocker): mock_spark_engine_convert_to_default_dataframe = mocker.patch( "hsfs.engine.spark.Engine.convert_to_default_dataframe" ) - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine.populate_builtin_transformation_functions" - ) mock_spark_engine_write_training_dataset_single = mocker.patch( "hsfs.engine.spark.Engine._write_training_dataset_single" ) @@ -2069,6 +2144,24 @@ def test_write_training_dataset_td_splits_coalesce(self, mocker): spark_engine = spark.Engine() + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[], + ) + td = training_dataset.TrainingDataset( name="test", version=1, @@ -2091,7 +2184,7 @@ def test_write_training_dataset_td_splits_coalesce(self, mocker): user_write_options=None, save_mode=None, read_options=None, - feature_view_obj=None, + 
feature_view_obj=fv, to_df=None, ) @@ -2575,20 +2668,15 @@ def test_write_training_dataset_splits(self, mocker): spark_engine = spark.Engine() - def plus_one(a) -> int: - return a + 1 + @hopsworks_udf(int) + def plus_one(col1): + return col1 + 1 tf = transformation_function.TransformationFunction( featurestore_id=99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="int", + hopsworks_udf=plus_one, ) - transformation_fn_dict = dict() - - transformation_fn_dict["col_0"] = tf - f = training_dataset_feature.TrainingDatasetFeature( name="col_0", type=IntegerType(), index=0 ) @@ -2603,7 +2691,6 @@ def plus_one(a) -> int: data_format="CSV", featurestore_id=99, splits={}, - transformation_functions=transformation_fn_dict, features=features, ) @@ -2614,6 +2701,7 @@ def plus_one(a) -> int: write_options=None, save_mode=None, to_df=False, + transformation_functions=[tf("col_0")], ) # Assert @@ -2629,14 +2717,13 @@ def test_write_training_dataset_splits_to_df(self, mocker): spark_engine = spark.Engine() - def plus_one(a) -> int: - return a + 1 + @hopsworks_udf(int) + def plus_one(col1): + return col1 + 1 tf = transformation_function.TransformationFunction( featurestore_id=99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="int", + hopsworks_udf=plus_one, ) transformation_fn_dict = dict() @@ -2668,6 +2755,7 @@ def plus_one(a) -> int: write_options=None, save_mode=None, to_df=True, + transformation_functions=[tf("col_0")], ) # Assert @@ -4234,42 +4322,100 @@ def test_save_empty_dataframe(self, mocker): assert mock_spark_engine_save_dataframe.call_count == 1 assert mock_spark_table.call_count == 1 - def test_apply_transformation_function(self, mocker): + def test_apply_transformation_function_single_output(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") - + engine._engine_type = "spark" spark_engine = spark.Engine() - def plus_one(a) -> int: - return a + 1 + @hopsworks_udf(int) + def plus_one(col1): + return col1 + 1 tf = transformation_function.TransformationFunction( + 99, + hopsworks_udf=plus_one, + ) + + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=BooleanType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, featurestore_id=99, - transformation_fn=plus_one, - builtin_source_code="", - output_type="long", + primary_key=[], + partition_key=[], + features=features, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0")], ) - transformation_fn_dict = dict() + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [True, False]} + df = pd.DataFrame(data=d) - transformation_fn_dict["col_0"] = tf + spark_df = spark_engine._spark_session.createDataFrame(df) - f = training_dataset_feature.TrainingDatasetFeature( - name="col_0", type=IntegerType(), index=0 + expected_df = pd.DataFrame( + data={ + "col_1": ["test_1", "test_2"], + "col_2": [True, False], + "plus_one_col_0_": [2, 3], + } + ) # todo why it doesnt return int? 
+ + expected_spark_df = spark_engine._spark_session.createDataFrame(expected_df) + + # Act + result = spark_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, + dataset=spark_df, ) - f1 = training_dataset_feature.TrainingDatasetFeature( - name="col_1", type=StringType(), index=1 + # Assert + assert result.schema == expected_spark_df.schema + assert result.collect() == expected_spark_df.collect() + + def test_apply_transformation_function_multiple_output(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "spark" + spark_engine = spark.Engine() + + @hopsworks_udf([int, int]) + def plus_two(col1): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col1 + 2}) + + tf = transformation_function.TransformationFunction( + 99, + hopsworks_udf=plus_two, ) - features = [f, f1] - td = training_dataset.TrainingDataset( - name="test", + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=BooleanType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", version=1, - data_format="CSV", featurestore_id=99, - splits={}, + primary_key=[], + partition_key=[], features=features, - transformation_functions=transformation_fn_dict, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0")], ) d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [True, False]} @@ -4279,9 +4425,10 @@ def plus_one(a) -> int: expected_df = pd.DataFrame( data={ - "col_0": [2, 3], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "plus_two_col_0_0": [2, 3], + "plus_two_col_0_1": [3, 4], } ) # todo why it doesnt return int? @@ -4289,10 +4436,69 @@ def plus_one(a) -> int: # Act result = spark_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, + transformation_functions=fv.transformation_functions, dataset=spark_df, ) + # Assert + assert result.schema == expected_spark_df.schema + assert result.collect() == expected_spark_df.collect() + + def test_apply_transformation_function_multiple_input_output(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "spark" + spark_engine = spark.Engine() + + @hopsworks_udf([int, int]) + def test(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + tf = transformation_function.TransformationFunction( + 99, + hopsworks_udf=test, + ) + + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=IntegerType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=features, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0", "col_2")], + ) + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [10, 11]} + df = pd.DataFrame(data=d) + + spark_df = spark_engine._spark_session.createDataFrame(df) + + expected_df = pd.DataFrame( + data={ + "col_1": ["test_1", "test_2"], + "test_col_0-col_2_0": [2, 3], + "test_col_0-col_2_1": [12, 13], + } + ) # todo why it doesnt return int? 
+ + expected_spark_df = spark_engine._spark_session.createDataFrame(expected_df) + + # Act + result = spark_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, + dataset=spark_df, + ) # Assert assert result.schema == expected_spark_df.schema assert result.collect() == expected_spark_df.collect() diff --git a/python/tests/fixtures/backend_fixtures.py b/python/tests/fixtures/backend_fixtures.py index 34a2c9e594..5a7029172f 100644 --- a/python/tests/fixtures/backend_fixtures.py +++ b/python/tests/fixtures/backend_fixtures.py @@ -56,7 +56,6 @@ "training_dataset_feature", "training_dataset", "training_dataset_split", - "transformation_function_attached", "transformation_function", "user", "validation_report", diff --git a/python/tests/pyproject.toml b/python/tests/pyproject.toml index 15a77ff4fd..3d36a4588e 100644 --- a/python/tests/pyproject.toml +++ b/python/tests/pyproject.toml @@ -8,6 +8,8 @@ ignore = [ # Allow fix for all enabled rules (when `--fix`) is provided. fixable = ["ALL"] unfixable = [] +# transformations_test_helper excluded from fomating and linting because the used formating is required for the test cases +exclude = ["transformations_test_helper/"] # Allow unused variables when underscore-prefixed. dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" @@ -29,3 +31,9 @@ skip-magic-trailing-comma = false # Like Black, automatically detect the appropriate line ending. line-ending = "auto" + +[tool.pytest.ini_options] +pythonpath = [ + ".", "tests" +] +addopts = "--ignore=python/tests/transformations_test_helper/" diff --git a/python/tests/test_transformation_function_attached.py b/python/tests/test_transformation_function_attached.py deleted file mode 100644 index 85effdd06e..0000000000 --- a/python/tests/test_transformation_function_attached.py +++ /dev/null @@ -1,88 +0,0 @@ -# -# Copyright 2022 Hopsworks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - - -from hsfs import transformation_function, transformation_function_attached - - -class TestTransformationFunctionAttached: - def test_from_response_json(self, backend_fixtures): - # Arrange - json = backend_fixtures["transformation_function_attached"]["get"]["response"] - - # Act - tf_attached = transformation_function_attached.TransformationFunctionAttached.from_response_json( - json - ) - - # Assert - assert tf_attached.name == "test_name" - assert isinstance( - tf_attached.transformation_function, - transformation_function.TransformationFunction, - ) - - def test_from_response_json_basic_info(self, backend_fixtures): - # Arrange - json = backend_fixtures["transformation_function_attached"]["get_basic_info"][ - "response" - ] - - # Act - tf_attached = transformation_function_attached.TransformationFunctionAttached.from_response_json( - json - ) - - # Assert - assert tf_attached.name == "test_name" - assert isinstance( - tf_attached.transformation_function, - transformation_function.TransformationFunction, - ) - - def test_from_response_json_list(self, backend_fixtures): - # Arrange - json = backend_fixtures["transformation_function_attached"]["get_list"][ - "response" - ] - - # Act - tf_attached_list = transformation_function_attached.TransformationFunctionAttached.from_response_json( - json - ) - - # Assert - assert len(tf_attached_list) == 1 - tf_attached = tf_attached_list[0] - assert tf_attached.name == "test_name" - assert isinstance( - tf_attached.transformation_function, - transformation_function.TransformationFunction, - ) - - def test_from_response_json_list_empty(self, backend_fixtures): - # Arrange - json = backend_fixtures["transformation_function_attached"]["get_list_empty"][ - "response" - ] - - # Act - tf_attached_list = transformation_function_attached.TransformationFunctionAttached.from_response_json( - json - ) - - # Assert - assert len(tf_attached_list) == 0 From 5608c18d15d9b3665bb374a48dbd9c2cb1debdbc Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 13 May 2024 13:26:15 +0200 Subject: [PATCH 19/58] all unit tests working --- python/hsfs/builtin_transformations.py | 67 ++ .../core/builtin_transformation_function.py | 107 --- python/hsfs/feature_store.py | 2 +- python/hsfs/feature_view.py | 7 +- python/hsfs/hopsworks_udf.py | 70 +- python/hsfs/transformation_function.py | 5 +- ...t_python_spark_transformation_functions.py | 710 +++++++++++------- .../tests/fixtures/feature_view_fixtures.json | 27 +- .../transformation_function_fixtures.json | 18 + python/tests/test_transformation_function.py | 24 + 10 files changed, 592 insertions(+), 445 deletions(-) create mode 100644 python/hsfs/builtin_transformations.py delete mode 100644 python/hsfs/core/builtin_transformation_function.py diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py new file mode 100644 index 0000000000..d17ae6f1fa --- /dev/null +++ b/python/hsfs/builtin_transformations.py @@ -0,0 +1,67 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pandas as pd +from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics +from hsfs.hopsworks_udf import hopsworks_udf + + +@hopsworks_udf(float) +def min_max_scaler(feature: pd.Series, statistics_feature) -> pd.Series: + return (feature - statistics_feature.min) / ( + statistics_feature.max - statistics_feature.min + ) + + +@hopsworks_udf(float) +def standard_scaler( + feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics +) -> pd.Series: + return (feature - statistics_feature.mean) / statistics_feature.stddev + + +@hopsworks_udf(float) +def robust_scaler( + feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics +) -> pd.Series: + return (feature - statistics_feature.percentiles[49]) / ( + statistics_feature.percentiles[74] - statistics_feature.percentiles[24] + ) + + +# @hopsworks_udf(int) +def label_encoder( + feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics +) -> pd.Series: + unique_data = [ + value for value in statistics_feature.extended_statistics["unique_values"] + ] + value_to_index = {value: index for index, value in enumerate(unique_data)} + return pd.Series([value_to_index[data] for data in feature]) + + +def one_hot_encoder( + feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics +) -> pd.Series: + unique_data = [ + value for value in statistics_feature.extended_statistics["unique_values"] + ] + print(statistics_feature.extended_statistics["unique_values"]) + one_hot = pd.get_dummies(feature, dtype="bool") + for data in unique_data: + if data not in one_hot: + one_hot[data] = False + return one_hot diff --git a/python/hsfs/core/builtin_transformation_function.py b/python/hsfs/core/builtin_transformation_function.py deleted file mode 100644 index 7ef5b63555..0000000000 --- a/python/hsfs/core/builtin_transformation_function.py +++ /dev/null @@ -1,107 +0,0 @@ -# -# Copyright 2021 Logical Clocks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -from typing import List - -from hsfs.client.exceptions import FeatureStoreException -from hsfs.core import feature_descriptive_statistics as fds - - -class BuiltInTransformationFunction: - def __init__(self, method): - self._method = method.lower() - - @staticmethod - def min_max_scaler_stats( - feature_descriptive_stats: List[fds.FeatureDescriptiveStatistics], - feature_name: str, - ): - min_value = None - max_value = None - for stats in feature_descriptive_stats: - if stats.feature_name == feature_name: - if stats.feature_type not in ["Integral", "Fractional", "Decimal"]: - raise ValueError("Can't compute min_max_scaler for this type") - min_value = stats.min - max_value = stats.max - - if min_value is None or max_value is None: - raise FeatureStoreException( - "Feature {feature_name:} doesn't have minimum and/or maximum values computed. 
diff --git a/python/hsfs/core/builtin_transformation_function.py b/python/hsfs/core/builtin_transformation_function.py
deleted file mode 100644
index 7ef5b63555..0000000000
--- a/python/hsfs/core/builtin_transformation_function.py
+++ /dev/null
@@ -1,107 +0,0 @@
-#
-# Copyright 2021 Logical Clocks AB
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-from __future__ import annotations
-
-from typing import List
-
-from hsfs.client.exceptions import FeatureStoreException
-from hsfs.core import feature_descriptive_statistics as fds
-
-
-class BuiltInTransformationFunction:
-    def __init__(self, method):
-        self._method = method.lower()
-
-    @staticmethod
-    def min_max_scaler_stats(
-        feature_descriptive_stats: List[fds.FeatureDescriptiveStatistics],
-        feature_name: str,
-    ):
-        min_value = None
-        max_value = None
-        for stats in feature_descriptive_stats:
-            if stats.feature_name == feature_name:
-                if stats.feature_type not in ["Integral", "Fractional", "Decimal"]:
-                    raise ValueError("Can't compute min_max_scaler for this type")
-                min_value = stats.min
-                max_value = stats.max
-
-        if min_value is None or max_value is None:
-            raise FeatureStoreException(
-                "Feature {feature_name:} doesn't have minimum and/or maximum values computed. Thus can't use "
-                "min_max_scaler method".format(feature_name=feature_name)
-            )
-        return min_value, max_value
-
-    @staticmethod
-    def standard_scaler_stats(
-        feature_descriptive_stats: List[fds.FeatureDescriptiveStatistics],
-        feature_name: str,
-    ):
-        mean = None
-        std_dev = None
-        for stats in feature_descriptive_stats:
-            if stats.feature_name == feature_name:
-                if stats.feature_type not in ["Integral", "Fractional", "Decimal"]:
-                    raise ValueError("Can't compute standard_scaler for this type")
-                mean = stats.mean
-                std_dev = stats.stddev
-
-        if mean is None or std_dev is None:
-            raise FeatureStoreException(
-                "Feature {feature_name:} doesn't have mean and/or standard deviation computed. Thus can't use "
-                "standard_scaler method".format(feature_name=feature_name)
-            )
-        return mean, std_dev
-
-    @staticmethod
-    def robust_scaler_stats(
-        feature_descriptive_stats: List[fds.FeatureDescriptiveStatistics],
-        feature_name: str,
-    ):
-        percentiles = None
-        for stats in feature_descriptive_stats:
-            if stats.feature_name == feature_name:
-                if stats.feature_type not in ["Integral", "Fractional", "Decimal"]:
-                    raise ValueError("Can't compute robust_scaler for this type")
-                if stats.percentiles is not None and len(stats.percentiles) > 0:
-                    percentiles = stats.percentiles
-
-        if percentiles is None:
-            raise FeatureStoreException(
-                "Feature {feature_name:} doesn't have mean and/or standard deviation computed. Thus can't use "
-                "standard_scaler method".format(feature_name=feature_name)
-            )
-        return percentiles
-
-    @staticmethod
-    def encoder_stats(
-        feature_descriptive_stats: List[fds.FeatureDescriptiveStatistics],
-        feature_name: str,
-    ):
-        for stats in feature_descriptive_stats:
-            if (
-                stats.feature_name == feature_name
-                and stats.extended_statistics is not None
-                and "unique_values" in stats.extended_statistics
-            ):
-                unique_data = [
-                    value for value in stats.extended_statistics["unique_values"]
-                ]
-                value_to_index = dict(
-                    (value, index) for index, value in enumerate(unique_data)
-                )
-                return value_to_index
diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py
index 10f6a269bc..41d1a754ff 100644
--- a/python/hsfs/feature_store.py
+++ b/python/hsfs/feature_store.py
@@ -1316,7 +1316,7 @@ def plus_one(value):
         """
         return TransformationFunction(
             featurestore_id=self._id,
-            transformation_fn=transformation_function,
+            hopsworks_udf=transformation_function,
             version=version,
         )
 
diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py
index 7c8a914dd4..5b90fabfc2 100644
--- a/python/hsfs/feature_view.py
+++ b/python/hsfs/feature_view.py
@@ -3422,9 +3422,10 @@ def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView":
             description=json_decamelized.get("description", None),
             featurestore_name=json_decamelized.get("featurestore_name", None),
             serving_keys=serving_keys,
-            transformation_functions=TransformationFunction.from_response_json(
-                transformation_functions
-            )
+            transformation_functions=[
+                TransformationFunction.from_response_json(transformation_function)
+                for transformation_function in transformation_functions
+            ]
             if transformation_functions
             else [],
         )
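With BuiltInTransformationFunction removed, statistics are attached straight
onto the bound UDF object, as the reworked tests later in this patch do. A
sketch using only names that appear in those tests:

    from hsfs.builtin_transformations import min_max_scaler
    from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics

    udf = min_max_scaler("col_0")  # bind the UDF to a concrete input feature
    udf.transformation_statistics = [
        FeatureDescriptiveStatistics(feature_name="col_0", min=1, max=2)
    ]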
diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py
index 554a3de9fd..9ed60ead0d 100644
--- a/python/hsfs/hopsworks_udf.py
+++ b/python/hsfs/hopsworks_udf.py
@@ -144,8 +144,10 @@ def __init__(
             else transformation_features
         )
 
-        self._formatted_function_source = HopsworksUdf._format_source_code(
-            self._function_source, self._transformation_features
+        self._formatted_function_source, self._module_imports = (
+            HopsworksUdf._format_source_code(
+                self._function_source, self._transformation_features
+            )
         )
 
         self._output_column_names: List[str] = self._get_output_column_names()
@@ -214,30 +216,6 @@ def _get_module_imports(path: str) -> List[str]:
                 imports.append(import_line)
         return imports
 
-    @staticmethod
-    def _get_module_path(module_name: str) -> str:
-        """
-        Function that returns the path to the source code of a python module.
-
-        Cannot extract path if the module is defined in a jupyter notebook since it is currently impossible find the path of a jupyter notebook.(https://github.com/ipython/ipython/issues/10123)
-
-        # Arguments
-            path: `str`. Path to python file from which imports are to be extracted.
-        # Raises
-            AttributeError : If the provided module is defined in a jupyter notebook.
-        # Returns
-            `str`: a string that contains the path to the module
-        """
-
-        def _get_module_path(module):
-            return module.__file__
-
-        module_path = {}
-        exec(
-            f'import {module_name}\nmodule_path["path"] = _get_module_path({module_name})'
-        )
-        return module_path["path"]
-
     @staticmethod
     def _extract_source_code(udf_function: Callable) -> str:
         """
@@ -252,12 +230,12 @@ def _extract_source_code(udf_function: Callable) -> str:
         """
         try:
             module_imports = HopsworksUdf._get_module_imports(
-                HopsworksUdf._get_module_path(udf_function.__module__)
+                inspect.getfile(udf_function)
            )
-        except AttributeError:
+        except FileNotFoundError:
             module_imports = [""]
             warnings.warn(
-                "Passed UDF defined in a Jupyter notebook. Cannot extract import dependencies from a notebook. Please make sure to import all dependencies for the UDF inside the function.",
+                "Cannot extract imported dependencies for the function module. Please make sure to import all dependencies for the UDF inside the function.",
                 stacklevel=2,
             )
 
@@ -340,7 +318,7 @@ def _extract_function_arguments(source_code: str) -> List[TransformationFeature]:
     @staticmethod
     def _format_source_code(
         source_code: str, transformation_features: List[TransformationFeature]
-    ) -> str:
+    ) -> Tuple[str, str]:
         """
         Function that parses the existing source code to remove statistics parameter and remove all decorators and type hints from the function source code.
 
         # Arguments
             source_code: `str`. Source code of a function.
             transformation_features `List[TransformationFeature]`: List of transformation features provided in the function argument.
         # Returns
-            `str`: Source code that does not contain any decorators, type hints or statistics parameters.
+ `Tuple[str, str]`: Tuple that contains Source code that does not contain any decorators, type hints or statistics parameters and the module imports """ _, signature, _, signature_end_line = HopsworksUdf._parse_function_signature( source_code ) - + module_imports = source_code.split("@")[0] arg_list = [feature.feature_name for feature in transformation_features] # Reconstruct the function signature @@ -367,8 +345,7 @@ def _format_source_code( new_signature + "\n\t" + "\n\t".join(source_code[signature_end_line + 1 :]) ) - # Define a new function with the modified source code - return modified_source + return modified_source, module_imports def _get_output_column_names(self) -> str: """ @@ -423,11 +400,14 @@ def hopsworksUdf_wrapper(self) -> Callable: return date_time_col.dt.tz_localize(str(current_timezone)) else: # convert to utc, then localize to system's timezone - return date_time_col.dt.tz_convert('UTC').dt.tz_localize(None).dt.tz_localize(str(current_timezone))""" + return date_time_col.dt.tz_localize(None).dt.tz_localize(str(current_timezone))""" # Defining wrapper function that renames the column names to specific names if len(self.output_types) > 1: - code = f"""import pandas as pd + code = ( + self._module_imports + + "\n" + + f"""import pandas as pd {convert_timstamp_function} def renaming_wrapper(*args): {self._formatted_function_source} @@ -437,8 +417,12 @@ def renaming_wrapper(*args): if pd.api.types.is_datetime64_any_dtype(df[col]): df[col] = convert_timezone(df[col]) return df""" + ) else: - code = f"""import pandas as pd + code = ( + self._module_imports + + "\n" + + f"""import pandas as pd {convert_timstamp_function} def renaming_wrapper(*args): {self._formatted_function_source} @@ -447,13 +431,13 @@ def renaming_wrapper(*args): if pd.api.types.is_datetime64_any_dtype(df): df = convert_timezone(df) return df""" - + ) + print(code) # injecting variables into scope used to execute wrapper function. scope = __import__("__main__").__dict__ if self.transformation_statistics is not None: scope.update(self.transformation_statistics) scope.update({"_output_col_names": self.output_column_names}) - # executing code exec(code, scope) @@ -524,12 +508,8 @@ def to_dict(self) -> Dict[str, Any]: `Dict`: Dictionary that contains all data required to json serialize the object. 
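For reference, the serialized UDF payload produced here carries the same four
keys the backend fixtures and from_response_json expect; an illustrative, not
authoritative, example (the fixtures store outputTypes and
transformationFeatures as strings, while the in-memory values are lists):

    {
        "sourceCode": "@hopsworks_udf(float)\ndef add_one(col1):\n    return col1 + 1",
        "outputTypes": ["double"],
        "transformationFeatures": ["col1"],
        "name": "add_one",
    }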
""" return { - "sourceCode": self._original_code, - "outputTypes": ",".join( - [python_type.__name__ for python_type in self.output_types] - ) - if isinstance(self.output_types, List) - else self.output_types.__name__, + "sourceCode": self._function_source, + "outputTypes": self.output_types, "transformationFeatures": self.transformation_features, "name": self._function_name, } diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index 4e23853c73..b6ef060cb9 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -154,7 +154,10 @@ def from_response_json( tffn_dto["hopsworks_udf"] = HopsworksUdf.from_response_json( tffn_dto["hopsworks_udf"] ) - return [cls(**tffn_dto) for tffn_dto in json_decamelized["items"]] + if json_decamelized["count"] == 1: + return cls(**json_decamelized["items"][0]) + else: + return [cls(**tffn_dto) for tffn_dto in json_decamelized["items"]] else: if json_decamelized.get("hopsworks_udf", False): json_decamelized["hopsworks_udf"] = HopsworksUdf.from_response_json( diff --git a/python/tests/engine/test_python_spark_transformation_functions.py b/python/tests/engine/test_python_spark_transformation_functions.py index 0e25037751..f5763ea548 100644 --- a/python/tests/engine/test_python_spark_transformation_functions.py +++ b/python/tests/engine/test_python_spark_transformation_functions.py @@ -18,24 +18,25 @@ import datetime import statistics -import numpy as np import pandas as pd import pytest -import pytz import tzlocal from hsfs import ( + engine, training_dataset, training_dataset_feature, transformation_function, ) +from hsfs.client.exceptions import FeatureStoreException from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics -from hsfs.core.transformation_function_engine import TransformationFunctionEngine from hsfs.engine import python, spark +from hsfs.hopsworks_udf import HopsworksUdf, hopsworks_udf from pyspark.sql.types import ( BooleanType, DateType, DoubleType, IntegerType, + LongType, StringType, StructField, StructType, @@ -44,27 +45,7 @@ class TestPythonSparkTransformationFunctions: - def _create_training_dataset( - self, tf_fun, output_type=None, name=None, col="col_0" - ): - if isinstance(tf_fun, str): - tf = transformation_function.TransformationFunction( - name=name, - featurestore_id=99, - transformation_fn=None, - source_code_content=tf_fun, - output_type=output_type, - ) - else: - tf = transformation_function.TransformationFunction( - featurestore_id=99, - transformation_fn=tf_fun, - builtin_source_code=None, - output_type=output_type, - ) - transformation_fn_dict = dict() - transformation_fn_dict[col] = tf - + def _create_training_dataset(self): f = training_dataset_feature.TrainingDatasetFeature( name="col_0", type=IntegerType(), index=0 ) @@ -83,18 +64,18 @@ def _create_training_dataset( featurestore_id=99, splits={}, features=features, - transformation_functions=transformation_fn_dict, ) return td - def _validate_on_python_engine(self, td, df, expected_df): + def _validate_on_python_engine(self, td, df, expected_df, transformation_functions): # Arrange + engine._engine_type = "python" python_engine = python.Engine() # Act result = python_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, + transformation_functions=transformation_functions, dataset=df, ) @@ -102,13 +83,16 @@ def _validate_on_python_engine(self, td, df, expected_df): assert list(result.dtypes) == list(expected_df.dtypes) assert 
result.equals(expected_df) - def _validate_on_spark_engine(self, td, spark_df, expected_spark_df): + def _validate_on_spark_engine( + self, td, spark_df, expected_spark_df, transformation_functions + ): # Arrange + engine._engine_type = "spark" spark_engine = spark.Engine() # Act result = spark_engine._apply_transformation_function( - transformation_functions=td.transformation_functions, + transformation_functions=transformation_functions, dataset=spark_df, ) @@ -116,9 +100,10 @@ def _validate_on_spark_engine(self, td, spark_df, expected_spark_df): assert result.schema == expected_spark_df.schema assert result.collect() == expected_spark_df.collect() - def test_apply_builtin_minmax(self, mocker): + def test_apply_builtin_minmax_from_backend(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -139,16 +124,16 @@ def test_apply_builtin_minmax(self, mocker): expected_schema = StructType( [ - StructField("col_0", DoubleType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("min_max_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [0.5, 1.0], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "min_max_scaler_col_0_": [0.0, 1.0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( @@ -156,34 +141,43 @@ def test_apply_builtin_minmax(self, mocker): ) # Arrange - tf_fun = ( - '{"module_imports": "from datetime import datetime", "transformer_code": ' - '"def min_max_scaler(value, min_value,max_value):\\n if value is None:\\n ' - "return None\\n else:\\n try:\\n return (value - min_value) / (max_value - min_value)\\n" - ' except ZeroDivisionError:\\n return 0\\n"}' - ) - - td = self._create_training_dataset(tf_fun, "DOUBLE", "min_max_scaler") - - td.transformation_functions["col_0"] = ( - TransformationFunctionEngine.populate_builtin_fn_arguments( - "col_0", - td.transformation_functions["col_0"], - [ - FeatureDescriptiveStatistics( - feature_name="col_0", feature_type="Integral", min=0, max=2 - ) - ], + tf_fun_source = ( + "import pandas as pd\nfrom hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics\n" + "from hsfs.hopsworks_udf import hopsworks_udf\n" + "@hopsworks_udf(float)\ndef min_max_scaler(feature : pd.Series, statistics_feature : FeatureDescriptiveStatistics) -> pd.Series:\n" + " return (feature - statistics_feature.min)/(statistics_feature.max-statistics_feature.min)\n" + ) + udf_response = { + "sourceCode": tf_fun_source, + "outputTypes": "double", + "transformationFeatures": "", + "name": "min_max_scaler", + } + + tf_fun = HopsworksUdf.from_response_json(udf_response) + + td = self._create_training_dataset() + + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun("col_0"), featurestore_id=99 ) - ) + ] + + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", min=1, max=2) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_builtin_labelencoder(self, mocker): + def test_apply_builtin_minmax(self, mocker): # 
Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -204,53 +198,47 @@ def test_apply_builtin_labelencoder(self, mocker): expected_schema = StructType( [ - StructField("col_0", IntegerType(), True), - StructField("col_1", IntegerType(), True), + StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("min_max_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [1, 2], - "col_1": [0, 1], + "col_1": ["test_1", "test_2"], "col_2": [True, False], + "min_max_scaler_col_0_": [0.0, 1.0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df, schema=expected_schema ) - expected_df["col_1"] = expected_df["col_1"].astype(pd.Int32Dtype()) # Arrange - tf_fun = ( - '{"module_imports": "", "transformer_code": "# label encoder\\n' - "def label_encoder(value, value_to_index):\\n" - " # define a mapping of values to integers\\n" - ' return value_to_index[value]"}' - ) + from hsfs.builtin_transformations import min_max_scaler - td = self._create_training_dataset(tf_fun, "INT", "label_encoder", "col_1") + td = self._create_training_dataset() - td.transformation_functions["col_1"] = ( - TransformationFunctionEngine.populate_builtin_fn_arguments( - "col_1", - td.transformation_functions["col_1"], - [ - FeatureDescriptiveStatistics( - feature_name="col_1", - extended_statistics={"unique_values": ["test_1", "test_2"]}, - ) - ], + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=min_max_scaler("col_0"), featurestore_id=99 ) - ) + ] + + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", min=1, max=2) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_builtin_standard_scaler(self, mocker): + def test_apply_builtin_standard_scaler_from_backend(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -271,16 +259,16 @@ def test_apply_builtin_standard_scaler(self, mocker): expected_schema = StructType( [ - StructField("col_0", DoubleType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("standard_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [-1.0, 1.0], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "standard_scaler_col_0_": [-1.0, 1.0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( @@ -288,39 +276,44 @@ def test_apply_builtin_standard_scaler(self, mocker): ) # Arrange - tf_fun = ( - '{"module_imports": "from datetime import datetime", "transformer_code": "' - "def standard_scaler(value, mean, std_dev):\\n if value is None:\\n return None\\n " - "else:\\n try:\\n return (value - mean) / std_dev\\n except " - 'ZeroDivisionError:\\n return 0\\n"}' - ) - - td = self._create_training_dataset(tf_fun, "DOUBLE", "standard_scaler") - + tf_fun_source = ( + "import pandas as pd\nfrom hsfs.core.feature_descriptive_statistics import 
FeatureDescriptiveStatistics\n" + "from hsfs.hopsworks_udf import hopsworks_udf\n" + "@hopsworks_udf(float)\ndef standard_scaler(feature : pd.Series, statistics_feature : FeatureDescriptiveStatistics) -> pd.Series:\n" + " return (feature - statistics_feature.mean)/statistics_feature.stddev\n" + ) + udf_response = { + "sourceCode": tf_fun_source, + "outputTypes": "double", + "transformationFeatures": "", + "name": "standard_scaler", + } + + tf_fun = HopsworksUdf.from_response_json(udf_response) + + td = self._create_training_dataset() + + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun("col_0"), featurestore_id=99 + ) + ] mean = statistics.mean([1, 2]) stddev = statistics.pstdev([1, 2]) - td.transformation_functions["col_0"] = ( - TransformationFunctionEngine.populate_builtin_fn_arguments( - "col_0", - td.transformation_functions["col_0"], - [ - FeatureDescriptiveStatistics( - feature_name="col_0", - feature_type="Integral", - mean=mean, - stddev=stddev, - ) - ], - ) - ) + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", mean=mean, stddev=stddev) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_builtin_robustscaler(self, mocker): + def test_apply_builtin_standard_scaler(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -341,16 +334,16 @@ def test_apply_builtin_robustscaler(self, mocker): expected_schema = StructType( [ - StructField("col_0", DoubleType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("standard_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [-1.0, 0.0], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "standard_scaler_col_0_": [-1.0, 1.0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( @@ -358,40 +351,32 @@ def test_apply_builtin_robustscaler(self, mocker): ) # Arrange - tf_fun = ( - '{"module_imports": "from datetime import datetime", "transformer_code": "' - "def robust_scaler(value, p25, p50, p75):\\n if value is None:\\n " - "return None\\n else:\\n try:\\n return (value - p50) / (p75 - p25)\\n " - 'except ZeroDivisionError:\\n return 0\\n"}\n' - ) + from hsfs.builtin_transformations import standard_scaler - td = self._create_training_dataset(tf_fun, "DOUBLE", "robust_scaler") + td = self._create_training_dataset() - percentiles = [1] * 100 - percentiles[24] = 1 - percentiles[49] = 2 - percentiles[74] = 2 - td.transformation_functions["col_0"] = ( - TransformationFunctionEngine.populate_builtin_fn_arguments( - "col_0", - td.transformation_functions["col_0"], - [ - FeatureDescriptiveStatistics( - feature_name="col_0", - feature_type="Integral", - percentiles=percentiles, - ) - ], + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=standard_scaler("col_0"), featurestore_id=99 ) - ) + ] + + mean = statistics.mean([1, 2]) + stddev = statistics.pstdev([1, 2]) + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + 
FeatureDescriptiveStatistics(feature_name="col_0", mean=mean, stddev=stddev) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_int(self, mocker): + def test_apply_builtin_robust_scaler_from_backend(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -412,36 +397,63 @@ def test_apply_plus_one_int(self, mocker): expected_schema = StructType( [ - StructField("col_0", IntegerType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("robust_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [2, 3], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "robust_scaler_col_0_": [-1.0, 0], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df, schema=expected_schema ) - expected_df["col_0"] = expected_df["col_0"].astype(pd.Int32Dtype()) # Arrange - def tf_fun(a) -> int: - return a + 1 - - td = self._create_training_dataset(tf_fun, "int") + tf_fun_source = ( + "import pandas as pd\nfrom hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics\n" + "from hsfs.hopsworks_udf import hopsworks_udf\n" + "@hopsworks_udf(float)\ndef robust_scaler(feature : pd.Series, statistics_feature : FeatureDescriptiveStatistics) -> pd.Series:\n" + " return (feature - statistics_feature.percentiles[49])/(statistics_feature.percentiles[74]-statistics_feature.percentiles[24])\n" + ) + udf_response = { + "sourceCode": tf_fun_source, + "outputTypes": "double", + "transformationFeatures": "", + "name": "robust_scaler", + } + + tf_fun = HopsworksUdf.from_response_json(udf_response) + + td = self._create_training_dataset() + + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun("col_0"), featurestore_id=99 + ) + ] + percentiles = [1] * 100 + percentiles[24] = 1 + percentiles[49] = 2 + percentiles[74] = 2 + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", percentiles=percentiles) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_str(self, mocker): + def test_apply_builtin_robust_scaler(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.core.statistics_engine.StatisticsEngine._save_statistics") spark_engine = spark.Engine() schema = StructType( @@ -462,16 +474,16 @@ def test_apply_plus_one_str(self, mocker): expected_schema = StructType( [ - StructField("col_0", StringType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("robust_scaler_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": ["2", "3"], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "robust_scaler_col_0_": [-1.0, 0], } ) expected_spark_df = 
spark_engine._spark_session.createDataFrame( @@ -479,16 +491,31 @@ def test_apply_plus_one_str(self, mocker): ) # Arrange - def tf_fun(a) -> int: - return a + 1 + from hsfs.builtin_transformations import robust_scaler + + td = self._create_training_dataset() - td = self._create_training_dataset(tf_fun, "string") + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=robust_scaler("col_0"), featurestore_id=99 + ) + ] + + percentiles = [1] * 100 + percentiles[24] = 1 + percentiles[49] = 2 + percentiles[74] = 2 + transformation_functions[0].hopsworks_udf.transformation_statistics = [ + FeatureDescriptiveStatistics(feature_name="col_0", percentiles=percentiles) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_double(self, mocker): + def test_apply_plus_one_int(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() @@ -507,96 +534,103 @@ def test_apply_plus_one_double(self, mocker): "col_2": [True, False], } ) + spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) expected_schema = StructType( [ - StructField("col_0", DoubleType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", LongType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [2.0, 3.0], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [2, 3], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df, schema=expected_schema ) - spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) # Arrange - def tf_fun(a) -> np.float64: - return a + 1.0 + @hopsworks_udf(int) + def tf_fun(col_0): + return col_0 + 1 + + td = self._create_training_dataset() - td = self._create_training_dataset(tf_fun, "double") + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_datetime_no_tz(self, mocker): + def test_apply_plus_one_str(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", StringType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1640995200, 1640995201], + "col_0": ["1", "2"], "col_1": ["test_1", "test_2"], "col_2": [True, False], } ) - spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) expected_schema = StructType( [ - StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", StringType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [ - datetime.datetime.utcfromtimestamp(1640995201), - datetime.datetime.utcfromtimestamp(1640995202), - ], "col_1": 
["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": ["11", "21"], } ) - # convert timestamps to current timezone - local_tz = tzlocal.get_localzone() - expected_df_localized = expected_df.copy(True) - expected_df_localized["col_0"] = expected_df_localized["col_0"].dt.tz_localize( - str(local_tz) - ) expected_spark_df = spark_engine._spark_session.createDataFrame( - expected_df_localized, schema=expected_schema + expected_df, schema=expected_schema ) # Arrange - def tf_fun(a) -> datetime.datetime: - return datetime.datetime.utcfromtimestamp(a + 1) - - td = self._create_training_dataset(tf_fun, "datetime") + @hopsworks_udf(str) + def tf_fun(col_0): + return col_0 + "1" + + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_datetime_tz_utc(self, mocker): + def test_apply_plus_one_double(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() @@ -610,127 +644,143 @@ def test_apply_plus_one_datetime_tz_utc(self, mocker): ) df = pd.DataFrame( data={ - "col_0": [1640995200, 1640995201], + "col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [True, False], } ) - spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) expected_schema = StructType( [ - StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", DoubleType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": [ - datetime.datetime.utcfromtimestamp(1640995201), - datetime.datetime.utcfromtimestamp(1640995202), - ], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [2.0, 3.0], } ) - # convert timestamps to current timezone - local_tz = tzlocal.get_localzone() - expected_df_localized = expected_df.copy(True) - expected_df_localized["col_0"] = expected_df_localized["col_0"].dt.tz_localize( - str(local_tz) - ) expected_spark_df = spark_engine._spark_session.createDataFrame( - expected_df_localized, schema=expected_schema + expected_df, schema=expected_schema ) + spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) # Arrange - def tf_fun(a) -> datetime.datetime: - return datetime.datetime.utcfromtimestamp(a + 1).replace( - tzinfo=datetime.timezone.utc + @hopsworks_udf(float) + def tf_fun(col_0): + return col_0 + 1.0 + + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 ) - - td = self._create_training_dataset(tf_fun, "datetime") + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_datetime_tz_pst(self, mocker): + def test_apply_plus_one_datetime_no_tz(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - 
StructField("col_0", IntegerType(), True), + StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1640995200, 1640995201], + "col_0": [ + datetime.datetime.utcfromtimestamp(1640995200), + datetime.datetime.utcfromtimestamp(1640995201), + ], "col_1": ["test_1", "test_2"], "col_2": [True, False], } ) + spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) expected_schema = StructType( [ - StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", TimestampType(), True), ] ) - expected_df = pd.DataFrame( data={ - "col_0": [ - datetime.datetime.utcfromtimestamp(1641024001), - datetime.datetime.utcfromtimestamp(1641024002), - ], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [ + datetime.datetime.utcfromtimestamp(1640995200) + + datetime.timedelta(milliseconds=1), + datetime.datetime.utcfromtimestamp(1640995201) + + datetime.timedelta(milliseconds=1), + ], } ) # convert timestamps to current timezone local_tz = tzlocal.get_localzone() expected_df_localized = expected_df.copy(True) - expected_df_localized["col_0"] = expected_df_localized["col_0"].dt.tz_localize( - str(local_tz) - ) + expected_df_localized["tf_fun_col_0_"] = expected_df_localized[ + "tf_fun_col_0_" + ].dt.tz_localize(str(local_tz)) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df_localized, schema=expected_schema ) # Arrange - def tf_fun(a) -> datetime.datetime: - pdt = pytz.timezone("US/Pacific") - return pdt.localize(datetime.datetime.utcfromtimestamp(a + 1)) + @hopsworks_udf(datetime.datetime) + def tf_fun(col_0): + import datetime + + return col_0 + datetime.timedelta(milliseconds=1) - td = self._create_training_dataset(tf_fun, "datetime") + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine( + td, df, expected_df_localized, transformation_functions + ) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_datetime_ts_none(self, mocker): + def test_apply_plus_one_datetime_tz_utc(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1640995200, 1640995201], + "col_0": [ + datetime.datetime.utcfromtimestamp(1640995200), + datetime.datetime.utcfromtimestamp(1640995201), + ], "col_1": ["test_1", "test_2"], "col_2": [True, False], } @@ -739,59 +789,75 @@ def test_apply_plus_one_datetime_ts_none(self, mocker): expected_schema = StructType( [ - StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", TimestampType(), True), ] ) - expected_df = pd.DataFrame( data={ - "col_0": [ - None, - datetime.datetime.utcfromtimestamp(1640995202), - ], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [ + 
datetime.datetime.utcfromtimestamp(1640995200) + + datetime.timedelta(milliseconds=1), + datetime.datetime.utcfromtimestamp(1640995201) + + datetime.timedelta(milliseconds=1), + ], } ) # convert timestamps to current timezone local_tz = tzlocal.get_localzone() expected_df_localized = expected_df.copy(True) - expected_df_localized["col_0"] = expected_df_localized["col_0"].dt.tz_localize( - str(local_tz) - ) + expected_df_localized["tf_fun_col_0_"] = expected_df_localized[ + "tf_fun_col_0_" + ].dt.tz_localize(str(local_tz)) expected_spark_df = spark_engine._spark_session.createDataFrame( expected_df_localized, schema=expected_schema ) # Arrange - def tf_fun(a) -> datetime.datetime: - return ( - None if a == 1640995200 else datetime.datetime.utcfromtimestamp(a + 1) + @hopsworks_udf(datetime.datetime) + def tf_fun(col_0) -> datetime.datetime: + import datetime + + return (col_0 + datetime.timedelta(milliseconds=1)).dt.tz_localize( + datetime.timezone.utc ) - td = self._create_training_dataset(tf_fun, "datetime") + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine( + td, df, expected_df_localized, transformation_functions + ) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_date(self, mocker): + def test_apply_plus_one_datetime_tz_pst(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1641045600, 1641132000], + "col_0": [ + datetime.datetime.utcfromtimestamp(1640995200), + datetime.datetime.utcfromtimestamp(1640995201), + ], "col_1": ["test_1", "test_2"], "col_2": [True, False], } @@ -800,50 +866,77 @@ def test_apply_plus_one_date(self, mocker): expected_schema = StructType( [ - StructField("col_0", DateType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", TimestampType(), True), ] ) + expected_df = pd.DataFrame( data={ - "col_0": [ - datetime.datetime.utcfromtimestamp(1641045601).date(), - datetime.datetime.utcfromtimestamp(1641132001).date(), - ], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [ + datetime.datetime.utcfromtimestamp(1640995200) + + datetime.timedelta(milliseconds=1), + datetime.datetime.utcfromtimestamp(1640995201) + + datetime.timedelta(milliseconds=1), + ], } ) + # convert timestamps to current timezone + local_tz = tzlocal.get_localzone() + expected_df_localized = expected_df.copy(True) + expected_df_localized["tf_fun_col_0_"] = expected_df_localized[ + "tf_fun_col_0_" + ].dt.tz_localize(str(local_tz)) expected_spark_df = spark_engine._spark_session.createDataFrame( - expected_df, schema=expected_schema + expected_df_localized, schema=expected_schema ) # Arrange - def tf_fun(a) -> datetime.datetime: - return datetime.datetime.utcfromtimestamp(a + 1) + @hopsworks_udf(datetime.datetime) + def tf_fun(col_0) -> datetime.datetime: + import datetime + + import pytz - td = self._create_training_dataset(tf_fun, "date") + pdt = 
pytz.timezone("US/Pacific") + return (col_0 + datetime.timedelta(milliseconds=1)).dt.tz_localize(pdt) + + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine( + td, df, expected_df_localized, transformation_functions + ) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_no_type(self, mocker): + def test_apply_plus_one_datetime_ts_none(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", TimestampType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1, 2], + "col_0": [ + datetime.datetime.utcfromtimestamp(1640995200), + datetime.datetime.utcfromtimestamp(1640995201), + ], "col_1": ["test_1", "test_2"], "col_2": [True, False], } @@ -852,47 +945,78 @@ def test_apply_plus_one_no_type(self, mocker): expected_schema = StructType( [ - StructField("col_0", StringType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", TimestampType(), True), ] ) + expected_df = pd.DataFrame( data={ - "col_0": ["2", "3"], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [ + None, + datetime.datetime.utcfromtimestamp(1640995201) + + datetime.timedelta(milliseconds=1), + ], } ) + # convert timestamps to current timezone + local_tz = tzlocal.get_localzone() + expected_df_localized = expected_df.copy(True) + expected_df_localized["tf_fun_col_0_"] = expected_df_localized[ + "tf_fun_col_0_" + ].dt.tz_localize(str(local_tz)) expected_spark_df = spark_engine._spark_session.createDataFrame( - expected_df, schema=expected_schema + expected_df_localized, schema=expected_schema ) # Arrange - def tf_fun(a) -> int: - return a + 1 + @hopsworks_udf(datetime.datetime) + def tf_fun(col_0) -> datetime.datetime: + import datetime + + return pd.Series( + None + if data == datetime.datetime.utcfromtimestamp(1640995200) + else data + datetime.timedelta(milliseconds=1) + for data in col_0 + ) - td = self._create_training_dataset(tf_fun) + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine( + td, df, expected_df_localized, transformation_functions + ) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_empty_type(self, mocker): + def test_apply_plus_one_date(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") spark_engine = spark.Engine() schema = StructType( [ - StructField("col_0", IntegerType(), True), + StructField("col_0", DateType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), ] ) df = pd.DataFrame( data={ - "col_0": [1, 2], + "col_0": [ + datetime.datetime.utcfromtimestamp(1641045600).date(), + datetime.datetime.utcfromtimestamp(1641132000).date(), + ], "col_1": 
["test_1", "test_2"], "col_2": [True, False], } @@ -901,16 +1025,21 @@ def test_apply_plus_one_empty_type(self, mocker): expected_schema = StructType( [ - StructField("col_0", StringType(), True), StructField("col_1", StringType(), True), StructField("col_2", BooleanType(), True), + StructField("tf_fun_col_0_", DateType(), True), ] ) expected_df = pd.DataFrame( data={ - "col_0": ["2", "3"], "col_1": ["test_1", "test_2"], "col_2": [True, False], + "tf_fun_col_0_": [ + datetime.datetime.utcfromtimestamp(1641045600).date() + + datetime.timedelta(days=1), + datetime.datetime.utcfromtimestamp(1641132000).date() + + datetime.timedelta(days=1), + ], } ) expected_spark_df = spark_engine._spark_session.createDataFrame( @@ -918,26 +1047,37 @@ def test_apply_plus_one_empty_type(self, mocker): ) # Arrange - def tf_fun(a) -> int: - return a + 1 + @hopsworks_udf(datetime.date) + def tf_fun(col_0): + import datetime - td = self._create_training_dataset(tf_fun, "") + return col_0 + datetime.timedelta(days=1) + + td = self._create_training_dataset() + transformation_functions = [ + transformation_function.TransformationFunction( + hopsworks_udf=tf_fun, featurestore_id=99 + ) + ] # Assert - self._validate_on_python_engine(td, df, expected_df) - self._validate_on_spark_engine(td, spark_df, expected_spark_df) + self._validate_on_python_engine(td, df, expected_df, transformation_functions) + self._validate_on_spark_engine( + td, spark_df, expected_spark_df, transformation_functions + ) - def test_apply_plus_one_date_not_supported_type(self, mocker): + def test_apply_plus_one_invalid_type(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") # Arrange - def tf_fun(a) -> int: - return a + 1 + with pytest.raises(FeatureStoreException) as e_info: - # Act - with pytest.raises(TypeError) as e_info: - self._create_training_dataset(tf_fun, list) + @hopsworks_udf(list) + def tf_fun(a): + return a + 1 - # Assert - assert str(e_info.value) == "Not supported type ." + assert ( + str(e_info.value) + == f"Output type {list} is not supported. Please refer to the documentation to get more information on the supported types." 
+ ) diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index 92601b46da..e515e0d0df 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -684,9 +684,30 @@ "id": 11, "version": 1, "description": "test_description", - "transformation_functions": { - "featurestore_id": 5 - }, + "transformation_functions": [ + { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "name": "add_mean_fs", + "outputTypes":"double", + "transformationFeatures":"data" + } + }, + { + "id" : 2, + "version": 1, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":"double", + "transformationFeatures":"col1" + } + } + ], "features": [ { "name": "intt", diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 5b8e753508..169d779bd6 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ b/python/tests/fixtures/transformation_function_fixtures.json @@ -80,6 +80,24 @@ ] } }, + "get_list_one_argument": { + "response": { + "count": 1, + "items": [ + { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "name": "add_mean_fs", + "outputTypes":"double", + "transformationFeatures":"data" + } + } + ] + } + }, "get_list_empty": { "response": { "count": 0, diff --git a/python/tests/test_transformation_function.py b/python/tests/test_transformation_function.py index 5fdea2987f..b54fbdbe6b 100644 --- a/python/tests/test_transformation_function.py +++ b/python/tests/test_transformation_function.py @@ -171,6 +171,30 @@ def test_from_response_json_list(self, backend_fixtures): == "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" ) + def test_from_response_json_list_one_argument(self, backend_fixtures): + # Arrange + json = backend_fixtures["transformation_function"]["get_list_one_argument"][ + "response" + ] + + # Act + tf = TransformationFunction.from_response_json(json) + + # Assert + assert not isinstance(tf, list) + assert tf.id == 1 + assert tf._featurestore_id == 11 + assert tf.version == 2 + assert tf.hopsworks_udf.function_name == "add_mean_fs" + assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.statistics_required + assert tf.hopsworks_udf.transformation_features == ["data"] + assert tf.hopsworks_udf.statistics_features == ["data"] + assert ( + tf.hopsworks_udf._function_source + == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" + ) + def test_transformation_function_definition_no_hopworks_udf(self): def test(col1): return col1 + 1 From 3fc94f883ced13eb223ad079252b7460630a0ac1 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 13 May 2024 13:30:53 +0200 Subject: [PATCH 20/58] removed print --- python/hsfs/hopsworks_udf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 9ed60ead0d..912c7e1456 100644 --- a/python/hsfs/hopsworks_udf.py +++ 
b/python/hsfs/hopsworks_udf.py
@@ -432,7 +432,6 @@ def renaming_wrapper(*args):
             df = convert_timezone(df)
     return df"""
             )
-        print(code)
         # injecting variables into scope used to execute wrapper function.
         scope = __import__("__main__").__dict__
         if self.transformation_statistics is not None:

From 2bf5f2059ff51992d60fc96f731b8b9a8619e8fe Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Mon, 13 May 2024 14:06:47 +0200
Subject: [PATCH 21/58] adding test for hopsworks_udf

---
 python/hsfs/hopsworks_udf.py                  |   3 +-
 python/pyproject.toml                         |   3 +-
 python/tests/pyproject.toml                   |   4 +-
 python/tests/test_helpers/__init__.py         |   0
 .../transformation_test_helper.py             |  92 ++++
 python/tests/test_hopswork_udf.py             | 503 ++++++++++++++++++
 6 files changed, 599 insertions(+), 6 deletions(-)
 create mode 100644 python/tests/test_helpers/__init__.py
 create mode 100644 python/tests/test_helpers/transformation_test_helper.py
 create mode 100644 python/tests/test_hopswork_udf.py

diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py
index 912c7e1456..9b3b332812 100644
--- a/python/hsfs/hopsworks_udf.py
+++ b/python/hsfs/hopsworks_udf.py
@@ -278,6 +278,7 @@ def _parse_function_signature(source_code: str) -> Tuple[List[str], str, int, int]:
             ]
         )
         arg_list = signature.split("(")[1].split(")")[0].split(",")
+        arg_list = [arg for arg in arg_list if not arg.strip() == ""]
         return arg_list, signature, signature_start_line, signature_end_line
 
     @staticmethod
@@ -293,7 +294,7 @@ def _extract_function_arguments(source_code: str) -> List[TransformationFeature]:
         # Get source code of the original function
         arg_list, _, _, _ = HopsworksUdf._parse_function_signature(source_code)
 
-        if arg_list == [""]:
+        if arg_list == []:
             raise FeatureStoreException(
                 "No arguments present in the provided user defined function. Please provide at least one argument in the defined user defined function."
             )
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 1ad6c8c5f4..77fe01a61f 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -142,8 +142,7 @@ exclude = [
     "node_modules",
     "site-packages",
     "venv",
-    "java",
-    "python/tests/transformations_test_helper/" # transformations_test_helper excluded from fomating and linting because the used formating is required for the test cases
+    "java"
 ]
 
 # Same as Black.
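The arg_list filter added above guards the zero-argument case: splitting an
empty parameter list yields [""], not []. A quick illustration of why the
guard below it has to compare against the empty list:

    signature = "def test_function():"
    args = signature.split("(")[1].split(")")[0].split(",")  # -> [""]
    args = [arg for arg in args if not arg.strip() == ""]    # -> []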
diff --git a/python/tests/pyproject.toml b/python/tests/pyproject.toml
index 3d36a4588e..050735f853 100644
--- a/python/tests/pyproject.toml
+++ b/python/tests/pyproject.toml
@@ -8,8 +8,6 @@ ignore = [
 # Allow fix for all enabled rules (when `--fix`) is provided.
 fixable = ["ALL"]
 unfixable = []
-# transformations_test_helper excluded from fomating and linting because the used formating is required for the test cases
-exclude = ["transformations_test_helper/"]
 
 # Allow unused variables when underscore-prefixed.
 dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
@@ -36,4 +34,4 @@ line-ending = "auto"
 pythonpath = [
   ".", "tests"
 ]
-addopts = "--ignore=python/tests/transformations_test_helper/"
+addopts = "--ignore=python/tests/test_helpers/"
diff --git a/python/tests/test_helpers/__init__.py b/python/tests/test_helpers/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/python/tests/test_helpers/transformation_test_helper.py b/python/tests/test_helpers/transformation_test_helper.py
new file mode 100644
index 0000000000..8b81c48fde
--- /dev/null
+++ b/python/tests/test_helpers/transformation_test_helper.py
@@ -0,0 +1,92 @@
+import pandas as pd
+from hsfs.statistics import FeatureDescriptiveStatistics
+
+
+def test_function():
+    return True
+
+
+def test_function_one_argument(arg1):
+    pass
+
+
+def test_function_one_argument_with_statistics(arg1, statistics_arg1):
+    pass
+
+
+def test_function_one_argument_with_typehints(arg1: pd.Series):
+    pass
+
+
+def test_function_one_argument_with_statistics_and_typehints(
+    arg1: pd.Series, statistics_arg1: FeatureDescriptiveStatistics
+):
+    pass
+
+
+def test_function_multiple_argument(arg1, arg2):
+    pass
+
+
+def test_function_multiple_argument_with_statistics(
+    arg1, arg2, arg3, statistics_arg1, statistics_arg3
+):
+    pass
+
+
+def test_function_multiple_argument_with_typehints(arg1: pd.Series, arg2: pd.Series):
+    pass
+
+
+def test_function_multiple_argument_with_statistics_and_typehints(
+    arg1: pd.Series,
+    arg2: pd.Series,
+    statistics_arg1: FeatureDescriptiveStatistics,
+    statistics_arg2: FeatureDescriptiveStatistics,
+):
+    pass
+
+
+def test_function_multiple_argument_with_mixed_statistics_and_typehints(
+    arg1: pd.Series,
+    arg2,
+    arg3,
+    statistics_arg1,
+    statistics_arg3: FeatureDescriptiveStatistics,
+):
+    pass
+
+
+def test_function_multiple_argument_all_parameter_with_spaces(
+    arg1: pd.Series,
+    arg2,
+    statistics_arg1,
+    statistics_arg2: FeatureDescriptiveStatistics,
+):
+    pass
+
+
+def test_function_multiple_argument_all_parameter_multiline(
+    arg1: pd.Series,
+    arg2,
+    statistics_arg1,
+    arg3,
+    statistics_arg3: FeatureDescriptiveStatistics,
+):
+    pass
+
+
+def test_function_multiple_argument_all_parameter_multiline_with_comments(
+    arg1: pd.Series,  # Test Comment
+    arg2,
+    statistics_arg1,  # Test Comment
+    arg3,
+    statistics_arg3: FeatureDescriptiveStatistics,
+) -> pd.DataFrame:  # Test Comment
+    pass
+
+
+def test_function_statistics_invalid(
+    arg1: pd.Series, statistics_arg3: FeatureDescriptiveStatistics
+):
+    pass
diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py
new file mode 100644
index 0000000000..04dab45309
--- /dev/null
+++ b/python/tests/test_hopswork_udf.py
@@ -0,0 +1,503 @@
+#
+# Copyright 2024 Hopsworks AB
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +from datetime import date, datetime, time + +import pandas as pd +import pytest +from hsfs.client.exceptions import FeatureStoreException +from hsfs.hopsworks_udf import HopsworksUdf, TransformationFeature, hopsworks_udf + + +class TestHopsworksUdf: + def test_validate_and_convert_output_types_one_elements(self): + assert HopsworksUdf._validate_and_convert_output_types([int]) == ["bigint"] + + assert HopsworksUdf._validate_and_convert_output_types([float]) == ["double"] + + assert HopsworksUdf._validate_and_convert_output_types([str]) == ["string"] + + assert HopsworksUdf._validate_and_convert_output_types([bool]) == ["boolean"] + + assert HopsworksUdf._validate_and_convert_output_types([datetime]) == [ + "timestamp" + ] + + assert HopsworksUdf._validate_and_convert_output_types([time]) == ["timestamp"] + + assert HopsworksUdf._validate_and_convert_output_types([date]) == ["date"] + + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([pd.DatetimeTZDtype]) + + assert ( + str(exception.value) + == f"Output type {pd.DatetimeTZDtype} is not supported. Please refer to the documentation to get more information on the supported types." + ) + + def test_validate_and_convert_output_types_multiple_types(self): + assert HopsworksUdf._validate_and_convert_output_types( + [int, float, str, bool, datetime, date, time] + ) == ["bigint", "double", "string", "boolean", "timestamp", "date", "timestamp"] + + assert HopsworksUdf._validate_and_convert_output_types( + ["bigint", "double", "string", "boolean", "timestamp", "date"] + ) == ["bigint", "double", "string", "boolean", "timestamp", "date"] + + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([pd.DatetimeTZDtype]) + + assert ( + str(exception.value) + == f"Output type {pd.DatetimeTZDtype} is not supported. Please refer to the documentation to get more information on the supported types." + ) + + def test_validate_and_convert_output_types_invalid_types(self): + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([pd.DatetimeTZDtype]) + + assert ( + str(exception.value) + == f"Output type {pd.DatetimeTZDtype} is not supported. Please refer to the documentation to get more information on the supported types." + ) + + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([int, pd.DatetimeTZDtype]) + + assert ( + str(exception.value) + == f"Output type {pd.DatetimeTZDtype} is not supported. Please refer to the documentation to get more information on the supported types." + ) + + with pytest.raises(FeatureStoreException) as exception: + HopsworksUdf._validate_and_convert_output_types([int, "pd.DatetimeTZDtype"]) + + assert ( + str(exception.value) + == "Output type pd.DatetimeTZDtype is not supported. Please refer to the documentation to get more information on the supported types." 
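These assertions pin down the Python-to-SQL output type mapping that
_validate_and_convert_output_types applies; summarized as a sketch (anything
outside this table, such as pd.DatetimeTZDtype, raises FeatureStoreException):

    from datetime import date, datetime, time

    TYPE_MAPPING = {
        int: "bigint", float: "double", str: "string", bool: "boolean",
        datetime: "timestamp", time: "timestamp", date: "date",
    }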
+ ) + + def test_get_module_imports(self): + assert HopsworksUdf._get_module_imports( + "python/tests/test_helpers/transformation_test_helper.py" + ) == [ + "import pandas as pd", + "from hsfs.statistics import FeatureDescriptiveStatistics", + ] + + def test_extract_source_code(self): + from test_helpers.transformation_test_helper import test_function + + assert """import pandas as pd +from hsfs.statistics import FeatureDescriptiveStatistics +def test_function(): + return True""" == HopsworksUdf._extract_source_code(test_function).strip() + + def test_extract_function_arguments_no_arguments(self): + from test_helpers.transformation_test_helper import test_function + + with pytest.raises(FeatureStoreException) as exception: + function_source = HopsworksUdf._extract_source_code(test_function) + HopsworksUdf._extract_function_arguments(function_source) + + assert ( + str(exception.value) + == "No arguments present in the provided user defined function. Please provide at least one argument in the defined user defined function." + ) + + def test_extract_function_arguments_one_argument(self): + from test_helpers.transformation_test_helper import test_function_one_argument + + function_source = HopsworksUdf._extract_source_code(test_function_one_argument) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name=None) + ] + + def test_extract_function_arguments_one_argument_with_statistics(self): + from test_helpers.transformation_test_helper import ( + test_function_one_argument_with_statistics, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_one_argument_with_statistics + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature( + feature_name="arg1", statistic_argument_name="statistics_arg1" + ) + ] + + def test_extract_function_arguments_one_argument_with_typehint(self): + from test_helpers.transformation_test_helper import ( + test_function_one_argument_with_typehints, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_one_argument_with_typehints + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name=None) + ] + + def test_extract_function_arguments_one_argument_with_statistics_and_typehints( + self, + ): + from test_helpers.transformation_test_helper import ( + test_function_one_argument_with_statistics_and_typehints, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_one_argument_with_statistics_and_typehints + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature( + feature_name="arg1", statistic_argument_name="statistics_arg1" + ) + ] + + def test_extract_function_arguments_multiple_argument(self): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_multiple_argument + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name=None), + TransformationFeature(feature_name="arg2", statistic_argument_name=None), + ] + + def 
test_extract_function_arguments_multiple_argument_with_statistics(self): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_with_statistics, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_multiple_argument_with_statistics + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature( + feature_name="arg1", statistic_argument_name="statistics_arg1" + ), + TransformationFeature(feature_name="arg2", statistic_argument_name=None), + TransformationFeature( + feature_name="arg3", statistic_argument_name="statistics_arg3" + ), + ] + + def test_extract_function_arguments_multiple_argument_with_typehints(self): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_with_typehints, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_multiple_argument_with_typehints + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature(feature_name="arg1", statistic_argument_name=None), + TransformationFeature(feature_name="arg2", statistic_argument_name=None), + ] + + def test_extract_function_arguments_multiple_argument_with_statistics_and_typehints( + self, + ): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_with_statistics_and_typehints, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_multiple_argument_with_statistics_and_typehints + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature( + feature_name="arg1", statistic_argument_name="statistics_arg1" + ), + TransformationFeature( + feature_name="arg2", statistic_argument_name="statistics_arg2" + ), + ] + + def test_extract_function_arguments_multiple_argument_with_mixed_statistics_and_typehints( + self, + ): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_with_mixed_statistics_and_typehints, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_multiple_argument_with_mixed_statistics_and_typehints + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature( + feature_name="arg1", statistic_argument_name="statistics_arg1" + ), + TransformationFeature(feature_name="arg2", statistic_argument_name=None), + TransformationFeature( + feature_name="arg3", statistic_argument_name="statistics_arg3" + ), + ] + + def test_extract_function_arguments_multiple_argument_all_parameter_with_spaces( + self, + ): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_all_parameter_with_spaces, + ) + + function_source = HopsworksUdf._extract_source_code( + test_function_multiple_argument_all_parameter_with_spaces + ) + function_argument = HopsworksUdf._extract_function_arguments(function_source) + + assert function_argument == [ + TransformationFeature( + feature_name="arg1", statistic_argument_name="statistics_arg1" + ), + TransformationFeature( + feature_name="arg2", statistic_argument_name="statistics_arg2" + ), + ] + + def test_extract_function_arguments_multiple_argument_all_parameter_multiline(self): + from test_helpers.transformation_test_helper import ( + test_function_multiple_argument_all_parameter_multiline, + ) + + function_source = 
HopsworksUdf._extract_source_code(
+            test_function_multiple_argument_all_parameter_multiline
+        )
+        function_argument = HopsworksUdf._extract_function_arguments(function_source)
+
+        assert function_argument == [
+            TransformationFeature(
+                feature_name="arg1", statistic_argument_name="statistics_arg1"
+            ),
+            TransformationFeature(feature_name="arg2", statistic_argument_name=None),
+            TransformationFeature(
+                feature_name="arg3", statistic_argument_name="statistics_arg3"
+            ),
+        ]
+
+    def test_extract_function_arguments_multiple_argument_all_parameter_multiline_with_comments(
+        self,
+    ):
+        from test_helpers.transformation_test_helper import (
+            test_function_multiple_argument_all_parameter_multiline_with_comments,
+        )
+
+        function_source = HopsworksUdf._extract_source_code(
+            test_function_multiple_argument_all_parameter_multiline_with_comments
+        )
+        function_argument = HopsworksUdf._extract_function_arguments(function_source)
+
+        assert function_argument == [
+            TransformationFeature(
+                feature_name="arg1", statistic_argument_name="statistics_arg1"
+            ),
+            TransformationFeature(feature_name="arg2", statistic_argument_name=None),
+            TransformationFeature(
+                feature_name="arg3", statistic_argument_name="statistics_arg3"
+            ),
+        ]
+
+    def test_extract_function_arguments_statistics_invalid(self):
+        from test_helpers.transformation_test_helper import (
+            test_function_statistics_invalid,
+        )
+
+        with pytest.raises(FeatureStoreException) as exception:
+            function_source = HopsworksUdf._extract_source_code(
+                test_function_statistics_invalid
+            )
+            HopsworksUdf._extract_function_arguments(function_source)
+
+        assert (
+            str(exception.value)
+            == "No argument corresponding to statistics parameter 'statistics_arg3' present in function definition."
+        )
+
+    def test_format_source_code(self):
+        from test_helpers.transformation_test_helper import (
+            test_function_multiple_argument_all_parameter_multiline_with_comments,
+        )
+
+        function_source = HopsworksUdf._extract_source_code(
+            test_function_multiple_argument_all_parameter_multiline_with_comments
+        )
+        function_argument = HopsworksUdf._extract_function_arguments(function_source)
+        formatted_source, module_imports = HopsworksUdf._format_source_code(
+            function_source, function_argument
+        )
+        assert (
+            formatted_source.strip()
+            == """def test_function_multiple_argument_all_parameter_multiline_with_comments(arg1, arg2, arg3):
+\t pass"""
+        )
+
+    def test_generate_output_column_names_one_argument_one_output_type(self):
+        @hopsworks_udf(int)
+        def test_func(col1):
+            return col1 + 1
+
+        assert test_func._get_output_column_names() == ["test_func_col1_"]
+
+    def test_generate_output_column_names_multiple_argument_one_output_type(self):
+        @hopsworks_udf(int)
+        def test_func(col1, col2, col3):
+            return col1 + 1
+
+        assert test_func._get_output_column_names() == ["test_func_col1-col2-col3_"]
+
+    def test_generate_output_column_names_single_argument_multiple_output_type(self):
+        @hopsworks_udf([int, float, int])
+        def test_func(col1):
+            return pd.DataFrame(
+                {"col1": [col1 + 1], "col2": [col1 + 1], "col3": [col1 + 1]}
+            )
+
+        assert test_func._get_output_column_names() == [
+            "test_func_col1_0",
+            "test_func_col1_1",
+            "test_func_col1_2",
+        ]
+
+    def test_generate_output_column_names_multiple_argument_multiple_output_type(self):
+        @hopsworks_udf([int, float, int])
+        def test_func(col1, col2, col3):
+            return pd.DataFrame(
+                {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]}
+            )
+
+        assert test_func._get_output_column_names() == [
+            "test_func_col1-col2-col3_0",
+            "test_func_col1-col2-col3_1",
+            "test_func_col1-col2-col3_2",
+        ]
+
+    def test_create_pandas_udf_return_schema_from_list_one_output_type(self):
+        @hopsworks_udf(int)
+        def test_func(col1):
+            return col1 + 1
+
+        assert test_func._create_pandas_udf_return_schema_from_list() == "bigint"
+
+    def test_create_pandas_udf_return_schema_from_list_one_argument_multiple_output_type(
+        self,
+    ):
+        @hopsworks_udf([int, float, str, date, datetime, time, bool])
+        def test_func(col1):
+            return pd.DataFrame(
+                {
+                    "col1": [col1 + 1],
+                    "col2": [col1 + 1],
+                    "col3": [col1 + 1],
+                    "col4": [col1 + 1],
+                    "col5": [col1 + 1],
+                    "col6": [col1 + 1],
+                    "col7": [True],
+                }
+            )
+
+        assert (
+            test_func._create_pandas_udf_return_schema_from_list()
+            == "`test_func_col1_0` bigint, `test_func_col1_1` double, `test_func_col1_2` string, `test_func_col1_3` date, `test_func_col1_4` timestamp, `test_func_col1_5` timestamp, `test_func_col1_6` boolean"
+        )
+
+    def test_hopsworks_wrapper_single_output(self):
+        @hopsworks_udf(int)
+        def test_func(col1):
+            return col1 + 1
+
+        renaming_wrapper_function = test_func.hopsworksUdf_wrapper()
+
+        test_dataframe = pd.DataFrame({"col1": [1, 2, 3, 4]})
+
+        result = renaming_wrapper_function(test_dataframe["col1"])
+
+        assert result.name == "test_func_col1_"
+        assert result.values.tolist() == [2, 3, 4, 5]
+
+    def test_hopsworks_wrapper_multiple_output(self):
+        @hopsworks_udf([int, float])
+        def test_func(col1, col2):
+            return pd.DataFrame({"out1": col1 + 1, "out2": col2 + 2})
+
+        renaming_wrapper_function = test_func.hopsworksUdf_wrapper()
+
+        test_dataframe = pd.DataFrame(
+            {"column1": [1, 2, 3, 4], "column2": [10, 20, 30, 40]}
+        )
+
+        result = renaming_wrapper_function(
+            test_dataframe["column1"], test_dataframe["column2"]
+        )
+
+        assert all(result.columns == ["test_func_col1-col2_0", "test_func_col1-col2_1"])
+        assert result.values.tolist() == [[2, 12], [3, 22], [4, 32], [5, 42]]
+
+    def test_hopsworks_udf_call_one_argument(self):
+        @hopsworks_udf(int)
+        def test_func(col1):
+            return col1 + 1
+
+        assert test_func.transformation_features == ["col1"]
+        assert test_func.statistics_features == []
+
+        assert test_func("new_feature").transformation_features == ["new_feature"]
+        assert test_func("new_feature").statistics_features == []
+
+    def test_hopsworks_udf_call_one_argument_statistics(self):
+        @hopsworks_udf(int)
+        def test_func(col1, statistics_col1):
+            return col1 + statistics_col1
+
+        assert test_func.transformation_features == ["col1"]
+        assert test_func.statistics_features == ["col1"]
+
+        assert test_func("new_feature").transformation_features == ["new_feature"]
+        assert test_func("new_feature").statistics_features == ["new_feature"]
+
+    def test_hopsworks_udf_call_multiple_argument_statistics(self):
+        @hopsworks_udf(int)
+        def test_func(col1, statistics_col1, col2, col3, statistics_col3):
+            return col1 + statistics_col1
+
+        assert test_func.transformation_features == ["col1", "col2", "col3"]
+        assert test_func.statistics_features == ["col1", "col3"]
+
+        assert test_func("f1", "f2", "f3").transformation_features == ["f1", "f2", "f3"]
+        assert test_func("f1", "f2", "f3").statistics_features == ["f1", "f3"]
From 594640cfb5d6f70fb1a2111232a1b16f101fe8fd Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Mon, 13 May 2024 15:00:31 +0200
Subject: [PATCH 22/58] correcting merge for vector server

---
 .../core/transformation_function_engine.py    |  15 --
 python/hsfs/core/vector_server.py             |  20 +-
 .../tests/fixtures/feature_view_fixtures.json | 228 
++++++++++++++++++ 3 files changed, 233 insertions(+), 30 deletions(-) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 2396cb1a03..0384e05ac9 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -29,21 +29,6 @@ from hsfs.statistics import Statistics from hsfs.transformation_function import TransformationFunction -from hsfs import ( - feature_view, - statistics, - training_dataset, - training_dataset_feature, - transformation_function_attached, - util, -) -from hsfs.core import ( - feature_view_api, - statistics_api, - statistics_engine, - transformation_function_api, -) -from hsfs.core.builtin_transformation_function import BuiltInTransformationFunction class TransformationFunctionEngine: BUILTIN_FN_NAMES = [ diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index c6cd5959bd..1ef1df0854 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -38,19 +38,14 @@ from hsfs import ( training_dataset_feature as tdf_mod, ) -from hsfs import ( - transformation_function_attached as tfa_mod, -) from hsfs.client import exceptions, online_store_rest_client from hsfs.core import ( online_store_rest_client_engine, online_store_sql_engine, -) -from hsfs.core import ( transformation_function_engine as tf_engine_mod, + transformation_functions ) - HAS_FASTAVRO = False try: from fastavro import schemaless_reader @@ -104,7 +99,6 @@ def __init__( feat.name for feat in features if feat.inference_helper_column ] self._transformed_feature_vector_col_name: List[str] = None - self._skip_fg_ids = skip_fg_ids or set() self._serving_keys = serving_keys or [] self._required_serving_keys = [] @@ -112,9 +106,8 @@ def __init__( self._transformation_function_engine = ( tf_engine_mod.TransformationFunctionEngine(feature_store_id) ) - self._transformation_functions: Dict[ - str, tfa_mod.TransformationFunctionAttached - ] = {} + self._transformation_functions: List[transformation_functions.TransformationFunction] = [] + self._sql_client = None self._rest_client_engine = None @@ -301,7 +294,6 @@ def get_feature_vectors( """Assembles serving vector from online feature store.""" if passed_features is None: passed_features = [] - # Assertions on passed_features and vector_db_features assert ( passed_features is None @@ -573,7 +565,7 @@ def get_inference_helpers( return self.handle_feature_vector_return_type( batch_results, batch=True, inference_helper=True, return_type=return_type ) - + def which_client_and_ensure_initialised( self, force_rest_client: bool, force_sql_client: bool @@ -1005,9 +997,7 @@ def per_serving_key_features(self) -> Dict[str, set[str]]: @property def transformation_functions( self, - ) -> Dict[str, tfa_mod.TransformationFunctionAttached]: - if self._transformation_functions is None: - self._transformation_functions = {} + ) -> Optional[List[transformation_functions.TransformationFunction]]: return self._transformation_functions @property diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index e515e0d0df..da5c7766ed 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -762,5 +762,233 @@ } ] } + }, + "get_transformations": { + "response": { + "name": "test_name", + "query": { + "left_feature_group": { + "type": "cachedFeaturegroupDTO", + "validation_type": 
"test_validation_type", + "created": "2022-08-01T11:07:55Z", + "creator": { + "email": "admin@hopsworks.ai", + "firstName": "Admin", + "lastName": "Admin", + "maxNumProjects": 0, + "numActiveProjects": 0, + "numRemainingProjects": 0, + "status": 0, + "testUser": false, + "tos": false, + "toursState": 0, + "twoFactor": false + }, + "description": "test_description", + "featurestoreId": 67, + "featurestoreName": "test_featurestore", + "id": 15, + "location": "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1", + "name": "fg_test", + "statisticsConfig": { + "columns": [], + "correlations": false, + "enabled": true, + "exactUniqueness": false, + "histograms": false + }, + "version": 1, + "features": [ + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + }, + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": false, + "name": "stringt", + "onlineType": "varchar(1000)", + "partition": false, + "primary": false, + "type": "string" + } + ], + "onlineTopicName": "119_15_fg_test_1_onlinefs", + "onlineEnabled": true, + "timeTravelFormat": "HUDI" + }, + "left_features": ["intt"], + "feature_store_name": "test_feature_store_name", + "feature_store_id": 67, + "left_feature_group_start_time": "test_start_time", + "left_feature_group_end_time": "test_end_time", + "joins": [ + { + "query": { + "left_feature_group": { + "type": "cachedFeaturegroupDTO", + "validation_type": "test_validation_type", + "created": "2022-08-01T11:07:55Z", + "creator": { + "email": "admin@hopsworks.ai", + "firstName": "Admin", + "lastName": "Admin", + "maxNumProjects": 0, + "numActiveProjects": 0, + "numRemainingProjects": 0, + "status": 0, + "testUser": false, + "tos": false, + "toursState": 0, + "twoFactor": false + }, + "description": "test_description", + "featurestoreId": 67, + "featurestoreName": "test_featurestore", + "id": 15, + "location": "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1", + "name": "fg_test", + "statisticsConfig": { + "columns": [], + "correlations": false, + "enabled": true, + "exactUniqueness": false, + "histograms": false + }, + "version": 1, + "features": [ + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + }, + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": false, + "name": "stringt", + "onlineType": "varchar(1000)", + "partition": false, + "primary": false, + "type": "string" + } + ], + "onlineTopicName": "119_15_fg_test_1_onlinefs", + "onlineEnabled": true, + "timeTravelFormat": "HUDI" + }, + "left_features": ["intt"], + "feature_store_name": "test_feature_store_name", + "feature_store_id": 67, + "left_feature_group_start_time": "test_left_feature_group_start_time", + "left_feature_group_end_time": "test_left_feature_group_end_time", + "joins": [], + "filter": null + }, + "on": ["test_on"], + "left_on": ["test_left_on"], + "right_on": ["test_right_on"], + "join_type": "inner", + "prefix": "test_prefix" + } + ], + "filter": { + "condition": "test_condition", + "value": "test_value", + "feature": { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + } + } + }, + "featurestore_id": 5, + "id": 11, + "version": 1, + 
"description": "test_description", + "transformation_functions": [ + { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "name": "add_mean_fs", + "outputTypes":"double", + "transformationFeatures":"data" + } + }, + { + "id" : 2, + "version": 1, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":"double", + "transformationFeatures":"col1" + } + } + ], + "features": [ + { + "name": "intt", + "label": "t", + "featuregroup": { + "type": "featuregroupDTO", + "featurestoreId": 67, + "version": 1, + "name": "fg_test", + "id": 15, + "statisticsConfig": { + "enabled": true, + "histograms": false, + "correlations": false, + "exactUniqueness": false, + "columns": [] + }, + "onlineEnabled": false, + "deprecated": false + } + }, + { + "name": "stringt", + "featurestoreId": 67, + "featuregroup": { + "type": "featuregroupDTO", + "featurestoreId": 67, + "version": 1, + "name": "fg_test", + "id": 15, + "statisticsConfig": { + "enabled": true, + "histograms": false, + "correlations": false, + "exactUniqueness": false, + "columns": [] + }, + "onlineEnabled": false, + "deprecated": false + } + } + ] + } } } From f0e9540d6927a1e0381efbe8b4295c22a0db25dc Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 13 May 2024 15:03:25 +0200 Subject: [PATCH 23/58] reformatting with ruff --- python/hsfs/engine/python.py | 20 +++++++++++++++++--- python/hsfs/engine/spark.py | 12 ++++++++++-- python/hsfs/training_dataset_feature.py | 1 + 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 6d213f7778..03daa581df 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -30,7 +30,17 @@ from datetime import datetime, timezone from io import BytesIO from pathlib import Path -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Literal, + Optional, + Tuple, + Union, +) import avro import boto3 @@ -209,7 +219,6 @@ def _sql_offline( hive_config: Optional[Dict[str, Any]] = None, arrow_flight_config: Optional[Dict[str, Any]] = None, ) -> Union[pd.DataFrame, pl.DataFrame]: - self._validate_dataframe_type(dataframe_type) if isinstance(sql_query, dict) and "query_string" in sql_query: result_df = util.run_with_loading_animation( @@ -513,7 +522,12 @@ def show( sql_query, feature_store, online_conn, "default", read_options or {} ).head(n) - def read_vector_db(self, feature_group: "hsfs.feature_group.FeatureGroup", n: int =None, dataframe_type: str="default") -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[List[Any]]]: + def read_vector_db( + self, + feature_group: "hsfs.feature_group.FeatureGroup", + n: int = None, + dataframe_type: str = "default", + ) -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[List[Any]]]: dataframe_type = dataframe_type.lower() self._validate_dataframe_type(dataframe_type) diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index f1f6fcb69a..48bce2e351 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -23,13 +23,14 @@ import shutil import warnings from datetime import date, datetime, timezone -from typing import Any, List, Optional, TypeVar, Union, 
TYPE_CHECKING, Dict +from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union import avro import numpy as np import pandas as pd import tzlocal + if TYPE_CHECKING: from hsfs.constructor.query import Query from hsfs.feature_view import FeatureView @@ -158,7 +159,14 @@ def show(self, sql_query, feature_store, n, online_conn, read_options=None): sql_query, feature_store, online_conn, "default", read_options ).show(n) - def read_vector_db(self, feature_group: fg_mod.FeatureGroup, n: int =None, dataframe_type: str="default") -> Union[pd.DataFrame, np.ndarray, List[List[Any]], TypeVar("pyspark.sql.DataFrame")]: + def read_vector_db( + self, + feature_group: fg_mod.FeatureGroup, + n: int = None, + dataframe_type: str = "default", + ) -> Union[ + pd.DataFrame, np.ndarray, List[List[Any]], TypeVar("pyspark.sql.DataFrame") + ]: results = VectorDbClient.read_feature_group(feature_group, n) feature_names = [f.name for f in feature_group.features] dataframe_type = dataframe_type.lower() diff --git a/python/hsfs/training_dataset_feature.py b/python/hsfs/training_dataset_feature.py index c444e833c7..a06637abe2 100644 --- a/python/hsfs/training_dataset_feature.py +++ b/python/hsfs/training_dataset_feature.py @@ -20,6 +20,7 @@ from hsfs import feature_group as feature_group_mod from hsfs import util + class TrainingDatasetFeature: def __init__( self, From 60726423f7e18352008db7ef1467e2643954c088 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 13 May 2024 15:43:17 +0200 Subject: [PATCH 24/58] fixing vector server --- .../core/transformation_function_engine.py | 43 +++++++++++++++++++ python/hsfs/core/vector_server.py | 10 ++--- 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 0384e05ac9..773380a113 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -139,6 +139,49 @@ def compute_transformation_fn_statistics( feature_view_obj=feature_view_obj, ) + @staticmethod + def get_ready_to_use_transformation_fns( + feature_view: FeatureView, + training_dataset_version: Optional[int] = None, + ) -> List[TransformationFunction]: + # get attached transformation functions + transformation_functions = ( + feature_view._feature_view_engine.get_attached_transformation_fn() + ) + is_stat_required = any( + [tf.hopsworks_udf.statistics_required for tf in transformation_functions] + ) + if not is_stat_required: + td_tffn_stats = None + else: + # if there are any transformation functions that require statistics get related statistics and + # populate with relevant arguments + # there should be only one statistics object with before_transformation=true + if training_dataset_version is None: + raise ValueError( + "Training data version is required for transformation. Call `feature_view.init_serving(version)` " + "or `feature_view.init_batch_scoring(version)` to pass the training dataset version." + "Training data can be created by `feature_view.create_training_data` or `feature_view.training_data`." + ) + td_tffn_stats = feature_view._statistics_engine.get( + feature_view, + before_transformation=True, + training_dataset_version=training_dataset_version, + ) + + if is_stat_required and td_tffn_stats is None: + raise ValueError( + "No statistics available for initializing transformation functions." + + "Training data can be created by `feature_view.create_training_data` or `feature_view.training_data`." 
+            )
+
+        if is_stat_required:
+            for transformation_function in transformation_functions:
+                transformation_function.hopsworks_udf.transformation_statistics = (
+                    td_tffn_stats.feature_descriptive_statistics
+                )
+        return transformation_functions
+
     @staticmethod
     def compute_and_set_feature_statistics(
         training_dataset: training_dataset.TrainingDataset,
diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py
index 1ef1df0854..49892db1c4 100755
--- a/python/hsfs/core/vector_server.py
+++ b/python/hsfs/core/vector_server.py
@@ -181,14 +181,12 @@ def init_batch_scoring(
     def init_transformation(
         self,
-        entity: Union[feature_view.FeatureView, training_dataset.TrainingDataset],
+        entity: Union[feature_view.FeatureView],
     ):
         # attach transformation functions
-        self._transformation_functions = (
-            self.transformation_function_engine.get_ready_to_use_transformation_fns(
-                entity,
-                self._training_dataset_version,
-            )
+        self._transformation_functions = transformation_function_engine.TransformationFunctionEngine.get_ready_to_use_transformation_fns(
+            entity,
+            self._training_dataset_version,
         )

     def setup_sql_client(
From f46f0b71102b87caf5f1b78fa88cb97a16673af7 Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Mon, 13 May 2024 16:29:15 +0200
Subject: [PATCH 25/58] fixing docs

---
 python/hsfs/core/feature_view_engine.py      | 20 ++---
 .../core/transformation_function_engine.py   | 77 ++++++++++---------
 python/hsfs/core/vector_server.py            | 21 +++--
 python/hsfs/engine/python.py                 | 22 ++----
 python/hsfs/engine/spark.py                  | 49 ++++++------
 python/hsfs/feature_store.py                 |  7 +-
 python/hsfs/feature_view.py                  |  5 +-
 7 files changed, 96 insertions(+), 105 deletions(-)

diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py
index 19ea348b97..070be9b821 100644
--- a/python/hsfs/core/feature_view_engine.py
+++ b/python/hsfs/core/feature_view_engine.py
@@ -17,7 +17,7 @@
 import datetime
 import warnings
-from typing import TYPE_CHECKING, List, Optional, Union
+from typing import List, Optional, Union

 from hsfs import (
     client,
@@ -25,6 +25,7 @@
     feature_group,
     feature_view,
     training_dataset_feature,
+    transformation_function,
     util,
 )
 from hsfs.client import exceptions
@@ -41,11 +42,6 @@
 from hsfs.training_dataset_split import TrainingDatasetSplit


-if TYPE_CHECKING:
-    from hsfs.feature_view import FeatureView
-    from hsfs.transformation_function import TransformationFunction
-
-
 class FeatureViewEngine:
     ENTITY_TYPE = "featureview"
     _TRAINING_DATA_API_PATH = "trainingdatasets"
@@ -68,7 +64,9 @@ def __init__(self, feature_store_id):
         )
         self._query_constructor_api = query_constructor_api.QueryConstructorApi()

-    def save(self, feature_view_obj: FeatureView) -> FeatureView:
+    def save(
+        self, feature_view_obj: feature_view.FeatureView
+    ) -> feature_view.FeatureView:
         """
         Save a feature view to the backend.
@@ -135,7 +133,9 @@ def save(self, feature_view_obj: FeatureView) -> FeatureView:
         )
         return updated_fv

-    def update(self, feature_view_obj: FeatureView) -> FeatureView:
+    def update(
+        self, feature_view_obj: feature_view.FeatureView
+    ) -> feature_view.FeatureView:
         """
         Update the feature view object saved in the backend
@@ -150,7 +150,7 @@ def update(self, feature_view_obj: FeatureView) -> FeatureView:

     def get(
         self, name: str, version: int = None
-    ) -> Union[FeatureView, List[FeatureView]]:
+    ) -> Union[feature_view.FeatureView, List[feature_view.FeatureView]]:
         """
         Get a feature view from the backend using name or using name and version.
@@ -267,7 +267,7 @@ def get_batch_query_string(

     def get_attached_transformation_fn(
         self, name: str, version: int
-    ) -> List[TransformationFunction]:
+    ) -> List[transformation_function.TransformationFunction]:
         """
         Get transformation functions attached to a feature view from the backend
diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py
index 773380a113..128c98e6cb 100644
--- a/python/hsfs/core/transformation_function_engine.py
+++ b/python/hsfs/core/transformation_function_engine.py
@@ -15,21 +15,14 @@
 #
 from __future__ import annotations

-from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
+from typing import Dict, List, Optional, Set, TypeVar, Union

-from hsfs import training_dataset
+import pandas as pd
+import polars as pl
+from hsfs import feature_view, statistics, training_dataset, transformation_function
 from hsfs.core import transformation_function_api

-if TYPE_CHECKING:
-    import pandas as pd
-    import polars as pl
-    import pyspark.sql as ps
-    from hsfs.feature_view import FeatureView
-    from hsfs.statistics import Statistics
-    from hsfs.transformation_function import TransformationFunction
-
-
 class TransformationFunctionEngine:
     BUILTIN_FN_NAMES = [
         "min_max_scaler",
@@ -50,13 +43,13 @@ def __init__(self, feature_store_id: int):
         )

     def save(
-        self, transformation_fn_instance: TransformationFunction
-    ) -> TransformationFunction:
+        self, transformation_fn_instance: transformation_function.TransformationFunction
+    ) -> transformation_function.TransformationFunction:
         """
         Save a transformation function into the feature store.

         # Arguments
-            transformation_fn_instance `TransformationFunction`: The transformation function to be saved into the feature store.
+            transformation_fn_instance `transformation_function.TransformationFunction`: The transformation function to be saved into the feature store.
         """
         self._transformation_function_api.register_transformation_fn(
             transformation_fn_instance
         )

     def get_transformation_fn(
         self, name: str, version: Optional[int] = None
-    ) -> Union[TransformationFunction, List[TransformationFunction]]:
+    ) -> Union[
+        transformation_function.TransformationFunction,
+        List[transformation_function.TransformationFunction],
+    ]:
         """
         Retrieve a transformation function from the feature store.
@@ -75,7 +71,7 @@
             name `Optional[str]`: The name of the transformation function to be retrieved.
             version `Optional[int]`: The version of the transformation function to be retrieved.
         # Returns
-            `Union[TransformationFunction, List[TransformationFunction]]` : A transformation function if name and version is provided. A list of transformation functions if only name is provided.
+            `Union[transformation_function.TransformationFunction, List[transformation_function.TransformationFunction]]` : A transformation function if name and version is provided. A list of transformation functions if only name is provided.
         """
         transformation_fn_instances = (
             self._transformation_function_api.get_transformation_fn(
         )
         return transformation_fn_instances

-    def get_transformation_fns(self) -> List[TransformationFunction]:
+    def get_transformation_fns(
+        self,
+    ) -> List[transformation_function.TransformationFunction]:
         """
         Get all the transformation functions in the feature store

         # Returns
-            `List[TransformationFunction]` : A list of transformation functions.
+            `List[transformation_function.TransformationFunction]` : A list of transformation functions.
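
        # Example
            A minimal usage sketch, added here for illustration only (assumes
            an engine created for a connected feature store; the id is a
            placeholder):

            ```python
            engine = TransformationFunctionEngine(feature_store_id=67)
            for fn in engine.get_transformation_fns():
                print(fn.hopsworks_udf.function_name)
            ```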
""" transformation_fn_instances = ( self._transformation_function_api.get_transformation_fn( @@ -102,12 +100,15 @@ def get_transformation_fns(self) -> List[TransformationFunction]: transformation_fns.append(transformation_fn_instance) return transformation_fns - def delete(self, transformation_function_instance: TransformationFunction) -> None: + def delete( + self, + transformation_function_instance: transformation_function.TransformationFunction, + ) -> None: """ Delete a transformation function from the feature store. # Arguments - transformation_function_instance `TransformationFunction`: The transformation function to be removed from the feature store. + transformation_function_instance `transformation_function.TransformationFunction`: The transformation function to be removed from the feature store. """ self._transformation_function_api.delete(transformation_function_instance) @@ -116,9 +117,11 @@ def compute_transformation_fn_statistics( training_dataset_obj: training_dataset.TrainingDataset, statistics_features: List[str], label_encoder_features: List[str], - feature_dataframe: Union[pd.DataFrame, pl.DataFrame, ps.DataFrame], - feature_view_obj: FeatureView, - ) -> Statistics: + feature_dataframe: Union[ + pd.DataFrame, pl.DataFrame, TypeVar("pyspark.sql.DataFrame") + ], + feature_view_obj: feature_view.FeatureView, + ) -> statistics.Statistics: """ Compute the statistics required for a training dataset object. @@ -141,9 +144,9 @@ def compute_transformation_fn_statistics( @staticmethod def get_ready_to_use_transformation_fns( - feature_view: FeatureView, + feature_view: feature_view.FeatureView, training_dataset_version: Optional[int] = None, - ) -> List[TransformationFunction]: + ) -> List[transformation_function.TransformationFunction]: # get attached transformation functions transformation_functions = ( feature_view._feature_view_engine.get_attached_transformation_fn() @@ -185,10 +188,12 @@ def get_ready_to_use_transformation_fns( @staticmethod def compute_and_set_feature_statistics( training_dataset: training_dataset.TrainingDataset, - feature_view_obj: FeatureView, + feature_view_obj: feature_view.FeatureView, dataset: Union[ - Dict[str, Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]], - Union[pd.DataFrame, pl.DataFrame, ps.DataFrame], + Dict[ + str, Union[pd.DataFrame, pl.DataFrame, TypeVar("pyspark.sql.DataFrame")] + ], + Union[pd.DataFrame, pl.DataFrame, TypeVar("pyspark.sql.DataFrame")], ], ) -> None: """ @@ -204,10 +209,8 @@ def compute_and_set_feature_statistics( statistics_features: Set[str] = set() # Finding the features for which statistics is required - for transformation_function in feature_view_obj.transformation_functions: - statistics_features.update( - transformation_function.hopsworks_udf.statistics_features - ) + for tf in feature_view_obj.transformation_functions: + statistics_features.update(tf.hopsworks_udf.statistics_features) if statistics_features: # compute statistics on training data if training_dataset.splits: @@ -233,15 +236,15 @@ def compute_and_set_feature_statistics( ) # Set statistics computed in the hopsworks UDF - for transformation_function in feature_view_obj.transformation_functions: - transformation_function.hopsworks_udf.transformation_statistics = ( + for tf in feature_view_obj.transformation_functions: + tf.hopsworks_udf.transformation_statistics = ( stats.feature_descriptive_statistics ) @staticmethod def get_and_set_feature_statistics( training_dataset: training_dataset.TrainingDataset, - feature_view_obj: FeatureView, + 
feature_view_obj: feature_view.FeatureView, training_dataset_version: int = None, ) -> None: """ @@ -277,7 +280,7 @@ def get_and_set_feature_statistics( "No statistics available for initializing transformation functions." ) - for transformation_function in feature_view_obj.transformation_functions: - transformation_function.hopsworks_udf.transformation_statistics = ( + for tf in feature_view_obj.transformation_functions: + tf.hopsworks_udf.transformation_statistics = ( td_tffn_stats.feature_descriptive_statistics ) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 49892db1c4..94468f1dde 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -43,13 +43,13 @@ online_store_rest_client_engine, online_store_sql_engine, transformation_function_engine as tf_engine_mod, - transformation_functions + transformation_function ) + HAS_FASTAVRO = False try: from fastavro import schemaless_reader - HAS_FASTAVRO = True except ImportError: from avro.io import BinaryDecoder @@ -106,7 +106,7 @@ def __init__( self._transformation_function_engine = ( tf_engine_mod.TransformationFunctionEngine(feature_store_id) ) - self._transformation_functions: List[transformation_functions.TransformationFunction] = [] + self._transformation_functions: List[transformation_function.TransformationFunction] = [] self._sql_client = None @@ -184,7 +184,7 @@ def init_transformation( entity: Union[feature_view.FeatureView], ): # attach transformation functions - self._transformation_functions = transformation_function_engine.TransformationFunctionEngine.get_ready_to_use_transformation_fns( + self._transformation_functions = tf_engine_mod.TransformationFunctionEngine.get_ready_to_use_transformation_fns( entity, self._training_dataset_version, ) @@ -564,7 +564,6 @@ def get_inference_helpers( batch_results, batch=True, inference_helper=True, return_type=return_type ) - def which_client_and_ensure_initialised( self, force_rest_client: bool, force_sql_client: bool ) -> str: @@ -628,14 +627,12 @@ def _set_default_client( def apply_transformation(self, row_dict: dict): _logger.debug("Applying transformation functions to : %s", matching_keys) - for transformation_function in self.transformation_functions: + for tf in self.transformation_functions: features = [ pd.Series(row_dict[feature]) - for feature in transformation_function.hopsworks_udf.transformation_features + for feature in tf.hopsworks_udf.transformation_features ] - transformed_result = transformation_function.hopsworks_udf.get_udf()( - *features - ) + transformed_result = tf.hopsworks_udf.get_udf()(*features) if isinstance(transformed_result, pd.Series): row_dict[transformed_result.name] = transformed_result.values[0] else: @@ -678,6 +675,7 @@ def build_complex_feature_decoders(self) -> Dict[str, Callable]: for f in self._features if f.is_complex() } + if len(complex_feature_schemas) == 0: return {} else: @@ -869,7 +867,6 @@ def identify_missing_features_pre_fetch( passed_feature_names = passed_feature_names.union( vector_db_features.keys() ) - neither_fetched_nor_passed = fetched_features.difference( passed_feature_names ) @@ -912,7 +909,7 @@ def build_per_serving_key_features( ] ) return per_serving_key_features - + @property def sql_client( self, diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 03daa581df..e6d55a8238 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -31,7 +31,6 @@ from io import BytesIO from pathlib import Path from typing 
import ( - TYPE_CHECKING, Any, Callable, Dict, @@ -58,6 +57,7 @@ feature, feature_store, feature_view, + transformation_function, util, ) from hsfs import storage_connector as sc @@ -90,10 +90,6 @@ from tqdm.auto import tqdm -if TYPE_CHECKING: - from hsfs.transformation_function import TransformationFunction - - # Disable pyhive INFO logging logging.getLogger("pyhive").setLevel(logging.WARNING) @@ -1296,14 +1292,14 @@ def add_file(self, file: Optional[str]) -> Optional[str]: def _apply_transformation_function( self, - transformation_functions: List[TransformationFunction], + transformation_functions: List[transformation_function.TransformationFunction], dataset: Union[pd.DataFrame, pl.DataFrame], ) -> Union[pd.DataFrame, pl.DataFrame]: """ Apply transformation function to the dataframe. # Arguments - transformation_functions `List[TransformationFunction]` : List of transformation functions. + transformation_functions `List[transformation_function.TransformationFunction]` : List of transformation functions. dataset `Union[pd.DataFrame, pl.DataFrame]`: A pandas or polars dataframe. # Returns `DataFrame`: A pandas dataframe with the transformed data. @@ -1323,8 +1319,8 @@ def _apply_transformation_function( else: dataset = dataset.to_pandas(use_pyarrow_extension_array=False) - for transformation_function in transformation_functions: - hopsworks_udf = transformation_function.hopsworks_udf + for tf in transformation_functions: + hopsworks_udf = tf.hopsworks_udf missing_features = set(hopsworks_udf.transformation_features) - set( dataset.columns ) @@ -1333,17 +1329,15 @@ def _apply_transformation_function( f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." 
) - transformed_features.update( - transformation_function.hopsworks_udf.transformation_features - ) + transformed_features.update(tf.hopsworks_udf.transformation_features) dataset = pd.concat( [ dataset, - transformation_function.hopsworks_udf.get_udf()( + tf.hopsworks_udf.get_udf()( *( [ dataset[feature] - for feature in transformation_function.hopsworks_udf.transformation_features + for feature in tf.hopsworks_udf.transformation_features ] ) ), diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 48bce2e351..c462efa641 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -23,19 +23,13 @@ import shutil import warnings from datetime import date, datetime, timezone -from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union +from typing import Any, Dict, List, Optional, TypeVar, Union import avro import numpy as np import pandas as pd import tzlocal - - -if TYPE_CHECKING: - from hsfs.constructor.query import Query - from hsfs.feature_view import FeatureView - from hsfs.training_dataset import TrainingDataset - from hsfs.transformation_function import TransformationFunction +from hsfs.constructor import query # in case importing in %%local from hsfs.core.vector_db_client import VectorDbClient @@ -89,11 +83,18 @@ def iteritems(self): DataContextConfig, InMemoryStoreBackendDefaults, ) -from hsfs import client, feature, training_dataset_feature, util +from hsfs import ( + client, + feature, + feature_view, + training_dataset, + training_dataset_feature, + transformation_function, + util, +) from hsfs import feature_group as fg_mod from hsfs.client import hopsworks from hsfs.client.exceptions import FeatureStoreException -from hsfs.constructor import query from hsfs.core import ( dataset_api, delta_engine, @@ -556,9 +557,9 @@ def _online_fg_to_avro(self, feature_group, dataframe): def get_training_data( self, - training_dataset: TrainingDataset, - feature_view_obj: FeatureView, - query_obj: Query, + training_dataset: training_dataset.TrainingDataset, + feature_view_obj: feature_view.FeatureView, + query_obj: query.Query, read_options: Dict[str, Any], dataframe_type: str, training_dataset_version: int = None, @@ -607,12 +608,12 @@ def drop_columns(self, df, drop_cols): def write_training_dataset( self, - training_dataset: TrainingDataset, - query_obj: Query, + training_dataset: training_dataset.TrainingDataset, + query_obj: query.Query, user_write_options: Dict[str, Any], save_mode: str, read_options: Dict[str, Any] = None, - feature_view_obj: FeatureView = None, + feature_view_obj: feature_view.FeatureView = None, to_df: bool = False, training_dataset_version: Optional[int] = None, ): @@ -844,7 +845,9 @@ def _write_training_dataset_splits( write_options, save_mode, to_df=False, - transformation_functions: List[TransformationFunction] = None, + transformation_functions: List[ + transformation_function.TransformationFunction + ] = None, ): for split_name, feature_dataframe in feature_dataframes.items(): split_path = training_dataset.location + "/" + str(split_name) @@ -1226,7 +1229,9 @@ def add_cols_to_delta_table(self, feature_group, new_features): ).save(feature_group.location) def _apply_transformation_function( - self, transformation_functions: List[TransformationFunction], dataset: DataFrame + self, + transformation_functions: List[transformation_function.TransformationFunction], + dataset: DataFrame, ): """ Apply transformation function to the dataframe. 
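
        A hedged sketch of the expected call pattern (illustrative names;
        assumes a Spark DataFrame `df` and the transformation functions
        attached to a feature view `fv`):

        ```python
        transformed_df = spark_engine._apply_transformation_function(
            transformation_functions=fv.transformation_functions,
            dataset=df,
        )
        ```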
@@ -1244,8 +1249,8 @@ def _apply_transformation_function( transformation_features = [] output_col_names = [] explode_name = [] - for transformation_function in transformation_functions: - hopsworks_udf = transformation_function.hopsworks_udf + for tf in transformation_functions: + hopsworks_udf = tf.hopsworks_udf missing_features = set(hopsworks_udf.transformation_features) - set( dataset.columns ) @@ -1255,9 +1260,7 @@ def _apply_transformation_function( f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." ) - transformed_features.update( - transformation_function.hopsworks_udf.transformation_features - ) + transformed_features.update(tf.hopsworks_udf.transformation_features) pandas_udf = hopsworks_udf.get_udf() output_col_name = hopsworks_udf.output_column_names[0] diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index 41d1a754ff..848252cb64 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -18,7 +18,7 @@ import datetime import warnings -from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeVar, Union +from typing import Any, Dict, List, Optional, TypeVar, Union import great_expectations as ge import humps @@ -48,14 +48,11 @@ ) from hsfs.decorators import typechecked from hsfs.embedding import EmbeddingIndex +from hsfs.hopsworks_udf import HopsworksUdf from hsfs.statistics_config import StatisticsConfig from hsfs.transformation_function import TransformationFunction -if TYPE_CHECKING: - from hsfs.hopsworks_udf import HopsworksUdf - - @typechecked class FeatureStore: DEFAULT_VERSION = 1 diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 5b90fabfc2..ad53317d75 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -54,16 +54,13 @@ from hsfs.core.vector_db_client import VectorDbClient from hsfs.decorators import typechecked from hsfs.feature import Feature +from hsfs.hopsworks_udf import HopsworksUdf from hsfs.statistics import Statistics from hsfs.statistics_config import StatisticsConfig from hsfs.training_dataset_split import TrainingDatasetSplit from hsfs.transformation_function import TransformationFunction -if TYPE_CHECKING: - from hsfs.hopsworks_udf import HopsworksUdf - - _logger = logging.getLogger(__name__) TrainingDatasetDataFrameTypes = Union[ From 07348d52bfaa664439714b2cd9a72b6a90208462 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 14 May 2024 08:26:59 +0200 Subject: [PATCH 26/58] fixing vector server --- .../core/transformation_function_engine.py | 11 ++++++++++- python/hsfs/core/vector_server.py | 19 +++++++++++++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 128c98e6cb..304b0fcabb 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -149,8 +149,17 @@ def get_ready_to_use_transformation_fns( ) -> List[transformation_function.TransformationFunction]: # get attached transformation functions transformation_functions = ( - feature_view._feature_view_engine.get_attached_transformation_fn() + feature_view._feature_view_engine.get_attached_transformation_fn( + feature_view.name, feature_view.version + ) ) + + transformation_functions = ( + [transformation_functions] + if not isinstance(transformation_functions, list) + else 
transformation_functions + ) + is_stat_required = any( [tf.hopsworks_udf.statistics_required for tf in transformation_functions] ) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 94468f1dde..ed168d6295 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -429,7 +429,7 @@ def assemble_feature_vector( _logger.debug("Assembled and transformed dict feature vector: %s", result_dict) - return [result_dict.get(fname, None) for fname in self.feature_vector_col_name] + return [result_dict.get(fname, None) for fname in self.transformed_feature_vector_col_name] def handle_feature_vector_return_type( self, @@ -909,7 +909,7 @@ def build_per_serving_key_features( ] ) return per_serving_key_features - + @property def sql_client( self, @@ -1062,9 +1062,20 @@ def default_client(self, default_client: Literal["rest", "sql"]): def transformed_feature_vector_col_name(self): if self._transformed_feature_vector_col_name is None: - self._transformed_feature_vector_col_name = self._feature_vector_col_name + transformation_features = [] + output_column_names = [] for transformation_function in self._transformation_functions: - self._transformed_feature_vector_col_name += ( + transformation_features += ( transformation_function.hopsworks_udf.transformation_features ) + output_column_names += ( + transformation_function.hopsworks_udf.output_column_names + ) + + self._transformed_feature_vector_col_name = [ + feature + for feature in self._feature_vector_col_name + if feature not in transformation_features + ] + self._transformed_feature_vector_col_name.extend(output_column_names) return self._transformed_feature_vector_col_name \ No newline at end of file From 41a02acd075cec78e0ba2f3d10735b46b2e9d544 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 14 May 2024 09:31:58 +0200 Subject: [PATCH 27/58] fixing building in transformations --- .../hsfs/core/transformation_function_engine.py | 10 ++++++++-- python/hsfs/hopsworks_udf.py | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index 304b0fcabb..ddbaebe8e2 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -216,10 +216,16 @@ def compute_and_set_feature_statistics( dataset `Union[Dict[str, Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]], Union[pd.DataFrame, pl.DataFrame, ps.DataFrame]]`: A dataframe that conqtains the training data or a dictionary that contains both the training and test data. 
""" statistics_features: Set[str] = set() + label_encoder_features: Set[str] = set() # Finding the features for which statistics is required for tf in feature_view_obj.transformation_functions: statistics_features.update(tf.hopsworks_udf.statistics_features) + if ( + tf.hopsworks_udf.function_name == "label_encoder" + or tf.hopsworks_udf.function_name == "one_hot_encoder" + ): + label_encoder_features.update(tf.hopsworks_udf.statistics_features) if statistics_features: # compute statistics on training data if training_dataset.splits: @@ -228,7 +234,7 @@ def compute_and_set_feature_statistics( TransformationFunctionEngine.compute_transformation_fn_statistics( training_dataset, list(statistics_features), - [], + list(label_encoder_features), dataset.get(training_dataset.train_split), feature_view_obj, ) @@ -238,7 +244,7 @@ def compute_and_set_feature_statistics( TransformationFunctionEngine.compute_transformation_fn_statistics( training_dataset, list(statistics_features), - [], + list(label_encoder_features), dataset, feature_view_obj, ) diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 9b3b332812..049818d234 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -480,6 +480,19 @@ def __call__(self, *features: List[str]) -> "HopsworksUdf": udf.output_column_names = udf._get_output_column_names() return udf + def update_return_type_one_hot(self): + self._output_types = [ + self._output_types[0] + for _ in range( + len( + self.transformation_statistics[ + "statistics_feature" + ].extended_statistics["unique_values"] + ) + ) + ] + self.output_column_names = self._get_output_column_names() + def get_udf(self) -> Callable: """ Function that checks the current engine type and returns the appropriate UDF. @@ -490,6 +503,10 @@ def get_udf(self) -> Callable: # Returns `Callable`: Pandas UDF in the spark engine otherwise returns a python function for the UDF. 
""" + # Update the number of outputs for one hot encoder to match the number of unique values for the feature + if self.function_name == "one_hot_encoder": + self.update_return_type_one_hot() + if engine.get_type() in ["hive", "python", "training"]: return self.hopsworksUdf_wrapper() else: From 221560613f6afb2fb405b6d827fa5c2564dcfe11 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 14 May 2024 16:33:34 +0200 Subject: [PATCH 28/58] correcting get feature vector --- python/hsfs/core/vector_server.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index ed168d6295..8e25e18632 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -637,9 +637,7 @@ def apply_transformation(self, row_dict: dict): row_dict[transformed_result.name] = transformed_result.values[0] else: for col in transformed_result: - row_dict[transformed_result.name] = transformed_result[col].values[ - 0 - ] + row_dict[col] = transformed_result[col].values[0] return row_dict def apply_return_value_handlers( From e1d7abe504f53f8c8b54b6ba3ed3b40558163deb Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 16 May 2024 11:13:34 +0200 Subject: [PATCH 29/58] adding missed changes for build in transformations --- python/hsfs/builtin_transformations.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py index d17ae6f1fa..35a26d137e 100644 --- a/python/hsfs/builtin_transformations.py +++ b/python/hsfs/builtin_transformations.py @@ -42,7 +42,7 @@ def robust_scaler( ) -# @hopsworks_udf(int) +@hopsworks_udf(int) def label_encoder( feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics ) -> pd.Series: @@ -53,15 +53,16 @@ def label_encoder( return pd.Series([value_to_index[data] for data in feature]) +@hopsworks_udf(bool) def one_hot_encoder( feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics ) -> pd.Series: unique_data = [ value for value in statistics_feature.extended_statistics["unique_values"] ] - print(statistics_feature.extended_statistics["unique_values"]) one_hot = pd.get_dummies(feature, dtype="bool") for data in unique_data: if data not in one_hot: one_hot[data] = False - return one_hot + # Sorting by columns so as to maintain consistency in column order. + return one_hot.reindex(sorted(one_hot.columns), axis=1) From 2d0bca3eba7ef9e344a962a030a17065c907269a Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 17 May 2024 12:56:33 +0200 Subject: [PATCH 30/58] shallow copying scope dictonary to not overwrite statistics variable for different udf's having same statistics parameter name --- python/hsfs/hopsworks_udf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 049818d234..0091e50481 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -433,8 +433,11 @@ def renaming_wrapper(*args): df = convert_timezone(df) return df""" ) + # injecting variables into scope used to execute wrapper function. - scope = __import__("__main__").__dict__ + + # Shallow copy of scope performed because updating statistics argument of scope must not affect other instances. 
+ scope = __import__("__main__").__dict__.copy() if self.transformation_statistics is not None: scope.update(self.transformation_statistics) scope.update({"_output_col_names": self.output_column_names}) # executing code exec(code, scope) From 37f96fa9f7b3947f9909acd214852ca2245c5008 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 20 May 2024 15:38:11 +0200 Subject: [PATCH 31/58] adding deep copy to create multiple transformation functions with different features --- python/hsfs/transformation_function.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index b6ef060cb9..ce33a2b8d0 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -14,6 +14,7 @@ # from __future__ import annotations +import copy import json from typing import Any, Dict, List, Optional, Union @@ -129,8 +130,10 @@ def __call__(self, *features: List[str]) -> TransformationFunction: # Raises `FeatureStoreException: If the provided number of features do not match the number of arguments in the defined UDF or if the provided feature names are not strings. """ - self._hopsworks_udf = self._hopsworks_udf(*features) - return self + # Deep copy so that the same transformation function can be used to create multiple new transformation functions with different features. + transformation = copy.deepcopy(self) + transformation._hopsworks_udf = transformation._hopsworks_udf(*features) + return transformation @classmethod def from_response_json( From 37a8b2388b765f8a504ed11609d7612ddb24743e Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 21 May 2024 02:02:41 +0200 Subject: [PATCH 32/58] sorting transformation function to maintain consistent order --- python/hsfs/feature_view.py | 22 ++++++++++++++++++++++ python/hsfs/transformation_function.py | 5 +++++ 2 files changed, 27 insertions(+) diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index ad53317d75..a8e51c3b69 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -135,6 +135,11 @@ def __init__( else [] ) + if self._transformation_functions: + self._transformation_functions = FeatureView._sort_transformation_functions( + self._transformation_functions + ) + self._features = [] self._feature_view_engine: feature_view_engine.FeatureViewEngine = ( feature_view_engine.FeatureViewEngine(featurestore_id) ) @@ -378,6 +383,23 @@ def init_serving( self.query, serving_keys=self._serving_keys ) + @staticmethod + def _sort_transformation_functions( + transformation_functions: List[TransformationFunction], + ) -> List[TransformationFunction]: + """ + Function that sorts transformation functions in the order of the output column names. + + The list of transformation functions is sorted based on the output column names to maintain consistent ordering. + + # Arguments + transformation_functions: `List[TransformationFunction]`.
List of transformation functions to be sorted + + # Returns + `List[TransformationFunction]`: Sorted list of transformation functions + """ + return sorted(transformation_functions, key=lambda x: x.output_column_names[0]) + def init_batch_scoring( self, training_dataset_version: Optional[int] = None, diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index ce33a2b8d0..3267b4d14a 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -228,3 +228,8 @@ def version(self, version: int) -> None: def hopsworks_udf(self) -> HopsworksUdf: """Meta data class for the user defined transformation function.""" return self._hopsworks_udf + + @property + def output_column_names(self) -> List[str]: + """Output column names of transformation functions""" + return self._hopsworks_udf._output_column_names From eb77d70a701d1f243d7176ca28fd0fa723d84d58 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 21 May 2024 11:04:44 +0200 Subject: [PATCH 33/58] sorting transformation functions in transformation function engine to maintain same order --- python/hsfs/core/transformation_function_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index ddbaebe8e2..ec5de0810b 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -192,7 +192,7 @@ def get_ready_to_use_transformation_fns( transformation_function.hopsworks_udf.transformation_statistics = ( td_tffn_stats.feature_descriptive_statistics ) - return transformation_functions + return feature_view._sort_transformation_functions(transformation_functions) @staticmethod def compute_and_set_feature_statistics( From 68c95aa78c5ffc3ec78e8ad90381290623979a30 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 21 May 2024 15:03:01 +0200 Subject: [PATCH 34/58] using feature view transformation functions --- python/hsfs/engine/python.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index e6d55a8238..e88a530c90 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -1022,7 +1022,7 @@ def _prepare_transform_split_df( # and the apply them for split_name in result_dfs: result_dfs[split_name] = self._apply_transformation_function( - training_dataset_obj.transformation_functions, + feature_view_obj.transformation_functions, result_dfs.get(split_name), ) From 88bff75417427e48491a14228dae92838eddc2cd Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 23 May 2024 06:23:17 +0200 Subject: [PATCH 35/58] addressing review comments --- python/hsfs/__init__.py | 9 ------- python/hsfs/builtin_transformations.py | 5 +++- python/hsfs/core/vector_server.py | 4 ++- python/hsfs/engine/python.py | 34 +++++++++++++++++++------- python/hsfs/feature_store.py | 2 +- python/hsfs/hopsworks_udf.py | 17 +++++++------ python/tests/engine/test_python.py | 9 ++++--- 7 files changed, 49 insertions(+), 31 deletions(-) diff --git a/python/hsfs/__init__.py b/python/hsfs/__init__.py index 82d368d243..31efe17c56 100644 --- a/python/hsfs/__init__.py +++ b/python/hsfs/__init__.py @@ -19,17 +19,8 @@ import warnings import nest_asyncio -from packaging.version import Version try: - import pandas as pd - - if Version(pd.__version__) > Version("2.0"): - os.environ["USE_PYARROW_EXTENSION"] = "1" -except ImportError: - pass # Empty except
block because environment variable "USE_PYARROW_EXTENSION" need not be set if pyarrow cannot be imported or if pandas version is less than 2.0 - # Setting polars skip cpu flag to suppress CPU false positive warning messages printed while importing hsfs os.environ["POLARS_SKIP_CPU_CHECK"] = "1" diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py index 35a26d137e..d17126bd44 100644 --- a/python/hsfs/builtin_transformations.py +++ b/python/hsfs/builtin_transformations.py @@ -14,6 +14,7 @@ # limitations under the License. # +import numpy as np import pandas as pd from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics from hsfs.hopsworks_udf import hopsworks_udf @@ -50,7 +51,9 @@ def label_encoder( value for value in statistics_feature.extended_statistics["unique_values"] ] value_to_index = {value: index for index, value in enumerate(unique_data)} - return pd.Series([value_to_index[data] for data in feature]) + return pd.Series( + [value_to_index[data] if not pd.isna(data) else np.nan for data in feature] + ) @hopsworks_udf(bool) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 8e25e18632..3a90387390 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -632,7 +632,9 @@ def apply_transformation(self, row_dict: dict): pd.Series(row_dict[feature]) for feature in tf.hopsworks_udf.transformation_features ] - transformed_result = tf.hopsworks_udf.get_udf()(*features) + transformed_result = tf.hopsworks_udf.get_udf(force_python_udf=True)( + *features + ) # Get only python compatible UDF irrespective of engine if isinstance(transformed_result, pd.Series): row_dict[transformed_result.name] = transformed_result.values[0] else: diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index e88a530c90..e113015dc9 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -1011,14 +1011,15 @@ def _prepare_transform_split_df( training_dataset_obj, ) - if training_dataset_version is None: - transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( - training_dataset_obj, feature_view_obj, result_dfs - ) - else: - transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( - training_dataset_obj, feature_view_obj, training_dataset_version - ) + # TODO : Currently statistics are always computed since the in-memory training dataset retrieved is not consistent + # if training_dataset_version is None: + transformation_function_engine.TransformationFunctionEngine.compute_and_set_feature_statistics( + training_dataset_obj, feature_view_obj, result_dfs + ) + # else: + # transformation_function_engine.TransformationFunctionEngine.get_and_set_feature_statistics( + # training_dataset_obj, feature_view_obj, training_dataset_version + # ) # and the apply them for split_name in result_dfs: result_dfs[split_name] = self._apply_transformation_function( @@ -1290,6 +1291,21 @@ def add_file(self, file: Optional[str]) -> Optional[str]: f.write(bytesio_object.getbuffer()) return local_file + def _check_pyarrow_extension(self): + """ + Function to check whether the pyarrow extension should be used when copying a polars dataframe to pandas + """ + try: + import pandas as pd + from packaging.version import Version + + if Version(pd.__version__) > Version("2.0"): + return True + else: + return False + except Exception: + return False # Return False if pyarrow or pandas cannot be imported + def
_apply_transformation_function( self, transformation_functions: List[transformation_function.TransformationFunction], @@ -1312,7 +1328,7 @@ def _apply_transformation_function( dataset, pl.dataframe.frame.DataFrame ): # Converting polars dataframe to pandas because currently we support only pandas UDF's as transformation functions. - if os.getenv("USE_PYARROW_EXTENSION", False): + if self._check_pyarrow_extension(): dataset = dataset.to_pandas( use_pyarrow_extension_array=True ) # Zero copy if pyarrow extension can be used. diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index 848252cb64..5400d5d08d 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -1278,7 +1278,7 @@ def create_training_dataset( @usage.method_logger def create_transformation_function( self, - transformation_function: callable, + transformation_function: HopsworksUdf, version: Optional[int] = None, ) -> "TransformationFunction": """Create a transformation function metadata object. diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 0091e50481..328ac3c091 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -150,10 +150,10 @@ def __init__( ) ) - self._output_column_names: List[str] = self._get_output_column_names() - self._statistics: Optional[Dict[str, FeatureDescriptiveStatistics]] = None + self._output_column_names: List[str] = self._get_output_column_names() + @staticmethod def _validate_and_convert_output_types( output_types: Union[List[type], List[str]], @@ -496,21 +496,21 @@ def update_return_type_one_hot(self): ] self.output_column_names = self._get_output_column_names() - def get_udf(self) -> Callable: + def get_udf(self, force_python_udf: bool = False) -> Callable: """ Function that checks the current engine type and returns the appropriate UDF. In the spark engine the UDF is returned as a pandas UDF. While in the python engine the UDF is returned as python function. + # Arguments + force_python_udf: `bool`. Force return a python compatible udf irrespective of engine. + # Returns `Callable`: Pandas UDF in the spark engine otherwise returns a python function for the UDF. 
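+ For example, the online feature vector server calls get_udf(force_python_udf=True) so that transformations are applied as plain python functions even when the spark engine is active.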
""" - # Update the number of outputs for one hot encoder to match the number of unique values for the feature - if self.function_name == "one_hot_encoder": - self.update_return_type_one_hot() - if engine.get_type() in ["hive", "python", "training"]: + if engine.get_type() in ["hive", "python", "training"] or force_python_udf: return self.hopsworksUdf_wrapper() else: from pyspark.sql.functions import pandas_udf @@ -581,6 +581,9 @@ def from_response_json( @property def output_types(self) -> List[str]: """Get the output types of the UDF""" + # Update the number of outputs for one hot encoder to match the number of unique values for the feature + if self.function_name == "one_hot_encoder" and self.transformation_statistics: + self.update_return_type_one_hot() return self._output_types @property diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index 55267cc7ce..4b883f8ed2 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -2477,6 +2477,7 @@ def test_prepare_transform_split_df_random_split(self, mocker): mocker.patch( "hsfs.core.transformation_function_engine.TransformationFunctionEngine" ) + mock_feature_view = mocker.patch("hsfs.feature_view.FeatureView") python_engine = python.Engine() @@ -2504,7 +2505,7 @@ def test_prepare_transform_split_df_random_split(self, mocker): result = python_engine._prepare_transform_split_df( query_obj=q, training_dataset_obj=td, - feature_view_obj=None, + feature_view_obj=mock_feature_view, read_option=None, dataframe_type="default", ) @@ -2525,6 +2526,7 @@ def test_prepare_transform_split_df_time_split_td_features(self, mocker): mocker.patch( "hsfs.core.transformation_function_engine.TransformationFunctionEngine" ) + mock_feature_view = mocker.patch("hsfs.feature_view.FeatureView") python_engine = python.Engine() @@ -2570,7 +2572,7 @@ def test_prepare_transform_split_df_time_split_td_features(self, mocker): result = python_engine._prepare_transform_split_df( query_obj=q, training_dataset_obj=td, - feature_view_obj=None, + feature_view_obj=mock_feature_view, read_option=None, dataframe_type="default", ) @@ -2591,6 +2593,7 @@ def test_prepare_transform_split_df_time_split_query_features(self, mocker): mocker.patch( "hsfs.core.transformation_function_engine.TransformationFunctionEngine" ) + mock_feature_view = mocker.patch("hsfs.feature_view.FeatureView") python_engine = python.Engine() @@ -2635,7 +2638,7 @@ def test_prepare_transform_split_df_time_split_query_features(self, mocker): result = python_engine._prepare_transform_split_df( query_obj=q, training_dataset_obj=td, - feature_view_obj=None, + feature_view_obj=mock_feature_view, read_option=None, dataframe_type="default", ) From 5ea3e43da150d71c1b764def668059f0409759a3 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 27 May 2024 09:36:47 +0200 Subject: [PATCH 36/58] using PYARROW_EXTENSION_ENABLE during import rather than as a function --- python/hsfs/engine/python.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index e113015dc9..cc50428632 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -102,6 +102,18 @@ except ImportError: pass +PYARROW_EXTENSION_ENABLE = False +try: + import pandas as pd + from packaging.version import Version + + if Version(pd.__version__) > Version("2.0"): + PYARROW_EXTENSION_ENABLE = True + else: + PYARROW_EXTENSION_ENABLE = False +except Exception: + 
PYARROW_EXTENSION_ENABLE = False # Set PYARROW_EXTENSION_ENABLE to False if pyarrow or pandas cannot be imported + # Decimal types are currently not supported _INT_TYPES = [pa.uint8(), pa.uint16(), pa.int8(), pa.int16(), pa.int32()] _BIG_INT_TYPES = [pa.uint32(), pa.int64()] @@ -1291,21 +1303,6 @@ def add_file(self, file: Optional[str]) -> Optional[str]: f.write(bytesio_object.getbuffer()) return local_file - def _check_pyarrow_extension(self): - """ - Function to check whether the pyarrow extension should be used when copying a polars dataframe to pandas - """ - try: - import pandas as pd - from packaging.version import Version - - if Version(pd.__version__) > Version("2.0"): - return True - else: - return False - except Exception: - return False # Return False if pyarrow or pandas cannot be imported - def _apply_transformation_function( self, transformation_functions: List[transformation_function.TransformationFunction], @@ -1328,7 +1325,7 @@ def _apply_transformation_function( dataset, pl.dataframe.frame.DataFrame ): # Converting polars dataframe to pandas because currently we support only pandas UDF's as transformation functions. - if self._check_pyarrow_extension(): + if PYARROW_EXTENSION_ENABLE: dataset = dataset.to_pandas( use_pyarrow_extension_array=True ) # Zero copy if pyarrow extension can be used. From 58678bc409c32ef6948edf897a383ea341301639 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 27 May 2024 13:15:00 +0200 Subject: [PATCH 37/58] skipping transformation function tests on windows; spark udf fails due to dependency problem with Great Expectations --- .../engine/test_python_spark_transformation_functions.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/tests/engine/test_python_spark_transformation_functions.py b/python/tests/engine/test_python_spark_transformation_functions.py index f5763ea548..4929312bec 100644 --- a/python/tests/engine/test_python_spark_transformation_functions.py +++ b/python/tests/engine/test_python_spark_transformation_functions.py @@ -16,6 +16,7 @@ from __future__ import annotations import datetime +import os import statistics import pandas as pd @@ -44,6 +45,11 @@ ) +# TODO : Remove skipping UT in windows after Great Expectations has been upgraded to 1.0 or after it has been made optional +@pytest.mark.skipif( + os.name == "nt", + reason="Skip tests in windows since they fail due to a dependency problem with Great Expectations 0.18.2; fixed on upgrading to 1.0", +) class TestPythonSparkTransformationFunctions: def _create_training_dataset(self): f = training_dataset_feature.TrainingDatasetFeature( From be5036b85b7c20a6ccea5076b4f0aa5a8604d066 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 27 May 2024 16:45:24 +0200 Subject: [PATCH 38/58] changing transformed_feature_vector_col_name to transformed_features to obtain feature names after transformations --- python/hsfs/core/vector_server.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 3a90387390..118ecca5e7 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -31,25 +31,26 @@ client, feature_view, training_dataset, + transformation_function, ) from hsfs import ( serving_key as sk_mod, ) -from hsfs import ( - training_dataset_feature as tdf_mod, -) +from hsfs import training_dataset_feature as tdf_mod from hsfs.client import exceptions, online_store_rest_client from hsfs.core import ( online_store_rest_client_engine, online_store_sql_engine,
+) +from hsfs.core import ( transformation_function_engine as tf_engine_mod, - transformation_function ) HAS_FASTAVRO = False try: from fastavro import schemaless_reader + HAS_FASTAVRO = True except ImportError: from avro.io import BinaryDecoder @@ -106,8 +107,9 @@ def __init__( self._transformation_function_engine = ( tf_engine_mod.TransformationFunctionEngine(feature_store_id) ) - self._transformation_functions: List[transformation_function.TransformationFunction] = [] - + self._transformation_functions: List[ + transformation_function.TransformationFunction + ] = [] self._sql_client = None self._rest_client_engine = None @@ -429,7 +431,7 @@ def assemble_feature_vector( _logger.debug("Assembled and transformed dict feature vector: %s", result_dict) - return [result_dict.get(fname, None) for fname in self.transformed_feature_vector_col_name] + return [result_dict.get(fname, None) for fname in self.transformed_features] def handle_feature_vector_return_type( self, @@ -563,7 +565,7 @@ def get_inference_helpers( return self.handle_feature_vector_return_type( batch_results, batch=True, inference_helper=True, return_type=return_type ) - + def which_client_and_ensure_initialised( self, force_rest_client: bool, force_sql_client: bool ) -> str: @@ -626,7 +628,7 @@ def _set_default_client( self._init_sql_client = True def apply_transformation(self, row_dict: dict): - _logger.debug("Applying transformation functions to : %s", matching_keys) + _logger.debug("Applying transformation functions.") for tf in self.transformation_functions: features = [ pd.Series(row_dict[feature]) @@ -1060,7 +1062,7 @@ def default_client(self, default_client: Literal["rest", "sql"]): _logger.debug(f"Default Online Store Client is set to {default_client}.") self._default_client = default_client - def transformed_feature_vector_col_name(self): + def transformed_features(self): if self._transformed_feature_vector_col_name is None: transformation_features = [] output_column_names = [] From 3a01eadb0e4bad5257bc56c53f4ac1c7e467b0a9 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 27 May 2024 17:21:14 +0200 Subject: [PATCH 39/58] adding property transformed_features in feature view to obtain feature names after transfromations --- python/hsfs/core/vector_server.py | 7 +++++-- python/hsfs/feature_view.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 118ecca5e7..5a344db890 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -431,7 +431,10 @@ def assemble_feature_vector( _logger.debug("Assembled and transformed dict feature vector: %s", result_dict) - return [result_dict.get(fname, None) for fname in self.transformed_features] + return [ + result_dict.get(fname, None) + for fname in self.transformed_feature_vector_col_name + ] def handle_feature_vector_return_type( self, @@ -1062,7 +1065,7 @@ def default_client(self, default_client: Literal["rest", "sql"]): _logger.debug(f"Default Online Store Client is set to {default_client}.") self._default_client = default_client - def transformed_features(self): + def transformed_feature_vector_col_name(self): if self._transformed_feature_vector_col_name is None: transformation_features = [] output_column_names = [] diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index a8e51c3b69..9ca317a473 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -3709,3 +3709,18 @@ def serving_keys(self) -> 
List[skm.ServingKey]: @serving_keys.setter def serving_keys(self, serving_keys: List[skm.ServingKey]) -> None: self._serving_keys = serving_keys + + @property + def transformed_features(self) -> List[str]: + """Names of the features of a feature view after transformation functions have been applied""" + transformation_features = set() + transformed_column_names = [] + for tf in self.transformation_functions: + transformed_column_names.extend(tf.output_column_names) + transformation_features.update(tf.hopsworks_udf.transformation_features) + + return [ + feature.name + for feature in self.features + if feature.name not in transformation_features + ] + transformed_column_names From 2753ec47328d87d2fb1ec5f633d9cf342470eb12 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 28 May 2024 15:15:41 +0200 Subject: [PATCH 40/58] updating docstring and adding property decorator missed during rebase --- python/hsfs/core/vector_server.py | 1 + python/hsfs/hopsworks_udf.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 5a344db890..9a882523b6 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -1065,6 +1065,7 @@ def default_client(self, default_client: Literal["rest", "sql"]): _logger.debug(f"Default Online Store Client is set to {default_client}.") self._default_client = default_client + @property def transformed_feature_vector_col_name(self): if self._transformed_feature_vector_col_name is None: transformation_features = [] diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 328ac3c091..246483c9b4 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -35,9 +35,10 @@ def hopsworks_udf(output_type: Union[List[type], type]) -> "HopsworksUdf": Create an User Defined Function that can be and used within the Hopsworks Feature Store. Hopsworks UDF's are user defined functions that executes as 'pandas_udf' when executing - in spark engine and as pandas functions in the python engine. A Hopsworks udf is defined - using the `hopsworks_udf` decorator. The outputs of the defined UDF must be mentioned in the - decorator as a list of python types. + in spark engine and as pandas functions in the python engine. The pandas udf/pandas functions + get pandas Series as inputs and can provide a pandas Series or a pandas DataFrame as output. + A Hopsworks udf is defined using the `hopsworks_udf` decorator. The outputs of the defined UDF + must be mentioned in the decorator as a list of python types. !!!
example From 23c7b8a474968ac97977bb0c106c23c00ebacc78 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 7 Jun 2024 09:41:10 +0200 Subject: [PATCH 41/58] refactoring transformation functions to update parsing of statistics parameters and also renaming decorator name --- python/hsfs/builtin_transformations.py | 49 ++--- python/hsfs/engine/spark.py | 2 +- python/hsfs/feature_store.py | 4 +- python/hsfs/hopsworks_udf.py | 193 +++++++++++------- python/hsfs/transformation_function.py | 4 +- python/hsfs/transformation_statistics.py | 119 +++++++++++ python/tests/core/test_feature_view_engine.py | 8 +- .../core/test_training_dataset_engine.py | 4 +- .../test_transformation_function_engine.py | 29 +-- python/tests/engine/test_python.py | 15 +- ...t_python_spark_transformation_functions.py | 54 +++-- python/tests/engine/test_spark.py | 12 +- .../tests/fixtures/feature_view_fixtures.json | 14 +- .../transformation_function_fixtures.json | 29 +-- python/tests/test_feature_view.py | 8 +- .../transformation_test_helper.py | 45 ++-- python/tests/test_hopswork_udf.py | 162 ++++++--------- python/tests/test_transformation_function.py | 39 ++-- 18 files changed, 469 insertions(+), 321 deletions(-) create mode 100644 python/hsfs/transformation_statistics.py diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py index d17126bd44..421a04cffe 100644 --- a/python/hsfs/builtin_transformations.py +++ b/python/hsfs/builtin_transformations.py @@ -16,39 +16,36 @@ import numpy as np import pandas as pd -from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf +from hsfs.transformation_statistics import TransformationStatistics -@hopsworks_udf(float) -def min_max_scaler(feature: pd.Series, statistics_feature) -> pd.Series: - return (feature - statistics_feature.min) / ( - statistics_feature.max - statistics_feature.min +feature_statistics = TransformationStatistics("feature") + + +@udf(float) +def min_max_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series: + return (feature - statistics.feature.min) / ( + statistics.feature.max - statistics.feature.min ) -@hopsworks_udf(float) -def standard_scaler( - feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics -) -> pd.Series: - return (feature - statistics_feature.mean) / statistics_feature.stddev +@udf(float) +def standard_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series: + return (feature - statistics.feature.mean) / statistics.feature.stddev -@hopsworks_udf(float) -def robust_scaler( - feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics -) -> pd.Series: - return (feature - statistics_feature.percentiles[49]) / ( - statistics_feature.percentiles[74] - statistics_feature.percentiles[24] +@udf(float) +def robust_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series: + return (feature - statistics.feature.percentiles[49]) / ( + statistics.feature.percentiles[74] - statistics.feature.percentiles[24] ) -@hopsworks_udf(int) -def label_encoder( - feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics -) -> pd.Series: +@udf(int) +def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series: unique_data = [ - value for value in statistics_feature.extended_statistics["unique_values"] + value for value in statistics.feature.extended_statistics["unique_values"] ] value_to_index = {value: index for index, value in 
enumerate(unique_data)} return pd.Series( @@ -56,12 +53,10 @@ def label_encoder( ) -@hopsworks_udf(bool) -def one_hot_encoder( - feature: pd.Series, statistics_feature: FeatureDescriptiveStatistics -) -> pd.Series: +@udf(bool) +def one_hot_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series: unique_data = [ - value for value in statistics_feature.extended_statistics["unique_values"] + value for value in statistics.feature.extended_statistics["unique_values"] ] one_hot = pd.get_dummies(feature, dtype="bool") for data in unique_data: diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index c462efa641..a22be38cc0 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -1269,7 +1269,7 @@ def _apply_transformation_function( output_col_names.append(output_col_name) transformation_features.append(hopsworks_udf.transformation_features) - if len(hopsworks_udf.output_types) > 1: + if len(hopsworks_udf.return_types) > 1: explode_name.append(f"{output_col_name}.*") else: explode_name.append(output_col_name) diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index 5400d5d08d..11eeac1983 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -1286,7 +1286,7 @@ def create_transformation_function( !!! example ```python # define the transformation function as a Hopsworks's UDF - @hopsworks_udf(int) + @udf(int) def plus_one(value): return value + 1 @@ -1464,7 +1464,7 @@ def create_feature_view( query = fg1.select_all().join(fg2.select_all()) # define the transformation function as a Hopsworks's UDF - @hopsworks_udf(int) + @udf(int) def plus_one(value): return value + 1 diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 246483c9b4..e287089545 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -28,9 +28,10 @@ from hsfs.client.exceptions import FeatureStoreException from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics from hsfs.decorators import typechecked +from hsfs.transformation_statistics import TransformationStatistics -def hopsworks_udf(output_type: Union[List[type], type]) -> "HopsworksUdf": +def udf(return_type: Union[List[type], type]) -> "HopsworksUdf": """ Create an User Defined Function that can be and used within the Hopsworks Feature Store. @@ -43,15 +44,15 @@ def hopsworks_udf(output_type: Union[List[type], type]) -> "HopsworksUdf": !!! example ```python - from hsfs.hopsworks_udf import hopsworks_udf + from hsfs.hopsworks_udf import udf - @hopsworks_udf(float) + @udf(float) def add_one(data1 : pd.Series): return data1 + 1 ``` # Arguments - output_type: `list`. The output types of the defined UDF + return_type: `list`. The output types of the defined UDF # Returns `HopsworksUdf`: The metadata object for hopsworks UDF's. 
@@ -61,7 +62,7 @@ def add_one(data1 : pd.Series): """ def wrapper(func: Callable) -> HopsworksUdf: - udf = HopsworksUdf(func=func, output_types=output_type) + udf = HopsworksUdf(func=func, return_types=return_type) return udf return wrapper @@ -123,12 +124,12 @@ class HopsworksUdf: def __init__( self, func: Union[Callable, str], - output_types: Union[List[type], type, List[str], str], + return_types: Union[List[type], type, List[str], str], name: Optional[str] = None, transformation_features: Optional[List[TransformationFeature]] = None, ): - self._output_types: List[str] = HopsworksUdf._validate_and_convert_output_types( - output_types + self._return_types: List[str] = HopsworksUdf._validate_and_convert_output_types( + return_types ) self._function_name: str = func.__name__ if name is None else name @@ -138,20 +139,20 @@ def __init__( if isinstance(func, Callable) else func ) - - self._transformation_features: List[TransformationFeature] = ( - HopsworksUdf._extract_function_arguments(self._function_source) - if not transformation_features - else transformation_features - ) + if not transformation_features: + self._transformation_features: List[TransformationFeature] = ( + HopsworksUdf._extract_function_arguments(func) + if not transformation_features + else transformation_features + ) + else: + self._transformation_features = transformation_features self._formatted_function_source, self._module_imports = ( - HopsworksUdf._format_source_code( - self._function_source, self._transformation_features - ) + HopsworksUdf._format_source_code(self._function_source) ) - self._statistics: Optional[Dict[str, FeatureDescriptiveStatistics]] = None + self._statistics: Optional[TransformationStatistics] = None self._output_column_names: List[str] = self._get_output_column_names() @@ -279,63 +280,67 @@ def _parse_function_signature(source_code: str) -> Tuple[List[str], str, int, in ] ) arg_list = signature.split("(")[1].split(")")[0].split(",") - arg_list = [arg for arg in arg_list if not arg.strip() == ""] + arg_list = [arg.split(":")[0].split("=")[0].strip() for arg in arg_list] + if "statistics" in arg_list: + arg_list.remove("statistics") return arg_list, signature, signature_start_line, signature_end_line @staticmethod - def _extract_function_arguments(source_code: str) -> List[TransformationFeature]: + def _extract_function_arguments(function: Callable) -> List[TransformationFeature]: """ Function to extract the argument names from a provided function source code. # Arguments - source_code: `str`. Source code of a function. + source_code: `Callable`. The function for which the value are to be extracted. # Returns `List[TransformationFeature]`: List of TransformationFeature that provide a mapping from feature names to corresponding statistics parameters if any is present. """ - # Get source code of the original function - arg_list, _, _, _ = HopsworksUdf._parse_function_signature(source_code) - - if arg_list == []: + arg_list = [] + statistics = None + signature = inspect.signature(function).parameters + if not signature: raise FeatureStoreException( "No arguments present in the provided user defined function. Please provide at least one argument in the defined user defined function." 
) + for arg in inspect.signature(function).parameters.values(): + if arg.name == "statistics": + statistics = arg.default + else: + arg_list.append(arg.name) - arg_list = [arg.split(":")[0].strip() for arg in arg_list] - - for arg in arg_list: - if arg.startswith("statistics"): - if arg.split("statistics_")[1] not in arg_list: - raise FeatureStoreException( - f"No argument corresponding to statistics parameter '{arg}' present in function definition." - ) - - return [ - TransformationFeature( - arg, f"statistics_{arg}" if f"statistics_{arg}" in arg_list else None - ) - for arg in arg_list - if not arg.startswith("statistics") - ] + if statistics: + missing_statistic_features = [ + statistic_feature + for statistic_feature in statistics._features + if statistic_feature not in arg_list + ] + if missing_statistic_features: + missing_statistic_features = "', '".join(missing_statistic_features) + raise FeatureStoreException( + f"No argument corresponding to statistics parameter '{missing_statistic_features}' present in function definition." + ) + return [ + TransformationFeature(arg, arg if arg in statistics._features else None) + for arg in arg_list + ] + else: + return [TransformationFeature(arg, None) for arg in arg_list] @staticmethod - def _format_source_code( - source_code: str, transformation_features: List[TransformationFeature] - ) -> Tuple[str, str]: + def _format_source_code(source_code: str) -> Tuple[str, str]: """ Function that parses the existing source code to remove statistics parameter and remove all decorators and type hints from the function source code. # Arguments source_code: `str`. Source code of a function. - transformation_features `List[TransformationFeature]`: List of transformation features provided in the function argument. # Returns `Tuple[str, str]`: Tuple that contains Source code that does not contain any decorators, type hints or statistics parameters and the module imports """ - _, signature, _, signature_end_line = HopsworksUdf._parse_function_signature( - source_code + arg_list, signature, _, signature_end_line = ( + HopsworksUdf._parse_function_signature(source_code) ) module_imports = source_code.split("@")[0] - arg_list = [feature.feature_name for feature in transformation_features] # Reconstruct the function signature new_signature = ( @@ -359,8 +364,8 @@ def _get_output_column_names(self) -> str: _BASE_COLUMN_NAME = ( f'{self.function_name}_{"-".join(self.transformation_features)}_' ) - if len(self.output_types) > 1: - return [f"{_BASE_COLUMN_NAME}{i}" for i in range(len(self.output_types))] + if len(self.return_types) > 1: + return [f"{_BASE_COLUMN_NAME}{i}" for i in range(len(self.return_types))] else: return [f"{_BASE_COLUMN_NAME}"] @@ -371,15 +376,15 @@ def _create_pandas_udf_return_schema_from_list(self) -> str: # Returns `str`: DDL-formatted type string that denotes the return types of the user defined function. 
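+ For example, a UDF named add_one over the feature "col1" with return types [float, float] would produce "`add_one_col1_0` double, `add_one_col1_1` double".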
""" - if len(self.output_types) > 1: + if len(self.return_types) > 1: return ", ".join( [ - f"`{self.output_column_names[i]}` {self.output_types[i]}" - for i in range(len(self.output_types)) + f"`{self.output_column_names[i]}` {self.return_types[i]}" + for i in range(len(self.return_types)) ] ) else: - return self.output_types[0] + return self.return_types[0] def hopsworksUdf_wrapper(self) -> Callable: """ @@ -405,7 +410,7 @@ def hopsworksUdf_wrapper(self) -> Callable: return date_time_col.dt.tz_localize(None).dt.tz_localize(str(current_timezone))""" # Defining wrapper function that renames the column names to specific names - if len(self.output_types) > 1: + if len(self.return_types) > 1: code = ( self._module_imports + "\n" @@ -440,7 +445,7 @@ def renaming_wrapper(*args): # Shallow copy of scope performed because updating statistics argument of scope must not affect other instances. scope = __import__("__main__").__dict__.copy() if self.transformation_statistics is not None: - scope.update(self.transformation_statistics) + scope.update({"statistics": self.transformation_statistics}) scope.update({"_output_col_names": self.output_column_names}) # executing code exec(code, scope) @@ -485,8 +490,8 @@ def __call__(self, *features: List[str]) -> "HopsworksUdf": return udf def update_return_type_one_hot(self): - self._output_types = [ - self._output_types[0] + self._return_types = [ + self._return_types[0] for _ in range( len( self.transformation_statistics[ @@ -530,8 +535,11 @@ def to_dict(self) -> Dict[str, Any]: """ return { "sourceCode": self._function_source, - "outputTypes": self.output_types, + "outputTypes": self.return_types, "transformationFeatures": self.transformation_features, + "statisticsArgumentNames": self._statistics_argument_names + if self.statistics_required + else None, "name": self._function_name, } @@ -568,24 +576,51 @@ def from_response_json( feature.strip() for feature in json_decamelized["transformation_features"].split(",") ] + statistics_features = ( + [ + feature.strip() + for feature in json_decamelized["statistics_argument_names"].split(",") + ] + if "statistics_argument_names" in json_decamelized + else None + ) + + # Reconstructing statistics arguments. + arg_list, _, _, _ = HopsworksUdf._parse_function_signature(function_source_code) + + if statistics_features: + transformation_features = [ + TransformationFeature( + transformation_features[arg_index], + arg_list[arg_index] + if arg_list[arg_index] in statistics_features + else None, + ) + for arg_index in range(len(arg_list)) + ] + else: + transformation_features = [ + TransformationFeature(transformation_features[arg_index], None) + for arg_index in range(len(arg_list)) + ] hopsworks_udf = cls( - func=function_source_code, output_types=output_types, name=function_name + func=function_source_code, + return_types=output_types, + name=function_name, + transformation_features=transformation_features, ) # Set transformation features if already set. 
- if "" not in transformation_features: - return hopsworks_udf(*transformation_features) - else: - return hopsworks_udf + return hopsworks_udf @property - def output_types(self) -> List[str]: + def return_types(self) -> List[str]: """Get the output types of the UDF""" # Update the number of outputs for one hot encoder to match the number of unique values for the feature if self.function_name == "one_hot_encoder" and self.transformation_statistics: self.update_return_type_one_hot() - return self._output_types + return self._return_types @property def function_name(self) -> str: @@ -600,7 +635,7 @@ def statistics_required(self) -> bool: @property def transformation_statistics( self, - ) -> Optional[Dict[str, FeatureDescriptiveStatistics]]: + ) -> Optional[TransformationStatistics]: """Feature statistics required for the defined UDF""" return self._statistics @@ -640,24 +675,34 @@ def _statistics_argument_mapping(self) -> Dict[str, str]: for transformation_feature in self._transformation_features } + @property + def _statistics_argument_names(self) -> List[str]: + """ + list of argument names required for statistics + """ + return [ + transformation_feature.statistic_argument_name + for transformation_feature in self._transformation_features + if transformation_feature.statistic_argument_name is not None + ] + @transformation_statistics.setter def transformation_statistics( self, statistics: List[FeatureDescriptiveStatistics] ) -> None: - self._statistics = dict() + self._statistics = TransformationStatistics(*self._statistics_argument_names) for stat in statistics: - if stat.feature_name in self._statistics_argument_mapping.keys(): - self._statistics[ - self._statistics_argument_mapping[stat.feature_name] - ] = stat + self._statistics.set_statistics( + self._statistics_argument_mapping[stat.feature_name], stat.to_dict() + ) @output_column_names.setter def output_column_names(self, output_col_names: Union[str, List[str]]) -> None: if not isinstance(output_col_names, List): output_col_names = [output_col_names] - if len(output_col_names) != len(self.output_types): + if len(output_col_names) != len(self.return_types): raise FeatureStoreException( - f"Provided names for output columns does not match the number of columns returned from the UDF. Please provide {len(self.output_types)} names." + f"Provided names for output columns does not match the number of columns returned from the UDF. Please provide {len(self.return_types)} names." 
) else: self._output_column_names = output_col_names diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index 3267b4d14a..a3f6a295d7 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -74,7 +74,7 @@ def save(self) -> None: # import hopsworks udf decorator from hsfs.hopsworks_udf import HopsworksUdf # define function - @hopsworks_udf(int) + @udf(int) def plus_one(value): return value + 1 @@ -98,7 +98,7 @@ def delete(self) -> None: # import hopsworks udf decorator from hsfs.hopsworks_udf import HopsworksUdf # define function - @hopsworks_udf(int) + @udf(int) def plus_one(value): return value + 1 diff --git a/python/hsfs/transformation_statistics.py b/python/hsfs/transformation_statistics.py new file mode 100644 index 0000000000..f4b6b1c0e5 --- /dev/null +++ b/python/hsfs/transformation_statistics.py @@ -0,0 +1,119 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Dict, Mapping, Optional, Union + +import humps + + +@dataclass +class FeatureTransformationStatistics: + """ + Data class that contains all the statistics parameters that can be used for transformations. 
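+ Instances are created with only the feature name set and are later populated from computed descriptive statistics via TransformationStatistics.set_statistics.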
+ """ + + feature_name: str + count: int = None + # for any feature type + completeness: Optional[float] = None + num_non_null_values: Optional[int] = None + num_null_values: Optional[int] = None + approx_num_distinct_values: Optional[int] = None + # for numerical features + min: Optional[float] = None + max: Optional[float] = None + sum: Optional[float] = None + mean: Optional[float] = None + stddev: Optional[float] = None + percentiles: Optional[Mapping[str, float]] = None + # with exact uniqueness + distinctness: Optional[float] = None + entropy: Optional[float] = None + uniqueness: Optional[float] = None + exact_num_distinct_values: Optional[int] = None + extended_statistics: Optional[Union[dict, str]] = None + + def __init__( + self, + feature_name: str, + count: int = None, + completeness: Optional[float] = None, + num_non_null_values: Optional[int] = None, + num_null_values: Optional[int] = None, + approx_num_distinct_values: Optional[int] = None, + min: Optional[float] = None, + max: Optional[float] = None, + sum: Optional[float] = None, + mean: Optional[float] = None, + stddev: Optional[float] = None, + percentiles: Optional[Mapping[str, float]] = None, + distinctness: Optional[float] = None, + entropy: Optional[float] = None, + uniqueness: Optional[float] = None, + exact_num_distinct_values: Optional[int] = None, + extended_statistics: Optional[Union[dict, str]] = None, + **kwargs, + ): + self.feature_name = feature_name + self.count = count + self.completeness = completeness + self.num_non_null_values = num_non_null_values + self.num_null_values = num_null_values + self.approx_num_distinct_values = approx_num_distinct_values + self.min = min + self.max = max + self.sum = sum + self.mean = mean + self.stddev = stddev + self.percentiles = percentiles + self.distinctness = distinctness + self.entropy = entropy + self.uniqueness = uniqueness + self.exact_num_distinct_values = exact_num_distinct_values + self.extended_statistics = extended_statistics + + @classmethod + def from_response_json( + cls: FeatureTransformationStatistics, json_dict: Dict[str, Any] + ): + json_decamelized = humps.decamelize(json_dict) + return cls(**json_decamelized) + + +class TransformationStatistics: + """ + Class that stores statistics of all features required for a transformation function. 
+ """ + + def __init__(self, *features): + self._features = features + self.__dict__.update( + {feature: self.init_statistics(feature) for feature in features} + ) + + def init_statistics(self, feature_name): + return FeatureTransformationStatistics(feature_name=feature_name) + + def set_statistics(self, feature_name, statistics: Dict[str, Any]): + self.__dict__[feature_name] = ( + FeatureTransformationStatistics.from_response_json(statistics) + ) + + def __repr__(self) -> str: + return ",\n ".join([repr(self.__dict__[feature]) for feature in self._features]) diff --git a/python/tests/core/test_feature_view_engine.py b/python/tests/core/test_feature_view_engine.py index b1fb7ee08a..f6a141fb20 100644 --- a/python/tests/core/test_feature_view_engine.py +++ b/python/tests/core/test_feature_view_engine.py @@ -29,7 +29,7 @@ from hsfs.constructor.query import Query from hsfs.core import arrow_flight_client, feature_view_engine from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf from hsfs.storage_connector import BigQueryConnector, StorageConnector from hsfs.transformation_function import TransformationFunction @@ -565,7 +565,7 @@ def test_get_attached_transformation_fn(self, mocker): feature_store_id=feature_store_id ) - @hopsworks_udf(int) + @udf(int) def test2(col1): return col1 + 1 @@ -593,7 +593,7 @@ def test_get_attached_transformation_fn_multiple(self, mocker): feature_store_id=feature_store_id ) - @hopsworks_udf(int) + @udf(int) def test1(col1): return col1 + 1 @@ -602,7 +602,7 @@ def test1(col1): hopsworks_udf=test1, ) - @hopsworks_udf(int) + @udf(int) def test2(col1): return col1 + 2 diff --git a/python/tests/core/test_training_dataset_engine.py b/python/tests/core/test_training_dataset_engine.py index 5e77445971..fea3d43f88 100644 --- a/python/tests/core/test_training_dataset_engine.py +++ b/python/tests/core/test_training_dataset_engine.py @@ -23,7 +23,7 @@ ) from hsfs.constructor import query from hsfs.core import training_dataset_engine -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf class TestTrainingDatasetEngine: @@ -118,7 +118,7 @@ def test_save_transformation_functions(self, mocker): mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") - @hopsworks_udf(int) + @udf(int) def plus_one(a): return a + 1 diff --git a/python/tests/core/test_transformation_function_engine.py b/python/tests/core/test_transformation_function_engine.py index 51dd623ef1..11cd593cc3 100644 --- a/python/tests/core/test_transformation_function_engine.py +++ b/python/tests/core/test_transformation_function_engine.py @@ -24,7 +24,7 @@ transformation_function, ) from hsfs.core import transformation_function_engine -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf fg1 = feature_group.FeatureGroup( @@ -91,7 +91,7 @@ def test_save(self, mocker): feature_store_id ) - @hopsworks_udf(int) + @udf(int) def testFunction(col1): return col1 + 1 @@ -118,7 +118,7 @@ def test_get_transformation_fn(self, mocker): feature_store_id ) - @hopsworks_udf(int) + @udf(int) def testFunction1(col1): return col1 + 1 @@ -127,7 +127,7 @@ def testFunction1(col1): hopsworks_udf=testFunction1, ) - @hopsworks_udf(float) + @udf(float) def testFunction2(data2, statistics_data2): return data2 + 1 @@ -159,7 +159,7 @@ def test_get_transformation_fns(self, mocker): feature_store_id 
) - @hopsworks_udf(int) + @udf(int) def testFunction1(col1): return col1 + 1 @@ -168,7 +168,7 @@ def testFunction1(col1): hopsworks_udf=testFunction1, ) - @hopsworks_udf(float) + @udf(float) def testFunction2(data2, statistics_data2): return data2 + 1 @@ -200,7 +200,7 @@ def test_delete(self, mocker): feature_store_id ) - @hopsworks_udf(int) + @udf(int) def testFunction1(col1): return col1 + 1 @@ -259,7 +259,7 @@ def test_compute_and_set_feature_statistics_no_split(self, mocker): feature_store_id ) - @hopsworks_udf(int) + @udf(int) def testFunction1(col1): return col1 + 1 @@ -318,7 +318,7 @@ def test_compute_and_set_feature_statistics_train_test_split(self, mocker): feature_store_id ) - @hopsworks_udf(int) + @udf(int) def testFunction1(col1): return col1 + 1 @@ -376,7 +376,7 @@ def test_get_and_set_feature_statistics_no_statistics_required(self, mocker): feature_store_id ) - @hopsworks_udf(int) + @udf(int) def testFunction1(col1): return col1 + 1 @@ -428,10 +428,13 @@ def test_get_and_set_feature_statistics_statistics_required(self, mocker): tf_engine = transformation_function_engine.TransformationFunctionEngine( feature_store_id ) + from hsfs.transformation_statistics import TransformationStatistics - @hopsworks_udf(int) - def testFunction1(col1, statistics_col1): - return col1 + statistics_col1.mean + stats = TransformationStatistics("col1") + + @udf(int) + def testFunction1(col1, statistics=stats): + return col1 + statistics.col1.mean tf1 = transformation_function.TransformationFunction( feature_store_id, diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index 4b883f8ed2..4796ad2cfe 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -36,7 +36,7 @@ from hsfs.constructor.hudi_feature_group_alias import HudiFeatureGroupAlias from hsfs.core import inode, job from hsfs.engine import python -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf from hsfs.training_dataset_feature import TrainingDatasetFeature from polars.testing import assert_frame_equal as polars_assert_frame_equal @@ -3240,7 +3240,7 @@ def test_apply_transformation_function_pandas(self, mocker): engine._engine_type = "python" python_engine = python.Engine() - @hopsworks_udf(int) + @udf(int) def plus_one(col1): return col1 + 1 @@ -3280,7 +3280,7 @@ def test_apply_transformation_function_multiple_output(self, mocker): engine._engine_type = "python" python_engine = python.Engine() - @hopsworks_udf([int, int]) + @udf([int, int]) def plus_two(col1): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col1 + 2}) @@ -3324,7 +3324,7 @@ def test_apply_transformation_function_multiple_input_output(self, mocker): engine._engine_type = "python" python_engine = python.Engine() - @hopsworks_udf([int, int]) + @udf([int, int]) def plus_two(col1, col2): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) @@ -3368,7 +3368,7 @@ def test_apply_transformation_function_polars(self, mocker): engine._engine_type = "python" python_engine = python.Engine() - @hopsworks_udf(int) + @udf(int) def plus_one(col1): return col1 + 1 @@ -3896,7 +3896,10 @@ def test_materialization_kafka_skip_offsets(self, mocker): python_engine._write_dataframe_kafka( feature_group=fg, dataframe=df, - offline_write_options={"start_offline_materialization": True, "skip_offsets": True}, + offline_write_options={ + "start_offline_materialization": True, + "skip_offsets": True, + }, ) # Assert diff --git 
a/python/tests/engine/test_python_spark_transformation_functions.py b/python/tests/engine/test_python_spark_transformation_functions.py index 4929312bec..cf0d529611 100644 --- a/python/tests/engine/test_python_spark_transformation_functions.py +++ b/python/tests/engine/test_python_spark_transformation_functions.py @@ -31,7 +31,7 @@ from hsfs.client.exceptions import FeatureStoreException from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics from hsfs.engine import python, spark -from hsfs.hopsworks_udf import HopsworksUdf, hopsworks_udf +from hsfs.hopsworks_udf import HopsworksUdf, udf from pyspark.sql.types import ( BooleanType, DateType, @@ -148,15 +148,18 @@ def test_apply_builtin_minmax_from_backend(self, mocker): # Arrange tf_fun_source = ( - "import pandas as pd\nfrom hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics\n" - "from hsfs.hopsworks_udf import hopsworks_udf\n" - "@hopsworks_udf(float)\ndef min_max_scaler(feature : pd.Series, statistics_feature : FeatureDescriptiveStatistics) -> pd.Series:\n" - " return (feature - statistics_feature.min)/(statistics_feature.max-statistics_feature.min)\n" + "import numpy as np\nimport pandas as pd\nfrom hsfs.transformation_statistics import TransformationStatistics\n" + "from hsfs.hopsworks_udf import udf\n" + 'feature_statistics = TransformationStatistics("feature")\n' + "@udf(float)\n" + "def min_max_scaler(feature: pd.Series, statistics = feature_statistics) -> pd.Series:\n" + " return (feature - statistics.feature.min) / (statistics.feature.max - statistics.feature.min)" ) udf_response = { "sourceCode": tf_fun_source, "outputTypes": "double", "transformationFeatures": "", + "statisticsArgumentNames": "feature", "name": "min_max_scaler", } @@ -283,15 +286,18 @@ def test_apply_builtin_standard_scaler_from_backend(self, mocker): # Arrange tf_fun_source = ( - "import pandas as pd\nfrom hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics\n" - "from hsfs.hopsworks_udf import hopsworks_udf\n" - "@hopsworks_udf(float)\ndef standard_scaler(feature : pd.Series, statistics_feature : FeatureDescriptiveStatistics) -> pd.Series:\n" - " return (feature - statistics_feature.mean)/statistics_feature.stddev\n" + "import numpy as np\nimport pandas as pd\nfrom hsfs.transformation_statistics import TransformationStatistics\n" + "from hsfs.hopsworks_udf import udf\n" + 'feature_statistics = TransformationStatistics("feature")\n' + "@udf(float)\n" + "def standard_scaler(feature: pd.Series, statistics = feature_statistics) -> pd.Series:\n" + " return (feature - statistics.feature.mean) / statistics.feature.stddev" ) udf_response = { "sourceCode": tf_fun_source, "outputTypes": "double", "transformationFeatures": "", + "statisticsArgumentNames": "feature", "name": "standard_scaler", } @@ -421,15 +427,19 @@ def test_apply_builtin_robust_scaler_from_backend(self, mocker): # Arrange tf_fun_source = ( - "import pandas as pd\nfrom hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics\n" - "from hsfs.hopsworks_udf import hopsworks_udf\n" - "@hopsworks_udf(float)\ndef robust_scaler(feature : pd.Series, statistics_feature : FeatureDescriptiveStatistics) -> pd.Series:\n" - " return (feature - statistics_feature.percentiles[49])/(statistics_feature.percentiles[74]-statistics_feature.percentiles[24])\n" + "import numpy as np\nimport pandas as pd\nfrom hsfs.transformation_statistics import TransformationStatistics\n" + "from hsfs.hopsworks_udf import udf\n" + 
'feature_statistics = TransformationStatistics("feature")\n' + "@udf(float)\n" + "def robust_scaler(feature: pd.Series, statistics = feature_statistics) -> pd.Series:\n" + " return (feature - statistics.feature.percentiles[49]) / (statistics.feature.percentiles[74] - " + "statistics.feature.percentiles[24])" ) udf_response = { "sourceCode": tf_fun_source, "outputTypes": "double", "transformationFeatures": "", + "statisticsArgumentNames": "feature", "name": "robust_scaler", } @@ -561,7 +571,7 @@ def test_apply_plus_one_int(self, mocker): ) # Arrange - @hopsworks_udf(int) + @udf(int) def tf_fun(col_0): return col_0 + 1 @@ -619,7 +629,7 @@ def test_apply_plus_one_str(self, mocker): ) # Arrange - @hopsworks_udf(str) + @udf(str) def tf_fun(col_0): return col_0 + "1" @@ -676,7 +686,7 @@ def test_apply_plus_one_double(self, mocker): spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) # Arrange - @hopsworks_udf(float) + @udf(float) def tf_fun(col_0): return col_0 + 1.0 @@ -748,7 +758,7 @@ def test_apply_plus_one_datetime_no_tz(self, mocker): ) # Arrange - @hopsworks_udf(datetime.datetime) + @udf(datetime.datetime) def tf_fun(col_0): import datetime @@ -823,7 +833,7 @@ def test_apply_plus_one_datetime_tz_utc(self, mocker): ) # Arrange - @hopsworks_udf(datetime.datetime) + @udf(datetime.datetime) def tf_fun(col_0) -> datetime.datetime: import datetime @@ -901,7 +911,7 @@ def test_apply_plus_one_datetime_tz_pst(self, mocker): ) # Arrange - @hopsworks_udf(datetime.datetime) + @udf(datetime.datetime) def tf_fun(col_0) -> datetime.datetime: import datetime @@ -979,7 +989,7 @@ def test_apply_plus_one_datetime_ts_none(self, mocker): ) # Arrange - @hopsworks_udf(datetime.datetime) + @udf(datetime.datetime) def tf_fun(col_0) -> datetime.datetime: import datetime @@ -1053,7 +1063,7 @@ def test_apply_plus_one_date(self, mocker): ) # Arrange - @hopsworks_udf(datetime.date) + @udf(datetime.date) def tf_fun(col_0): import datetime @@ -1079,7 +1089,7 @@ def test_apply_plus_one_invalid_type(self, mocker): # Arrange with pytest.raises(FeatureStoreException) as e_info: - @hopsworks_udf(list) + @udf(list) def tf_fun(a): return a + 1 diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 09300059f3..42e0abe4e6 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -34,7 +34,7 @@ from hsfs.constructor import hudi_feature_group_alias, query from hsfs.core import training_dataset_engine from hsfs.engine import spark -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf from hsfs.training_dataset_feature import TrainingDatasetFeature from pyspark.sql import DataFrame from pyspark.sql.types import ( @@ -2668,7 +2668,7 @@ def test_write_training_dataset_splits(self, mocker): spark_engine = spark.Engine() - @hopsworks_udf(int) + @udf(int) def plus_one(col1): return col1 + 1 @@ -2717,7 +2717,7 @@ def test_write_training_dataset_splits_to_df(self, mocker): spark_engine = spark.Engine() - @hopsworks_udf(int) + @udf(int) def plus_one(col1): return col1 + 1 @@ -4328,7 +4328,7 @@ def test_apply_transformation_function_single_output(self, mocker): engine._engine_type = "spark" spark_engine = spark.Engine() - @hopsworks_udf(int) + @udf(int) def plus_one(col1): return col1 + 1 @@ -4388,7 +4388,7 @@ def test_apply_transformation_function_multiple_output(self, mocker): engine._engine_type = "spark" spark_engine = spark.Engine() - @hopsworks_udf([int, int]) + @udf([int, int]) def plus_two(col1): return 
pd.DataFrame({"new_col1": col1 + 1, "new_col2": col1 + 2}) @@ -4449,7 +4449,7 @@ def test_apply_transformation_function_multiple_input_output(self, mocker): engine._engine_type = "spark" spark_engine = spark.Engine() - @hopsworks_udf([int, int]) + @udf([int, int]) def test(col1, col2): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index da5c7766ed..a0a9f6864d 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -690,10 +690,11 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", "outputTypes":"double", - "transformationFeatures":"data" + "transformationFeatures":"data", + "statisticsArgumentNames":"data1" } }, { @@ -701,7 +702,7 @@ "version": 1, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", "outputTypes":"double", "transformationFeatures":"col1" @@ -929,10 +930,11 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", "outputTypes":"double", - "transformationFeatures":"data" + "transformationFeatures":"data", + "statisticsArgumentNames":"data1" } }, { @@ -940,7 +942,7 @@ "version": 1, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", "outputTypes":"double", "transformationFeatures":"col1" diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 169d779bd6..96fac98fc8 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ b/python/tests/fixtures/transformation_function_fixtures.json @@ -5,7 +5,7 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", "outputTypes":"double", "transformationFeatures":"col1" @@ -18,10 +18,11 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", "outputTypes":"double", - "transformationFeatures":"data" + "transformationFeatures":"data", + "statisticsArgumentNames":"data1" } } }, @@ -31,10 +32,11 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(str)\ndef 
test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return data1 + statistics_data1.mean\n", + "sourceCode": "\n@udf(str)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "test_func", "outputTypes":"string", - "transformationFeatures":"feature1, feature2, feature3" + "transformationFeatures":"feature1, feature2, feature3", + "statisticsArgumentNames":"data1, data2" } } }, @@ -44,10 +46,11 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(str, float)\ndef test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n", + "sourceCode": "\n@udf(str, float)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n", "name": "test_func", "outputTypes":"string, double", - "transformationFeatures":"feature1, feature2, feature3" + "transformationFeatures":"feature1, feature2, feature3", + "statisticsArgumentNames":"data1, data2" } } }, @@ -60,10 +63,11 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", "outputTypes":"double", - "transformationFeatures":"data" + "transformationFeatures":"data", + "statisticsArgumentNames":"data1" } }, { @@ -71,7 +75,7 @@ "version": 1, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", "outputTypes":"double", "transformationFeatures":"col1" @@ -89,10 +93,11 @@ "version": 2, "featurestoreId": 11, "hopsworksUdf":{ - "sourceCode": "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n", + "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", "outputTypes":"double", - "transformationFeatures":"data" + "transformationFeatures":"data", + "statisticsArgumentNames":"data1" } } ] diff --git a/python/tests/test_feature_view.py b/python/tests/test_feature_view.py index e8e36c0f1e..a45093126b 100644 --- a/python/tests/test_feature_view.py +++ b/python/tests/test_feature_view.py @@ -18,7 +18,7 @@ from hsfs import feature_view, training_dataset_feature from hsfs.constructor import fs_query, query from hsfs.feature_store import FeatureStore -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf class TestFeatureView: @@ -100,11 +100,11 @@ def test_from_response_json_transformation_function(self, mocker, backend_fixtur ) assert ( fv.transformation_functions[0].hopsworks_udf._function_source - == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" + == "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n" ) assert ( fv.transformation_functions[1].hopsworks_udf._function_source - == "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" + == "\n@udf(float)\ndef add_one_fs(data1 : 
pd.Series):\n return data1 + 1\n" ) assert len(fv.schema) == 2 assert isinstance(fv.schema[0], training_dataset_feature.TrainingDatasetFeature) @@ -144,7 +144,7 @@ def test_transformation_function_instances(self, mocker, backend_fixtures): # Act q = fs_query.FsQuery.from_response_json(json) - @hopsworks_udf(int) + @udf(int) def test(col1): return col1 + 1 diff --git a/python/tests/test_helpers/transformation_test_helper.py b/python/tests/test_helpers/transformation_test_helper.py index 8b81c48fde..2a502692a1 100644 --- a/python/tests/test_helpers/transformation_test_helper.py +++ b/python/tests/test_helpers/transformation_test_helper.py @@ -1,5 +1,11 @@ import pandas as pd -from hsfs.statistics import FeatureDescriptiveStatistics +from hsfs.transformation_statistics import TransformationStatistics + + +stats_arg1 = TransformationStatistics("arg1") +stats_arg1_arg3 = TransformationStatistics("arg1", "arg3") +stats_arg1_arg2 = TransformationStatistics("arg1", "arg2") +stats_arg3 = TransformationStatistics("arg3") def test_function(): @@ -10,7 +16,7 @@ def test_function_one_argument(arg1): pass -def test_function_one_argument_with_statistics(arg1, statistics_arg1): +def test_function_one_argument_with_statistics(arg1, statistics=stats_arg1): pass @@ -19,7 +25,7 @@ def test_function_one_argument_with_typehints(arg1: pd.Series): def test_function_one_argument_with_statistics_and_typehints( - arg1: pd.Series, statistics_arg1: FeatureDescriptiveStatistics + arg1: pd.Series, statistics=stats_arg1 ): pass @@ -29,7 +35,7 @@ def test_function_multiple_argument(arg1, arg2): def test_function_multiple_argument_with_statistics( - arg1, arg2, arg3, statistics_arg1, statistics_arg3 + arg1, arg2, arg3, statistics=stats_arg1_arg3 ): pass @@ -39,39 +45,25 @@ def test_function_multiple_argument_with_typehints(arg1: pd.Series, arg2: pd.Ser def test_function_multiple_argument_with_statistics_and_typehints( - arg1: pd.Series, - arg2: pd.Series, - statistics_arg1: FeatureDescriptiveStatistics, - statistics_arg2: FeatureDescriptiveStatistics, + arg1: pd.Series, arg2: pd.Series, statistics=stats_arg1_arg2 ): pass def test_function_multiple_argument_with_mixed_statistics_and_typehints( - arg1: pd.Series, - arg2, - arg3, - statistics_arg1, - statistics_arg3: FeatureDescriptiveStatistics, + arg1: pd.Series, arg2, arg3, statistics=stats_arg1_arg3 ): pass def test_function_multiple_argument_all_parameter_with_spaces( - arg1: pd.Series, - arg2, - statistics_arg1, - statistics_arg2: FeatureDescriptiveStatistics, + arg1: pd.Series, arg2, statistics=stats_arg1_arg2 ): pass def test_function_multiple_argument_all_parameter_multiline( - arg1: pd.Series, - arg2, - statistics_arg1, - arg3, - statistics_arg3: FeatureDescriptiveStatistics, + arg1: pd.Series, arg2, arg3, statistics=stats_arg1_arg3 ): pass @@ -79,14 +71,11 @@ def test_function_multiple_argument_all_parameter_multiline( def test_function_multiple_argument_all_parameter_multiline_with_comments( arg1: pd.Series, # Test Comment arg2, - statistics_arg1, # Test Comment - arg3, - statistics_arg3: FeatureDescriptiveStatistics, + arg3, # Test Comment + statistics=stats_arg1_arg3, # Test Comment ) -> pd.DataFrame: # Test Comment pass -def test_function_statistics_invalid( - arg1: pd.Series, statistics_arg3: FeatureDescriptiveStatistics -): +def test_function_statistics_invalid(arg1: pd.Series, statistics=stats_arg3): pass diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py index 04dab45309..402c1857e1 100644 --- 
a/python/tests/test_hopswork_udf.py +++ b/python/tests/test_hopswork_udf.py @@ -19,7 +19,7 @@ import pandas as pd import pytest from hsfs.client.exceptions import FeatureStoreException -from hsfs.hopsworks_udf import HopsworksUdf, TransformationFeature, hopsworks_udf +from hsfs.hopsworks_udf import HopsworksUdf, TransformationFeature, udf class TestHopsworksUdf: @@ -95,14 +95,14 @@ def test_get_module_imports(self): "python/tests/test_helpers/transformation_test_helper.py" ) == [ "import pandas as pd", - "from hsfs.statistics import FeatureDescriptiveStatistics", + "from hsfs.transformation_statistics import TransformationStatistics", ] def test_extract_source_code(self): from test_helpers.transformation_test_helper import test_function assert """import pandas as pd -from hsfs.statistics import FeatureDescriptiveStatistics +from hsfs.transformation_statistics import TransformationStatistics def test_function(): return True""" == HopsworksUdf._extract_source_code(test_function).strip() @@ -110,8 +110,7 @@ def test_extract_function_arguments_no_arguments(self): from test_helpers.transformation_test_helper import test_function with pytest.raises(FeatureStoreException) as exception: - function_source = HopsworksUdf._extract_source_code(test_function) - HopsworksUdf._extract_function_arguments(function_source) + HopsworksUdf._extract_function_arguments(test_function) assert ( str(exception.value) @@ -121,8 +120,9 @@ def test_extract_function_arguments_no_arguments(self): def test_extract_function_arguments_one_argument(self): from test_helpers.transformation_test_helper import test_function_one_argument - function_source = HopsworksUdf._extract_source_code(test_function_one_argument) - function_argument = HopsworksUdf._extract_function_arguments(function_source) + function_argument = HopsworksUdf._extract_function_arguments( + test_function_one_argument + ) assert function_argument == [ TransformationFeature(feature_name="arg1", statistic_argument_name=None) @@ -133,15 +133,12 @@ def test_extract_function_arguments_one_argument_with_statistics(self): test_function_one_argument_with_statistics, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_one_argument_with_statistics ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ - TransformationFeature( - feature_name="arg1", statistic_argument_name="statistics_arg1" - ) + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1") ] def test_extract_function_arguments_one_argument_with_typehint(self): @@ -149,10 +146,9 @@ def test_extract_function_arguments_one_argument_with_typehint(self): test_function_one_argument_with_typehints, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_one_argument_with_typehints ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ TransformationFeature(feature_name="arg1", statistic_argument_name=None) @@ -165,15 +161,12 @@ def test_extract_function_arguments_one_argument_with_statistics_and_typehints( test_function_one_argument_with_statistics_and_typehints, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_one_argument_with_statistics_and_typehints ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert 
function_argument == [ - TransformationFeature( - feature_name="arg1", statistic_argument_name="statistics_arg1" - ) + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1") ] def test_extract_function_arguments_multiple_argument(self): @@ -181,10 +174,9 @@ def test_extract_function_arguments_multiple_argument(self): test_function_multiple_argument, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ TransformationFeature(feature_name="arg1", statistic_argument_name=None), @@ -196,19 +188,14 @@ def test_extract_function_arguments_multiple_argument_with_statistics(self): test_function_multiple_argument_with_statistics, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument_with_statistics ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ - TransformationFeature( - feature_name="arg1", statistic_argument_name="statistics_arg1" - ), + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), TransformationFeature(feature_name="arg2", statistic_argument_name=None), - TransformationFeature( - feature_name="arg3", statistic_argument_name="statistics_arg3" - ), + TransformationFeature(feature_name="arg3", statistic_argument_name="arg3"), ] def test_extract_function_arguments_multiple_argument_with_typehints(self): @@ -216,10 +203,9 @@ def test_extract_function_arguments_multiple_argument_with_typehints(self): test_function_multiple_argument_with_typehints, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument_with_typehints ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ TransformationFeature(feature_name="arg1", statistic_argument_name=None), @@ -233,18 +219,13 @@ def test_extract_function_arguments_multiple_argument_with_statistics_and_typehi test_function_multiple_argument_with_statistics_and_typehints, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument_with_statistics_and_typehints ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ - TransformationFeature( - feature_name="arg1", statistic_argument_name="statistics_arg1" - ), - TransformationFeature( - feature_name="arg2", statistic_argument_name="statistics_arg2" - ), + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), + TransformationFeature(feature_name="arg2", statistic_argument_name="arg2"), ] def test_extract_function_arguments_multiple_argument_with_mixed_statistics_and_typehints( @@ -254,19 +235,14 @@ def test_extract_function_arguments_multiple_argument_with_mixed_statistics_and_ test_function_multiple_argument_with_mixed_statistics_and_typehints, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument_with_mixed_statistics_and_typehints ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ - TransformationFeature( - 
feature_name="arg1", statistic_argument_name="statistics_arg1" - ), + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), TransformationFeature(feature_name="arg2", statistic_argument_name=None), - TransformationFeature( - feature_name="arg3", statistic_argument_name="statistics_arg3" - ), + TransformationFeature(feature_name="arg3", statistic_argument_name="arg3"), ] def test_extract_function_arguments_multiple_argument_all_parameter_with_spaces( @@ -276,18 +252,13 @@ def test_extract_function_arguments_multiple_argument_all_parameter_with_spaces( test_function_multiple_argument_all_parameter_with_spaces, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument_all_parameter_with_spaces ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ - TransformationFeature( - feature_name="arg1", statistic_argument_name="statistics_arg1" - ), - TransformationFeature( - feature_name="arg2", statistic_argument_name="statistics_arg2" - ), + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), + TransformationFeature(feature_name="arg2", statistic_argument_name="arg2"), ] def test_extract_function_arguments_multiple_argument_all_parameter_multiline(self): @@ -295,19 +266,14 @@ def test_extract_function_arguments_multiple_argument_all_parameter_multiline(se test_function_multiple_argument_all_parameter_multiline, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument_all_parameter_multiline ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ - TransformationFeature( - feature_name="arg1", statistic_argument_name="statistics_arg1" - ), + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), TransformationFeature(feature_name="arg2", statistic_argument_name=None), - TransformationFeature( - feature_name="arg3", statistic_argument_name="statistics_arg3" - ), + TransformationFeature(feature_name="arg3", statistic_argument_name="arg3"), ] def test_extract_function_arguments_multiple_argumen_all_parameter_multiline_with_comments( @@ -317,19 +283,14 @@ def test_extract_function_arguments_multiple_argumen_all_parameter_multiline_wit test_function_multiple_argument_all_parameter_multiline_with_comments, ) - function_source = HopsworksUdf._extract_source_code( + function_argument = HopsworksUdf._extract_function_arguments( test_function_multiple_argument_all_parameter_multiline_with_comments ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) assert function_argument == [ - TransformationFeature( - feature_name="arg1", statistic_argument_name="statistics_arg1" - ), + TransformationFeature(feature_name="arg1", statistic_argument_name="arg1"), TransformationFeature(feature_name="arg2", statistic_argument_name=None), - TransformationFeature( - feature_name="arg3", statistic_argument_name="statistics_arg3" - ), + TransformationFeature(feature_name="arg3", statistic_argument_name="arg3"), ] def test_extract_function_arguments_statistics_invalid(self): @@ -338,14 +299,11 @@ def test_extract_function_arguments_statistics_invalid(self): ) with pytest.raises(FeatureStoreException) as exception: - function_source = HopsworksUdf._extract_source_code( - test_function_statistics_invalid - ) - 
HopsworksUdf._extract_function_arguments(function_source) + HopsworksUdf._extract_function_arguments(test_function_statistics_invalid) assert ( str(exception.value) - == "No argument corresponding to statistics parameter 'statistics_arg3' present in function definition." + == "No argument corresponding to statistics parameter 'arg3' present in function definition." ) def test_format_source_code(self): @@ -356,13 +314,11 @@ def test_format_source_code(self): function_source = HopsworksUdf._extract_source_code( test_function_multiple_argument_all_parameter_multiline_with_comments ) - function_argument = HopsworksUdf._extract_function_arguments(function_source) - print("\n") - print(function_argument) + formated_source, module_imports = HopsworksUdf._format_source_code( - function_source, function_argument + function_source ) - print(formated_source) + assert ( formated_source.strip() == """def test_function_multiple_argument_all_parameter_multiline_with_comments(arg1, arg2, arg3): @@ -370,21 +326,21 @@ def test_format_source_code(self): ) def test_generate_output_column_names_one_argument_one_output_type(self): - @hopsworks_udf(int) + @udf(int) def test_func(col1): return col1 + 1 assert test_func._get_output_column_names() == ["test_func_col1_"] def test_generate_output_column_names_multiple_argument_one_output_type(self): - @hopsworks_udf(int) + @udf(int) def test_func(col1, col2, col3): return col1 + 1 assert test_func._get_output_column_names() == ["test_func_col1-col2-col3_"] def test_generate_output_column_names_single_argument_multiple_output_type(self): - @hopsworks_udf([int, float, int]) + @udf([int, float, int]) def test_func(col1): return pd.DataFrame( {"col1": [col1 + 1], "col2": [col1 + 1], "col3": [col1 + 1]} @@ -397,7 +353,7 @@ def test_func(col1): ] def test_generate_output_column_names_multiple_argument_multiple_output_type(self): - @hopsworks_udf([int, float, int]) + @udf([int, float, int]) def test_func(col1, col2, col3): return pd.DataFrame( {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} @@ -410,7 +366,7 @@ def test_func(col1, col2, col3): ] def test_create_pandas_udf_return_schema_from_list_one_output_type(self): - @hopsworks_udf(int) + @udf(int) def test_func(col1): return col1 + 1 @@ -419,7 +375,7 @@ def test_func(col1): def test_create_pandas_udf_return_schema_from_list_one_argument_multiple_output_type( self, ): - @hopsworks_udf([int, float, str, date, datetime, time, bool]) + @udf([int, float, str, date, datetime, time, bool]) def test_func(col1): return pd.DataFrame( { @@ -438,7 +394,7 @@ def test_func(col1): ) def test_hopsworks_wrapper_single_output(self): - @hopsworks_udf(int) + @udf(int) def test_func(col1): return col1 + 1 @@ -452,7 +408,7 @@ def test_func(col1): assert result.values.tolist() == [2, 3, 4, 5] def test_hopsworks_wrapper_multiple_output(self): - @hopsworks_udf([int, float]) + @udf([int, float]) def test_func(col1, col2): return pd.DataFrame({"out1": col1 + 1, "out2": col2 + 2}) @@ -470,7 +426,7 @@ def test_func(col1, col2): assert result.values.tolist() == [[2, 12], [3, 22], [4, 32], [5, 42]] def test_HopsworkUDf_call_one_argument(self): - @hopsworks_udf(int) + @udf(int) def test_func(col1): return col1 + 1 @@ -481,23 +437,37 @@ def test_func(col1): assert test_func("new_feature").statistics_features == [] def test_HopsworkUDf_call_one_argument_statistics(self): - @hopsworks_udf(int) - def test_func(col1, statistics_col1): - return col1 + statistics_col1 + from hsfs.transformation_statistics import TransformationStatistics + + stats 
= TransformationStatistics("col1") + + @udf(int) + def test_func(col1, statistics=stats): + return col1 + statistics.col1.mean assert test_func.transformation_features == ["col1"] assert test_func.statistics_features == ["col1"] + assert test_func._statistics_argument_names == ["col1"] assert test_func("new_feature").transformation_features == ["new_feature"] assert test_func("new_feature").statistics_features == ["new_feature"] + assert test_func("new_feature")._statistics_argument_names == ["col1"] def test_HopsworkUDf_call_multiple_argument_statistics(self): - @hopsworks_udf(int) - def test_func(col1, statistics_col1, col2, col3, statistics_col3): - return col1 + statistics_col1 + from hsfs.transformation_statistics import TransformationStatistics + + stats = TransformationStatistics("col1", "col3") + + @udf(int) + def test_func(col1, col2, col3, statistics=stats): + return col1 + statistics.col1.mean + statistics.col3.mean assert test_func.transformation_features == ["col1", "col2", "col3"] assert test_func.statistics_features == ["col1", "col3"] assert test_func("f1", "f2", "f3").transformation_features == ["f1", "f2", "f3"] assert test_func("f1", "f2", "f3").statistics_features == ["f1", "f3"] + assert test_func("f1", "f2", "f3")._statistics_argument_names == [ + "col1", + "col3", + ] diff --git a/python/tests/test_transformation_function.py b/python/tests/test_transformation_function.py index b54fbdbe6b..bfc2f125d0 100644 --- a/python/tests/test_transformation_function.py +++ b/python/tests/test_transformation_function.py @@ -17,7 +17,7 @@ import pytest from hsfs.client.exceptions import FeatureStoreException -from hsfs.hopsworks_udf import hopsworks_udf +from hsfs.hopsworks_udf import udf from hsfs.transformation_function import TransformationFunction @@ -36,13 +36,14 @@ def test_from_response_json_one_argument_no_statistics(self, backend_fixtures): assert tf._featurestore_id == 11 assert tf.version == 2 assert tf.hopsworks_udf.function_name == "add_one_fs" - assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.return_types == ["double"] assert not tf.hopsworks_udf.statistics_required assert tf.hopsworks_udf.transformation_features == ["col1"] assert tf.hopsworks_udf.statistics_features == [] + assert tf.hopsworks_udf._statistics_argument_names == [] assert ( tf.hopsworks_udf._function_source - == "\n@hopsworks_udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" + == "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" ) def test_from_response_json_one_argument_with_statistics(self, backend_fixtures): @@ -59,13 +60,14 @@ def test_from_response_json_one_argument_with_statistics(self, backend_fixtures) assert tf._featurestore_id == 11 assert tf.version == 2 assert tf.hopsworks_udf.function_name == "add_mean_fs" - assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.return_types == ["double"] assert tf.hopsworks_udf.statistics_required assert tf.hopsworks_udf.transformation_features == ["data"] assert tf.hopsworks_udf.statistics_features == ["data"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1"] assert ( tf.hopsworks_udf._function_source - == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" + == "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n" ) def test_from_response_json_multiple_argument_with_statistics( @@ -84,7 +86,7 @@ def 
test_from_response_json_multiple_argument_with_statistics( assert tf._featurestore_id == 11 assert tf.version == 2 assert tf.hopsworks_udf.function_name == "test_func" - assert tf.hopsworks_udf.output_types == ["string"] + assert tf.hopsworks_udf.return_types == ["string"] assert tf.hopsworks_udf.statistics_required assert tf.hopsworks_udf.transformation_features == [ "feature1", @@ -92,9 +94,10 @@ def test_from_response_json_multiple_argument_with_statistics( "feature3", ] assert tf.hopsworks_udf.statistics_features == ["feature1", "feature2"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1", "data2"] assert ( tf.hopsworks_udf._function_source - == "\n@hopsworks_udf(str)\ndef test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return data1 + statistics_data1.mean\n" + == "\n@udf(str)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return data1 + statistics.data1.mean\n" ) def test_from_response_json_multiple_return_type_functions(self, backend_fixtures): @@ -111,7 +114,7 @@ def test_from_response_json_multiple_return_type_functions(self, backend_fixture assert tf._featurestore_id == 11 assert tf.version == 2 assert tf.hopsworks_udf.function_name == "test_func" - assert tf.hopsworks_udf.output_types == ["string", "double"] + assert tf.hopsworks_udf.return_types == ["string", "double"] assert tf.hopsworks_udf.statistics_required assert tf.hopsworks_udf.transformation_features == [ "feature1", @@ -119,9 +122,10 @@ def test_from_response_json_multiple_return_type_functions(self, backend_fixture "feature3", ] assert tf.hopsworks_udf.statistics_features == ["feature1", "feature2"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1", "data2"] assert ( tf.hopsworks_udf._function_source - == "\n@hopsworks_udf(str, float)\ndef test_func(data1 : pd.Series, statistics_data1, data2, statistics_data2, data3):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n" + == "\n@udf(str, float)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n" ) def test_from_response_json_list_empty(self, backend_fixtures): @@ -148,13 +152,14 @@ def test_from_response_json_list(self, backend_fixtures): assert tf._featurestore_id == 11 assert tf.version == 2 assert tf.hopsworks_udf.function_name == "add_mean_fs" - assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.return_types == ["double"] assert tf.hopsworks_udf.statistics_required assert tf.hopsworks_udf.transformation_features == ["data"] assert tf.hopsworks_udf.statistics_features == ["data"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1"] assert ( tf.hopsworks_udf._function_source - == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" + == "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n" ) tf = tf_list[1] @@ -162,13 +167,14 @@ def test_from_response_json_list(self, backend_fixtures): assert tf._featurestore_id == 11 assert tf.version == 1 assert tf.hopsworks_udf.function_name == "add_one_fs" - assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.return_types == ["double"] assert not tf.hopsworks_udf.statistics_required assert tf.hopsworks_udf.transformation_features == ["col1"] assert tf.hopsworks_udf.statistics_features == [] + assert tf.hopsworks_udf._statistics_argument_names == [] assert ( 
tf.hopsworks_udf._function_source - == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" + == "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n" ) def test_from_response_json_list_one_argument(self, backend_fixtures): @@ -186,13 +192,14 @@ def test_from_response_json_list_one_argument(self, backend_fixtures): assert tf._featurestore_id == 11 assert tf.version == 2 assert tf.hopsworks_udf.function_name == "add_mean_fs" - assert tf.hopsworks_udf.output_types == ["double"] + assert tf.hopsworks_udf.return_types == ["double"] assert tf.hopsworks_udf.statistics_required assert tf.hopsworks_udf.transformation_features == ["data"] assert tf.hopsworks_udf.statistics_features == ["data"] + assert tf.hopsworks_udf._statistics_argument_names == ["data1"] assert ( tf.hopsworks_udf._function_source - == "\n@hopsworks_udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics_data1):\n return data1 + statistics_data1.mean\n" + == "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n" ) def test_transformation_function_definition_no_hopworks_udf(self): @@ -211,7 +218,7 @@ def test(col1): ) def test_transformation_function_definition_with_hopworks_udf(self): - @hopsworks_udf(int) + @udf(int) def test2(col1): return col1 + 1

From 659f2aba9e4b1f0331bf12564de959fede0bc682 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 7 Jun 2024 09:47:54 +0200 Subject: [PATCH 42/58] refactoring transformation functions to update parsing of statistics parameters and renaming the decorator --- python/hsfs/hopsworks_udf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index e287089545..96ef119866 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -280,7 +280,11 @@ def _parse_function_signature(source_code: str) -> Tuple[List[str], str, int, in ] ) arg_list = signature.split("(")[1].split(")")[0].split(",") - arg_list = [arg.split(":")[0].split("=")[0].strip() for arg in arg_list] + arg_list = [ + arg.split(":")[0].split("=")[0].strip() + for arg in arg_list + if not arg.strip() == "" + ] if "statistics" in arg_list: arg_list.remove("statistics") return arg_list, signature, signature_start_line, signature_end_line

From 0a22fd79c2c5219c84473e1256a5d58a05c58109 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 7 Jun 2024 10:13:02 +0200 Subject: [PATCH 43/58] reformatting with ruff --- python/hsfs/core/vector_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 9a882523b6..9d39d81e09 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -1084,4 +1084,4 @@ def transformed_feature_vector_col_name(self): if feature not in transformation_features ] self._transformed_feature_vector_col_name.extend(output_column_names) - return self._transformed_feature_vector_col_name \ No newline at end of file + return self._transformed_feature_vector_col_name

From 159da54794fd90a5204399ccbb466309266d6306 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Sun, 9 Jun 2024 21:29:20 +0200 Subject: [PATCH 44/58] adding statistics to udf only if required --- python/hsfs/hopsworks_udf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 96ef119866..e3cd99de56 100644 --- a/python/hsfs/hopsworks_udf.py +++ 
b/python/hsfs/hopsworks_udf.py @@ -696,9 +696,10 @@ def transformation_statistics( ) -> None: self._statistics = TransformationStatistics(*self._statistics_argument_names) for stat in statistics: - self._statistics.set_statistics( - self._statistics_argument_mapping[stat.feature_name], stat.to_dict() - ) + if stat.feature_name in self._statistics_argument_mapping.keys(): + self._statistics.set_statistics( + self._statistics_argument_mapping[stat.feature_name], stat.to_dict() + ) @output_column_names.setter def output_column_names(self, output_col_names: Union[str, List[str]]) -> None:

From eef2cb535f9a22a11bec020e7cab2b02adfb7899 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Sun, 9 Jun 2024 22:26:09 +0200 Subject: [PATCH 45/58] converting extended statistics to dictionary --- python/hsfs/transformation_statistics.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/hsfs/transformation_statistics.py b/python/hsfs/transformation_statistics.py index f4b6b1c0e5..79778d1e33 100644 --- a/python/hsfs/transformation_statistics.py +++ b/python/hsfs/transformation_statistics.py @@ -16,6 +16,7 @@ from __future__ import annotations +import json from dataclasses import dataclass from typing import Any, Dict, Mapping, Optional, Union @@ -86,7 +87,11 @@ def __init__( self.entropy = entropy self.uniqueness = uniqueness self.exact_num_distinct_values = exact_num_distinct_values - self.extended_statistics = extended_statistics + self.extended_statistics = ( + extended_statistics + if not isinstance(extended_statistics, str) + else json.loads(extended_statistics) + ) @classmethod def from_response_json(

From 50e944cf28009f9935ed6848e8cea272b2c24ae3 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Sun, 9 Jun 2024 22:31:45 +0200 Subject: [PATCH 46/58] sorting built-in label encoder to maintain consistency --- python/hsfs/builtin_transformations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py index 421a04cffe..9e2daa0d24 100644 --- a/python/hsfs/builtin_transformations.py +++ b/python/hsfs/builtin_transformations.py @@ -44,9 +44,9 @@ def robust_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Serie @udf(int) def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series: - unique_data = [ - value for value in statistics.feature.extended_statistics["unique_values"] - ] + unique_data = sorted( + [value for value in statistics.feature.extended_statistics["unique_values"]] + ) value_to_index = {value: index for index, value in enumerate(unique_data)} return pd.Series( [value_to_index[data] if not pd.isna(data) else np.nan for data in feature]

From 7111f86710972212e59d3cdfe6457274cda5d0b1 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 13 Jun 2024 16:37:49 +0200 Subject: [PATCH 47/58] adding type hints for class TransformationStatistics --- python/hsfs/transformation_statistics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/hsfs/transformation_statistics.py b/python/hsfs/transformation_statistics.py index 79778d1e33..c4a1bc20b1 100644 --- a/python/hsfs/transformation_statistics.py +++ b/python/hsfs/transformation_statistics.py @@ -96,7 +96,7 @@ def __init__( @classmethod def from_response_json( cls: FeatureTransformationStatistics, json_dict: Dict[str, Any] - ): + ) -> FeatureTransformationStatistics: json_decamelized = humps.decamelize(json_dict) return cls(**json_decamelized) @@ -106,16 +106,16 @@ class 
TransformationStatistics: Class that stores statistics of all features required for a transformation function. """ - def __init__(self, *features): + def __init__(self, *features: str): self._features = features self.__dict__.update( {feature: self.init_statistics(feature) for feature in features} ) - def init_statistics(self, feature_name): + def init_statistics(self, feature_name: str) -> FeatureTransformationStatistics: return FeatureTransformationStatistics(feature_name=feature_name) - def set_statistics(self, feature_name, statistics: Dict[str, Any]): + def set_statistics(self, feature_name: str, statistics: Dict[str, Any]) -> None: self.__dict__[feature_name] = ( FeatureTransformationStatistics.from_response_json(statistics) )

From 114a792c4b7a104794619cd656b149272ded239d Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 14 Jun 2024 10:38:45 +0200 Subject: [PATCH 48/58] adapting to backend update returning output_types, transformation_features and statistics_argument_names as lists --- python/hsfs/hopsworks_udf.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index e3cd99de56..544746dc90 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -573,17 +573,15 @@ def from_response_json( function_source_code = json_decamelized["source_code"] function_name = json_decamelized["name"] output_types = [ - output_type.strip() - for output_type in json_decamelized["output_types"].split(",") + output_type.strip() for output_type in json_decamelized["output_types"] ] transformation_features = [ - feature.strip() - for feature in json_decamelized["transformation_features"].split(",") + feature.strip() for feature in json_decamelized["transformation_features"] ] statistics_features = ( [ feature.strip() - for feature in json_decamelized["statistics_argument_names"].split(",") + for feature in json_decamelized["statistics_argument_names"] ] if "statistics_argument_names" in json_decamelized else None

From bd4bb1f515e8e25e93bd00391b3e3b92b5ae8c04 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Fri, 14 Jun 2024 11:27:41 +0200 Subject: [PATCH 49/58] fixing unit tests --- python/hsfs/hopsworks_udf.py | 4 ++ ...t_python_spark_transformation_functions.py | 18 ++++----- .../tests/fixtures/feature_view_fixtures.json | 20 +++++----- .../transformation_function_fixtures.json | 38 +++++++++---------- 4 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 544746dc90..b20465a17a 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -590,6 +590,10 @@ def from_response_json( # Reconstructing statistics arguments. 
arg_list, _, _, _ = HopsworksUdf._parse_function_signature(function_source_code) + transformation_features = ( + arg_list if not transformation_features else transformation_features + ) + if statistics_features: transformation_features = [ TransformationFeature( diff --git a/python/tests/engine/test_python_spark_transformation_functions.py b/python/tests/engine/test_python_spark_transformation_functions.py index cf0d529611..cb1a0652b5 100644 --- a/python/tests/engine/test_python_spark_transformation_functions.py +++ b/python/tests/engine/test_python_spark_transformation_functions.py @@ -157,9 +157,9 @@ def test_apply_builtin_minmax_from_backend(self, mocker): ) udf_response = { "sourceCode": tf_fun_source, - "outputTypes": "double", - "transformationFeatures": "", - "statisticsArgumentNames": "feature", + "outputTypes": ["double"], + "transformationFeatures": [], + "statisticsArgumentNames": ["feature"], "name": "min_max_scaler", } @@ -295,9 +295,9 @@ def test_apply_builtin_standard_scaler_from_backend(self, mocker): ) udf_response = { "sourceCode": tf_fun_source, - "outputTypes": "double", - "transformationFeatures": "", - "statisticsArgumentNames": "feature", + "outputTypes": ["double"], + "transformationFeatures": [], + "statisticsArgumentNames": ["feature"], "name": "standard_scaler", } @@ -437,9 +437,9 @@ def test_apply_builtin_robust_scaler_from_backend(self, mocker): ) udf_response = { "sourceCode": tf_fun_source, - "outputTypes": "double", - "transformationFeatures": "", - "statisticsArgumentNames": "feature", + "outputTypes": ["double"], + "transformationFeatures": [], + "statisticsArgumentNames": ["feature"], "name": "robust_scaler", } diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index a0a9f6864d..5e229955bd 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -692,9 +692,9 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", - "outputTypes":"double", - "transformationFeatures":"data", - "statisticsArgumentNames":"data1" + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"] } }, { @@ -704,8 +704,8 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", - "outputTypes":"double", - "transformationFeatures":"col1" + "outputTypes":["double"], + "transformationFeatures":["col1"] } } ], @@ -932,9 +932,9 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", - "outputTypes":"double", - "transformationFeatures":"data", - "statisticsArgumentNames":"data1" + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"] } }, { @@ -944,8 +944,8 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", - "outputTypes":"double", - "transformationFeatures":"col1" + "outputTypes":["double"], + "transformationFeatures":["col1"] } } ], diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 96fac98fc8..6fa5d762b7 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ 
b/python/tests/fixtures/transformation_function_fixtures.json @@ -7,8 +7,8 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", - "outputTypes":"double", - "transformationFeatures":"col1" + "outputTypes":["double"], + "transformationFeatures":["col1"] } } }, @@ -20,9 +20,9 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", - "outputTypes":"double", - "transformationFeatures":"data", - "statisticsArgumentNames":"data1" + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"] } } }, @@ -34,9 +34,9 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(str)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "test_func", - "outputTypes":"string", - "transformationFeatures":"feature1, feature2, feature3", - "statisticsArgumentNames":"data1, data2" + "outputTypes":["string"], + "transformationFeatures":["feature1", "feature2", "feature3"], + "statisticsArgumentNames":["data1", "data2"] } } }, @@ -48,9 +48,9 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(str, float)\ndef test_func(data1 : pd.Series, data2, data3, statistics=stats):\n return pd.DataFrame('col1': ['a', 'b'], 'col2':[1,2])\n", "name": "test_func", - "outputTypes":"string, double", - "transformationFeatures":"feature1, feature2, feature3", - "statisticsArgumentNames":"data1, data2" + "outputTypes":["string", "double"], + "transformationFeatures":["feature1", "feature2", "feature3"], + "statisticsArgumentNames":["data1", "data2"] } } }, @@ -65,9 +65,9 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", - "outputTypes":"double", - "transformationFeatures":"data", - "statisticsArgumentNames":"data1" + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"] } }, { @@ -77,8 +77,8 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", - "outputTypes":"double", - "transformationFeatures":"col1" + "outputTypes":["double"], + "transformationFeatures":["col1"] } } ] @@ -95,9 +95,9 @@ "hopsworksUdf":{ "sourceCode": "\n@udf(float)\ndef add_mean_fs(data1 : pd.Series, statistics=stats):\n return data1 + statistics.data1.mean\n", "name": "add_mean_fs", - "outputTypes":"double", - "transformationFeatures":"data", - "statisticsArgumentNames":"data1" + "outputTypes":["double"], + "transformationFeatures":["data"], + "statisticsArgumentNames":["data1"] } } ]

From 64f34cdd4e4d519740ffda4baabf88d13d5db477 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 17 Jun 2024 09:17:46 +0200 Subject: [PATCH 50/58] removing space in docstring --- python/hsfs/hopsworks_udf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index b20465a17a..83f6e1620f 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -44,9 +44,9 @@ def udf(return_type: Union[List[type], type]) -> "HopsworksUdf": !!! 
example ```python - from hsfs.hopsworks_udf import udf + from hopsworks import udf - @udf(float) + @udf(float) def add_one(data1 : pd.Series): return data1 + 1 ```

From 9891900452fbf6e78f6e12c09d1e11bfeaeadd7d Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 17 Jun 2024 10:53:42 +0200 Subject: [PATCH 51/58] replace '-' in output column names with '_' --- python/hsfs/hopsworks_udf.py | 2 +- python/tests/engine/test_python.py | 12 ++++++------ python/tests/engine/test_spark.py | 30 +++++++++++++++--------------- python/tests/test_hopswork_udf.py | 10 +++++----- 4 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 83f6e1620f..b9f8bde5bb 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -366,7 +366,7 @@ def _get_output_column_names(self) -> str: `List[str]`: List of feature names for the transformed columns """ _BASE_COLUMN_NAME = ( - f'{self.function_name}_{"-".join(self.transformation_features)}_' + f'{self.function_name}_{"_".join(self.transformation_features)}_' ) if len(self.return_types) > 1: return [f"{_BASE_COLUMN_NAME}{i}" for i in range(len(self.return_types))] diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index 4796ad2cfe..07958686de 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -3354,12 +3354,12 @@ def plus_two(col1, col2): ) # Assert - assert all(result.columns == ["plus_two_col1-col2_0", "plus_two_col1-col2_1"]) + assert all(result.columns == ["plus_two_col1_col2_0", "plus_two_col1_col2_1"]) assert len(result) == 2 - assert result["plus_two_col1-col2_0"][0] == 2 - assert result["plus_two_col1-col2_0"][1] == 3 - assert result["plus_two_col1-col2_1"][0] == 12 - assert result["plus_two_col1-col2_1"][1] == 13 + assert result["plus_two_col1_col2_0"][0] == 2 + assert result["plus_two_col1_col2_0"][1] == 3 + assert result["plus_two_col1_col2_1"][0] == 12 + assert result["plus_two_col1_col2_1"][1] == 13 def test_apply_transformation_function_polars(self, mocker): # Arrange @@ -3854,7 +3854,7 @@ def test_materialization_kafka_first_job_execution(self, mocker): args="defaults tests_offsets", await_termination=False, ) - + def test_materialization_kafka_skip_offsets(self, mocker): # Arrange mocker.patch("hsfs.engine.python.Engine._get_kafka_config", return_value={}) diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 42e0abe4e6..322716da5b 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -5,7 +5,7 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE_2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -937,7 +937,7 @@ def test_save_stream_dataframe(self, mocker, backend_fixtures): mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] - == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" + == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}_checkpoint" ) assert ( mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ @@ -1053,7 +1053,7 @@ def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] - == "/Projects/test_project_name/Resources/test_query_name-checkpoint" + == "/Projects/test_project_name/Resources/test_query_name_checkpoint" ) assert ( mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ @@ -1293,7 +1293,7 @@ def test_save_stream_dataframe_await_termination(self, mocker, backend_fixtures) mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] - == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" + == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}_checkpoint" ) assert ( mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ @@ -2456,7 +2456,7 @@ def test_time_series_split_date(self, mocker): d = { "col_0": [1, 2], "col_1": ["test_1", "test_2"], - "event_time": ["2017-03-04", "2017-03-05"], + "event_time": ["2017_03_04", "2017_03_05"], } df = pd.DataFrame(data=d) @@ -2516,7 +2516,7 @@ def test_time_series_split_timestamp(self, mocker): d = { "col_0": [1, 2], "col_1": ["test_1", "test_2"], - "event_time": ["2017-03-04", "2017-03-05"], + "event_time": ["2017_03_04", "2017_03_05"], } df = pd.DataFrame(data=d) @@ -3809,7 +3809,7 @@ def __init__(self, label, index): "double": ["1"], "timestamp": [1641340800000], "boolean": ["False"], - "date": ["2022-01-27"], + "date": ["2022_01_27"], "binary": ["1"], "array": [["123"]], "struc": [LabelIndex("0", "1")], @@ -4212,11 +4212,11 @@ def test_setup_s3_hadoop_conf(self, mocker): "fs.s3a.secret.key", s3_connector.secret_key ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( - "fs.s3a.server-side-encryption-algorithm", + "fs.s3a.server_side_encryption_algorithm", s3_connector.server_encryption_algorithm, ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( - "fs.s3a.server-side-encryption-key", s3_connector.server_encryption_key + "fs.s3a.server_side_encryption_key", s3_connector.server_encryption_key ) 
mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( "fs.s3a.aws.credentials.provider", @@ -4487,8 +4487,8 @@ def test(col1, col2): expected_df = pd.DataFrame( data={ "col_1": ["test_1", "test_2"], - "test_col_0-col_2_0": [2, 3], - "test_col_0-col_2_1": [12, 13], + "test_col_0_col_2_0": [2, 3], + "test_col_0_col_2_1": [12, 13], } ) # todo why it doesnt return int? @@ -4514,7 +4514,7 @@ def test_setup_gcp_hadoop_conf(self, mocker): content = ( '{"type": "service_account", "project_id": "test", "private_key_id": "123456", ' - '"private_key": "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", ' + '"private_key": "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", ' '"client_email": "test@project.iam.gserviceaccount.com"}' ) credentialsFile = "keyFile.json" @@ -4563,7 +4563,7 @@ def test_setup_gcp_hadoop_conf(self, mocker): ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( "fs.gs.auth.service.account.private.key", - "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", + "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.unset.assert_any_call( "fs.gs.encryption.algorithm" @@ -4586,7 +4586,7 @@ def test_setup_gcp_hadoop_conf_algorithm(self, mocker): content = ( '{"type": "service_account", "project_id": "test", "private_key_id": "123456", ' - '"private_key": "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", ' + '"private_key": "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", ' '"client_email": "test@project.iam.gserviceaccount.com"}' ) credentialsFile = "keyFile.json" @@ -4650,7 +4650,7 @@ def test_setup_gcp_hadoop_conf_algorithm(self, mocker): ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( "fs.gs.auth.service.account.private.key", - "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", + "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", ) def test_get_unique_values(self): diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py index 402c1857e1..8494d018f1 100644 --- a/python/tests/test_hopswork_udf.py +++ b/python/tests/test_hopswork_udf.py @@ -337,7 +337,7 @@ def test_generate_output_column_names_multiple_argument_one_output_type(self): def test_func(col1, col2, col3): return col1 + 1 - assert test_func._get_output_column_names() == ["test_func_col1-col2-col3_"] + assert test_func._get_output_column_names() == ["test_func_col1_col2_col3_"] def test_generate_output_column_names_single_argument_multiple_output_type(self): @udf([int, float, int]) @@ -360,9 +360,9 @@ def test_func(col1, col2, col3): ) assert test_func._get_output_column_names() == [ - "test_func_col1-col2-col3_0", - "test_func_col1-col2-col3_1", - "test_func_col1-col2-col3_2", + "test_func_col1_col2_col3_0", + "test_func_col1_col2_col3_1", + "test_func_col1_col2_col3_2", ] def test_create_pandas_udf_return_schema_from_list_one_output_type(self): @@ -422,7 +422,7 @@ def test_func(col1, col2): test_dataframe["column1"], test_dataframe["column2"] ) - assert all(result.columns == ["test_func_col1-col2_0", "test_func_col1-col2_1"]) + assert all(result.columns == ["test_func_col1_col2_0", "test_func_col1_col2_1"]) assert result.values.tolist() == [[2, 12], [3, 22], [4, 32], [5, 42]] def test_HopsworkUDf_call_one_argument(self): From 
6ebd9f41e2ff330c3c36353da6eb4c5b69d5d3e1 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 17 Jun 2024 11:03:11 +0200 Subject: [PATCH 52/58] reverting unwanted spark test _ replace changes --- python/tests/engine/test_spark.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 322716da5b..7eabd38d07 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -5,7 +5,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE_2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -937,7 +937,7 @@ def test_save_stream_dataframe(self, mocker, backend_fixtures): mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] - == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}_checkpoint" + == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" ) assert ( mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ @@ -1053,7 +1053,7 @@ def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] - == "/Projects/test_project_name/Resources/test_query_name_checkpoint" + == "/Projects/test_project_name/Resources/test_query_name-checkpoint" ) assert ( mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ @@ -1293,7 +1293,7 @@ def test_save_stream_dataframe_await_termination(self, mocker, backend_fixtures) mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] - == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}_checkpoint" + == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" ) assert ( mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ @@ -2456,7 +2456,7 @@ def test_time_series_split_date(self, mocker): d = { "col_0": [1, 2], "col_1": ["test_1", "test_2"], - "event_time": ["2017_03_04", "2017_03_05"], + "event_time": ["2017-03-04", "2017-03-05"], } df = pd.DataFrame(data=d) @@ -2516,7 +2516,7 @@ def test_time_series_split_timestamp(self, mocker): d = { "col_0": [1, 2], "col_1": ["test_1", "test_2"], - "event_time": ["2017_03_04", "2017_03_05"], + "event_time": ["2017-03-04", "2017-03-05"], } df = pd.DataFrame(data=d) @@ -3809,7 +3809,7 @@ def __init__(self, label, index): "double": ["1"], "timestamp": [1641340800000], "boolean": ["False"], - "date": ["2022_01_27"], + "date": ["2022-01-27"], "binary": ["1"], "array": [["123"]], "struc": [LabelIndex("0", "1")], @@ -4212,11 +4212,11 @@ def test_setup_s3_hadoop_conf(self, mocker): "fs.s3a.secret.key", s3_connector.secret_key
) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( - "fs.s3a.server_side_encryption_algorithm", + "fs.s3a.server-side-encryption-algorithm", s3_connector.server_encryption_algorithm, ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( - "fs.s3a.server_side_encryption_key", s3_connector.server_encryption_key + "fs.s3a.server-side-encryption-key", s3_connector.server_encryption_key ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( "fs.s3a.aws.credentials.provider", @@ -4514,7 +4514,7 @@ def test_setup_gcp_hadoop_conf(self, mocker): content = ( '{"type": "service_account", "project_id": "test", "private_key_id": "123456", ' - '"private_key": "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", ' + '"private_key": "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", ' '"client_email": "test@project.iam.gserviceaccount.com"}' ) credentialsFile = "keyFile.json" @@ -4563,7 +4563,7 @@ def test_setup_gcp_hadoop_conf(self, mocker): ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( "fs.gs.auth.service.account.private.key", - "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", + "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.unset.assert_any_call( "fs.gs.encryption.algorithm" @@ -4586,7 +4586,7 @@ def test_setup_gcp_hadoop_conf_algorithm(self, mocker): content = ( '{"type": "service_account", "project_id": "test", "private_key_id": "123456", ' - '"private_key": "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", ' + '"private_key": "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", ' '"client_email": "test@project.iam.gserviceaccount.com"}' ) credentialsFile = "keyFile.json" @@ -4650,7 +4650,7 @@ def test_setup_gcp_hadoop_conf_algorithm(self, mocker): ) mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( "fs.gs.auth.service.account.private.key", - "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", + "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", ) def test_get_unique_values(self): From c0202101c913b3362327976ce3c70ff97ebe1108 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 1 Jul 2024 15:43:59 +0200 Subject: [PATCH 53/58] on-demand transformations working --- python/hsfs/builtin_transformations.py | 10 +- python/hsfs/core/feature_group_api.py | 16 +- python/hsfs/core/feature_group_engine.py | 8 +- python/hsfs/core/feature_view_api.py | 24 +-- python/hsfs/core/feature_view_engine.py | 23 --- .../core/transformation_function_engine.py | 23 +-- python/hsfs/core/vector_server.py | 139 ++++++++++++-- python/hsfs/engine/python.py | 40 +++- python/hsfs/engine/spark.py | 49 ++++- python/hsfs/feature.py | 12 ++ python/hsfs/feature_group.py | 100 +++++++++- python/hsfs/feature_store.py | 10 + python/hsfs/feature_view.py | 21 ++- python/hsfs/hopsworks_udf.py | 177 ++++++++++++++++-- python/hsfs/training_dataset_feature.py | 26 ++- python/hsfs/transformation_function.py | 12 +- python/tests/test_hopswork_udf.py | 40 +++- 17 files changed, 589 insertions(+), 141 deletions(-) diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py index 9e2daa0d24..ae24cd4274 100644 --- a/python/hsfs/builtin_transformations.py +++
b/python/hsfs/builtin_transformations.py @@ -23,26 +23,26 @@ feature_statistics = TransformationStatistics("feature") -@udf(float) +@udf(float, drop=["feature"]) def min_max_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series: return (feature - statistics.feature.min) / ( statistics.feature.max - statistics.feature.min ) -@udf(float) +@udf(float, drop=["feature"]) def standard_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series: return (feature - statistics.feature.mean) / statistics.feature.stddev -@udf(float) +@udf(float, drop=["feature"]) def robust_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Series: return (feature - statistics.feature.percentiles[49]) / ( statistics.feature.percentiles[74] - statistics.feature.percentiles[24] ) -@udf(int) +@udf(int, drop=["feature"]) def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series: unique_data = sorted( [value for value in statistics.feature.extended_statistics["unique_values"]] @@ -53,7 +53,7 @@ def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Serie ) -@udf(bool) +@udf(bool, drop=["feature"]) def one_hot_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series: unique_data = [ value for value in statistics.feature.extended_statistics["unique_values"] diff --git a/python/hsfs/core/feature_group_api.py b/python/hsfs/core/feature_group_api.py index 11fdbbbdc6..c6b0a1a70f 100644 --- a/python/hsfs/core/feature_group_api.py +++ b/python/hsfs/core/feature_group_api.py @@ -51,6 +51,9 @@ def save( feature_group_instance.feature_store_id, "featuregroups", ] + query_params = { + "expand": ["features", "expectationsuite", "transformationfunctions"] + } headers = {"content-type": "application/json"} feature_group_object = feature_group_instance.update_from_response_json( _client._send_request( @@ -58,6 +61,7 @@ def save( path_params, headers=headers, data=feature_group_instance.json(), + query_params=query_params, ), ) return feature_group_object @@ -93,7 +97,11 @@ def get( "featuregroups", name, ] - query_params = None if version is None else {"version": version} + query_params = { + "expand": ["features", "expectationsuite", "transformationfunctions"] + } + if version is not None: + query_params["version"] = version fg_objs = [] # In principle unique names are enforced across fg type and this should therefore @@ -157,8 +165,10 @@ def get_by_id( "featuregroups", feature_group_id, ] - - fg_json = _client._send_request("GET", path_params) + query_params = { + "expand": ["features", "expectationsuite", "transformationfunctions"] + } + fg_json = _client._send_request("GET", path_params, query_params) if ( fg_json["type"] == FeatureGroupApi.BACKEND_FG_STREAM or fg_json["type"] == FeatureGroupApi.BACKEND_FG_BATCH diff --git a/python/hsfs/core/feature_group_engine.py b/python/hsfs/core/feature_group_engine.py index 3e88805eda..010810f6cc 100644 --- a/python/hsfs/core/feature_group_engine.py +++ b/python/hsfs/core/feature_group_engine.py @@ -88,7 +88,9 @@ def insert( validation_options: dict = None, ): dataframe_features = engine.get_instance().parse_schema_feature_group( - feature_dataframe, feature_group.time_travel_format + feature_dataframe, + feature_group.time_travel_format, + feature_group.transformation_functions, ) util.validate_embedding_feature_type( feature_group.embedding_index, dataframe_features @@ -281,7 +283,9 @@ def insert_stream( ) dataframe_features = engine.get_instance().parse_schema_feature_group( - dataframe, 
feature_group.time_travel_format + dataframe, + feature_group.time_travel_format, + feature_group.transformation_functions, ) util.validate_embedding_feature_type( feature_group.embedding_index, dataframe_features diff --git a/python/hsfs/core/feature_view_api.py b/python/hsfs/core/feature_view_api.py index 1bc6b46115..50355f3d5f 100644 --- a/python/hsfs/core/feature_view_api.py +++ b/python/hsfs/core/feature_view_api.py @@ -17,7 +17,7 @@ from typing import List, Optional, Union -from hsfs import client, feature_view, training_dataset, transformation_function +from hsfs import client, feature_view, training_dataset from hsfs.client.exceptions import RestAPIError from hsfs.constructor import query, serving_prepared_statement from hsfs.core import explicit_provenance, job, training_dataset_job_conf @@ -206,28 +206,6 @@ def get_serving_prepared_statement( self._client._send_request("GET", path, query_params, headers=headers) ) - def get_attached_transformation_fn( - self, name: str, version: int - ) -> List["transformation_function.TransformationFunction"]: - """ - Get transformation functions attached to a feature view form the backend - - # Arguments - name `str`: Name of feature view. - version `ìnt`: Version of feature view. - - # Returns - `List[TransformationFunction]` : List of transformation functions attached to the feature view. - - # Raises - `RestAPIError`: If the feature view cannot be found from the backend. - `ValueError`: If the feature group associated with the feature view cannot be found. - """ - path = self._base_path + [name, self._VERSION, version, self._TRANSFORMATION] - return transformation_function.TransformationFunction.from_response_json( - self._client._send_request("GET", path) - ) - def create_training_dataset( self, name: str, diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py index 070be9b821..f85529163f 100644 --- a/python/hsfs/core/feature_view_engine.py +++ b/python/hsfs/core/feature_view_engine.py @@ -25,7 +25,6 @@ feature_group, feature_view, training_dataset_feature, - transformation_function, util, ) from hsfs.client import exceptions @@ -265,28 +264,6 @@ def get_batch_query_string( return fs_query.pit_query return fs_query.query - def get_attached_transformation_fn( - self, name: str, version: int - ) -> List[transformation_function.TransformationFunction]: - """ - Get transformation functions attached to a feature view form the backend - - # Arguments - name `str`: Name of feature view. - version `ìnt`: Version of feature view. - - # Returns - `List[TransformationFunction]` : List of transformation functions attached to the feature view. - - # Raises - `RestAPIError`: If the feature view cannot be found from the backend. - `ValueError`: If the feature group associated with the feature view cannot be found. 
- """ - transformation_functions = ( - self._feature_view_api.get_attached_transformation_fn(name, version) - ) - return transformation_functions - def create_training_dataset( self, feature_view_obj, diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py index ec5de0810b..6bdbff13c9 100644 --- a/python/hsfs/core/transformation_function_engine.py +++ b/python/hsfs/core/transformation_function_engine.py @@ -147,21 +147,12 @@ def get_ready_to_use_transformation_fns( feature_view: feature_view.FeatureView, training_dataset_version: Optional[int] = None, ) -> List[transformation_function.TransformationFunction]: - # get attached transformation functions - transformation_functions = ( - feature_view._feature_view_engine.get_attached_transformation_fn( - feature_view.name, feature_view.version - ) - ) - - transformation_functions = ( - [transformation_functions] - if not isinstance(transformation_functions, list) - else transformation_functions - ) - + # check if transformation functions require statistics is_stat_required = any( - [tf.hopsworks_udf.statistics_required for tf in transformation_functions] + [ + tf.hopsworks_udf.statistics_required + for tf in feature_view.transformation_functions + ] ) if not is_stat_required: td_tffn_stats = None @@ -188,11 +179,11 @@ def get_ready_to_use_transformation_fns( ) if is_stat_required: - for transformation_function in transformation_functions: + for transformation_function in feature_view.transformation_functions: transformation_function.hopsworks_udf.transformation_statistics = ( td_tffn_stats.feature_descriptive_statistics ) - return feature_view._sort_transformation_functions(transformation_functions) + return feature_view.transformation_functions @staticmethod def compute_and_set_feature_statistics( diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 9d39d81e09..403cbb2522 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -107,7 +107,10 @@ def __init__( self._transformation_function_engine = ( tf_engine_mod.TransformationFunctionEngine(feature_store_id) ) - self._transformation_functions: List[ + self._model_dependent_transformation_functions: List[ + transformation_function.TransformationFunction + ] = [] + self._on_demand_transformation_functions: List[ transformation_function.TransformationFunction ] = [] self._sql_client = None @@ -183,13 +186,23 @@ def init_batch_scoring( def init_transformation( self, - entity: Union[feature_view.FeatureView], + entity: feature_view.FeatureView, ): # attach transformation functions - self._transformation_functions = tf_engine_mod.TransformationFunctionEngine.get_ready_to_use_transformation_fns( + self._model_dependent_transformation_functions = tf_engine_mod.TransformationFunctionEngine.get_ready_to_use_transformation_fns( entity, self._training_dataset_version, ) + self._on_demand_transformation_functions = [ + feature.on_demand_transformation_function + for feature in entity.features + if feature.on_demand_transformation_function + ] + self._on_demand_feature_names = [ + feature.name + for feature in entity.features + if feature.on_demand_transformation_function + ] def setup_sql_client( self, @@ -242,6 +255,7 @@ def get_feature_vector( allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, + request_parameters: Optional[Dict[str, Any]] = None, ) -> Union[pd.DataFrame, pl.DataFrame, np.ndarray, List[Any], Dict[str, Any]]: 
"""Assembles serving vector from online feature store.""" online_client_choice = self.which_client_and_ensure_initialised( @@ -273,8 +287,8 @@ def get_feature_vector( vector_db_result=vector_db_features or {}, allow_missing=allow_missing, client=online_client_choice, + request_parameters=request_parameters, ) - return self.handle_feature_vector_return_type( vector, batch=False, inference_helper=False, return_type=return_type ) @@ -287,6 +301,7 @@ def get_feature_vectors( ] = None, passed_features: Optional[List[Dict[str, Any]]] = None, vector_db_features: Optional[List[Dict[str, Any]]] = None, + request_parameters: Optional[List[Dict[str, Any]]] = None, allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, @@ -305,6 +320,12 @@ def get_feature_vectors( or len(vector_db_features) == 0 or len(vector_db_features) == len(entries) ), "Vector DB features should be None, empty or have the same length as the entries" + assert ( + request_parameters is None + or len(request_parameters) == 0 + or isinstance(request_parameters, dict) + or len(request_parameters) == len(entries) + ), "Request Parameters should be a Dictionary, None, empty or have the same length as the entries" online_client_choice = self.which_client_and_ensure_initialised( force_rest_client=force_rest_client, force_sql_client=force_sql_client @@ -347,14 +368,23 @@ def get_feature_vectors( skipped_empty_entries.pop(0) if len(skipped_empty_entries) > 0 else None ) vectors = [] + + # If request parameter is a dictionary then copy it to list with the same length as that of entires + request_parameters = ( + [request_parameters] * len(entries) + if isinstance(request_parameters, dict) + else request_parameters + ) for ( idx, passed_values, vector_db_result, + request_parameter, ) in itertools.zip_longest( range(len(entries)), passed_features or [], vector_db_features or [], + request_parameters or [], fillvalue=None, ): if next_skipped == idx: @@ -374,6 +404,7 @@ def get_feature_vectors( vector_db_result=vector_db_result, allow_missing=allow_missing, client=online_client_choice, + request_parameters=request_parameter, ) if vector is not None: @@ -390,6 +421,7 @@ def assemble_feature_vector( vector_db_result: Optional[Dict[str, Any]], allow_missing: bool, client: Literal["rest", "sql"], + request_parameters: Optional[Dict[str, Any]] = None, ) -> Optional[List[Any]]: """Assembles serving vector from online feature store.""" # Errors in batch requests are returned as None values @@ -404,9 +436,52 @@ def assemble_feature_vector( _logger.debug("Updating with passed features: %s", passed_values) result_dict.update(passed_values) - missing_features = set(self.feature_vector_col_name).difference( - result_dict.keys() + missing_features = ( + set(self.feature_vector_col_name) + .difference(result_dict.keys()) + .difference(self._on_demand_feature_names) ) + + # TODO : Optimize this + request_parameters = {} if not request_parameters else request_parameters + available_parameters = set((result_dict | request_parameters).keys()) + missing_request_parameters_features = {} + + for on_demand_feature, on_demand_transformation in zip( + self._on_demand_feature_names, self._on_demand_transformation_functions + ): + missing_request_parameter = ( + set(on_demand_transformation.hopsworks_udf.transformation_features) + - available_parameters + ) + if missing_request_parameter: + missing_request_parameters_features[on_demand_feature] = sorted( + list( + set( + 
on_demand_transformation.hopsworks_udf.transformation_features + ) + - available_parameters + ) + ) + + if missing_request_parameters_features: + error = "Missing request parameters to compute the following on-demand features:\n" + for ( + feature, + missing_request_parameter, + ) in missing_request_parameters_features.items(): + missing_request_parameter = "', '".join(missing_request_parameter) + error += f"On-Demand Feature '{feature}' requires features '{missing_request_parameter}'\n" + error += ( + "Possible reasons: " + "1. There is no match in the given entry." + " Please check if the entry exists in the online feature store" + " or provide the feature as passed_feature. " + f"2. Required entries [{', '.join(self.required_serving_keys)}] or " + f"[{', '.join(set(sk.feature_name for sk in self._serving_keys))}] are not provided." + ) + raise exceptions.FeatureStoreException(error) + # for backward compatibility, before 3.4, if result is empty, # instead of throwing error, it skips the result # Maybe we drop this behaviour for 4.0 @@ -426,8 +501,11 @@ def assemble_feature_vector( if len(self.return_feature_value_handlers) > 0: self.apply_return_value_handlers(result_dict, client=client) - if len(self.transformation_functions) > 0: - self.apply_transformation(result_dict) + if ( + len(self.model_dependent_transformation_functions) > 0 + or len(self.on_demand_transformation_functions) > 0 + ): + self.apply_transformation(result_dict, request_parameters) _logger.debug("Assembled and transformed dict feature vector: %s", result_dict) @@ -473,17 +551,19 @@ def handle_feature_vector_return_type( return pd.DataFrame([feature_vectorz]) elif batch: return pd.DataFrame( - feature_vectorz, columns=self._feature_vector_col_name + feature_vectorz, columns=self.transformed_feature_vector_col_name ) else: pandas_df = pd.DataFrame(feature_vectorz).transpose() - pandas_df.columns = self._feature_vector_col_name + pandas_df.columns = self.transformed_feature_vector_col_name return pandas_df elif return_type.lower() == "polars": _logger.debug("Returning feature vector as polars dataframe") return pl.DataFrame( feature_vectorz if batch else [feature_vectorz], - schema=self._feature_vector_col_name if not inference_helper else None, + schema=self.transformed_feature_vector_col_name + if not inference_helper + else None, orient="row", ) else: @@ -630,9 +710,24 @@ def _set_default_client( self.default_client = self.DEFAULT_SQL_CLIENT self._init_sql_client = True - def apply_transformation(self, row_dict: dict): - _logger.debug("Applying transformation functions.") - for tf in self.transformation_functions: + def apply_transformation(self, row_dict: dict, request_parameter: Dict[str, Any]): + _logger.debug("Applying On-Demand transformation functions.") + for tf in self._on_demand_transformation_functions: + # Check if the feature is provided as a request parameter; if not, get it from the retrieved feature vector.
+ features = [ + pd.Series(request_parameter[feature]) + if feature in request_parameter.keys() + else pd.Series(row_dict[feature]) + for feature in tf.hopsworks_udf.transformation_features + ] + on_demand_feature = tf.hopsworks_udf.get_udf(force_python_udf=True)( + *features + ) # Get only python compatible UDF irrespective of engine + + row_dict[on_demand_feature.name] = on_demand_feature.values[0] + + _logger.debug("Applying Model-Dependent transformation functions.") + for tf in self.model_dependent_transformation_functions: features = [ pd.Series(row_dict[feature]) for feature in tf.hopsworks_udf.transformation_features @@ -995,10 +1090,16 @@ def per_serving_key_features(self) -> Dict[str, set[str]]: return self._per_serving_key_features @property - def transformation_functions( + def model_dependent_transformation_functions( + self, + ) -> Optional[List[transformation_function.TransformationFunction]]: + return self._model_dependent_transformation_functions + + @property + def on_demand_transformation_functions( self, - ) -> Optional[List[transformation_functions.TransformationFunction]]: - return self._transformation_functions + ) -> Optional[List[transformation_function.TransformationFunction]]: + return self._on_demand_transformation_functions @property def return_feature_value_handlers(self) -> Dict[str, Callable]: @@ -1070,7 +1171,9 @@ def transformed_feature_vector_col_name(self): if self._transformed_feature_vector_col_name is None: transformation_features = [] output_column_names = [] - for transformation_function in self._transformation_functions: + for ( + transformation_function + ) in self._model_dependent_transformation_functions: transformation_features += ( transformation_function.hopsworks_udf.transformation_features ) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index cc50428632..fea3dd0301 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -804,6 +804,9 @@ def parse_schema_feature_group( self, dataframe: Union[pd.DataFrame, pl.DataFrame], time_travel_format: Optional[str] = None, + transformation_functions: Optional[ + List[transformation_function.TransformationFunction] + ] = None, ) -> List[feature.Feature]: if isinstance(dataframe, pd.DataFrame): arrow_schema = pa.Schema.from_pandas(dataframe, preserve_index=False) @@ -812,6 +815,19 @@ def parse_schema_feature_group( ): arrow_schema = dataframe.to_arrow().schema features = [] + transformed_features = [] + dropped_features = [] + + if transformation_functions: + for tf in transformation_functions: + transformed_features.append( + feature.Feature( + tf.hopsworks_udf.output_column_names[0], + tf.hopsworks_udf.return_types[0], + on_demand=True, + ) + ) + dropped_features.extend(tf.hopsworks_udf.dropped_features) for feat_name in arrow_schema.names: name = util.autofix_feature_name(feat_name) try: @@ -820,8 +836,10 @@ def parse_schema_feature_group( ) except ValueError as e: raise FeatureStoreException(f"Feature '{name}': {str(e)}") from e - features.append(feature.Feature(name, converted_type)) - return features + if name not in dropped_features: + features.append(feature.Feature(name, converted_type)) + + return features + transformed_features def parse_schema_training_dataset( self, dataframe: Union[pd.DataFrame, pl.DataFrame] @@ -842,6 +860,11 @@ def save_dataframe( online_write_options: Dict[str, Any], validation_id: Optional[int] = None, ) -> Optional[job.Job]: + if feature_group.transformation_functions: + dataframe = self._apply_transformation_function( 
+ feature_group.transformation_functions, dataframe + ) + if ( isinstance(feature_group, ExternalFeatureGroup) and feature_group.online_enabled @@ -1319,7 +1342,7 @@ def _apply_transformation_function( # Raises `FeatureStoreException`: If any of the features mentioned in the transformation function is not present in the Feature View. """ - transformed_features = set() + dropped_features = set() if isinstance(dataset, pl.DataFrame) or isinstance( dataset, pl.dataframe.frame.DataFrame @@ -1342,7 +1365,7 @@ def _apply_transformation_function( f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." ) - transformed_features.update(tf.hopsworks_udf.transformation_features) + dropped_features.update(tf.hopsworks_udf.dropped_features) dataset = pd.concat( [ dataset, @@ -1357,7 +1380,7 @@ def _apply_transformation_function( ], axis=1, ) - dataset = dataset.drop(transformed_features, axis=1) + dataset = dataset.drop(dropped_features, axis=1) return dataset @staticmethod @@ -1536,8 +1559,11 @@ def acked(err: Exception, msg: Any) -> None: elif not isinstance( feature_group, ExternalFeatureGroup ) and self._start_offline_materialization(offline_write_options): - if (not offline_write_options.get("skip_offsets", False) - and self._job_api.last_execution(feature_group.materialization_job)): # always skip offsets if executing job for the first time + if not offline_write_options.get( + "skip_offsets", False + ) and self._job_api.last_execution( + feature_group.materialization_job + ): # always skip offsets if executing job for the first time # don't provide the current offsets (read from where the job last left off) initial_check_point = "" # provide the initial_check_point as it will reduce the read amplification of materialization job diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index a22be38cc0..60f5f14854 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -351,6 +351,10 @@ def save_dataframe( validation_id=None, ): try: + if feature_group.transformation_functions: + dataframe = self._apply_transformation_function( + feature_group.transformation_functions, dataframe + ) if ( isinstance(feature_group, fg_mod.ExternalFeatureGroup) and feature_group.online_enabled @@ -395,6 +399,11 @@ def save_stream_dataframe( checkpoint_dir, write_options, ): + if feature_group.transformation_functions: + dataframe = self._apply_transformation_function( + feature_group.transformation_functions, dataframe + ) + write_options = self._get_kafka_config( feature_group.feature_store_id, write_options ) @@ -1115,8 +1124,29 @@ def read_options(self, data_format, provided_options): options.update(provided_options) return options - def parse_schema_feature_group(self, dataframe, time_travel_format=None): + def parse_schema_feature_group( + self, + dataframe, + time_travel_format=None, + transformation_functions: Optional[ + List[transformation_function.TransformationFunction] + ] = None, + ): features = [] + transformed_features = [] + dropped_features = [] + + if transformation_functions: + for tf in transformation_functions: + transformed_features.append( + feature.Feature( + tf.hopsworks_udf.output_column_names[0], + tf.hopsworks_udf.return_types[0], + on_demand=True, + ) + ) + dropped_features.extend(tf.hopsworks_udf.dropped_features) + using_hudi = time_travel_format == "HUDI" for feat in dataframe.schema: name = 
util.autofix_feature_name(feat.name) @@ -1126,12 +1156,13 @@ def parse_schema_feature_group(self, dataframe, time_travel_format=None): ) except ValueError as e: raise FeatureStoreException(f"Feature '{feat.name}': {str(e)}") from e - features.append( - feature.Feature( - name, converted_type, feat.metadata.get("description", None) + if name not in dropped_features: + features.append( + feature.Feature( + name, converted_type, feat.metadata.get("description", None) + ) ) - ) - return features + return features + transformed_features def parse_schema_training_dataset(self, dataframe): return [ @@ -1244,7 +1275,7 @@ def _apply_transformation_function( # Raises `FeatureStoreException`: If any of the features mentioned in the transformation function is not present in the Feature View. """ - transformed_features = set() + dropped_features = set() transformations = [] transformation_features = [] output_col_names = [] @@ -1260,7 +1291,7 @@ def _apply_transformation_function( f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." ) - transformed_features.update(tf.hopsworks_udf.transformation_features) + dropped_features.update(tf.hopsworks_udf.dropped_features) pandas_udf = hopsworks_udf.get_udf() output_col_name = hopsworks_udf.output_column_names[0] @@ -1276,7 +1307,7 @@ def _apply_transformation_function( untransformed_columns = [] # Untransformed column maintained as a list since order is imported while selecting features. for column in dataset.columns: - if column not in transformed_features: + if column not in dropped_features: untransformed_columns.append(column) # Applying transformations transformed_dataset = dataset.select( diff --git a/python/hsfs/feature.py b/python/hsfs/feature.py index 89f19b060d..412929a75e 100644 --- a/python/hsfs/feature.py +++ b/python/hsfs/feature.py @@ -53,6 +53,7 @@ def __init__( "hsfs.feature_group.SpineGroup", ] ] = None, + on_demand: bool = False, **kwargs, ) -> None: self._name = util.autofix_feature_name(name) @@ -67,6 +68,7 @@ def __init__( self._feature_group_id = feature_group.id else: self._feature_group_id = feature_group_id + self._on_demand = on_demand def to_dict(self) -> Dict[str, Any]: """Get structured info about specific Feature in python dictionary format. 
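To make the insert-time behaviour added above concrete: both engines now compute each on-demand feature from its input columns, append it under the UDF's name, and remove any inputs the UDF listed in `drop=[...]`. Here is a minimal, self-contained sketch of that mechanic (an illustration, not the library code; `plus_one` and the column names are hypothetical):

```python
import pandas as pd

def apply_on_demand(df, func, input_features, output_name, dropped_features):
    # Compute the on-demand column from its input features...
    df[output_name] = func(*[df[f] for f in input_features])
    # ...then drop the inputs that the UDF declared with drop=[...].
    return df.drop(columns=list(dropped_features))

df = pd.DataFrame({"col1": [1, 2], "col2": [10, 20]})
result = apply_on_demand(df, lambda s: s + 1, ["col1"], "plus_one", ["col1"])
print(list(result.columns))  # ['col2', 'plus_one'] -- 'col1' was dropped
```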
@@ -93,6 +95,7 @@ def to_dict(self) -> Dict[str, Any]: "onlineType": self._online_type, "defaultValue": self._default_value, "featureGroupId": self._feature_group_id, + "onDemand": self.on_demand, } def json(self) -> str: @@ -206,6 +209,15 @@ def default_value(self, default_value: Optional[str]) -> None: def feature_group_id(self) -> Optional[int]: return self._feature_group_id + @property + def on_demand(self) -> bool: + """Whether the feature is a on-demand feature computed using on-demand transformation functions""" + return self._on_demand + + @on_demand.setter + def on_demand(self, on_demand) -> None: + self._on_demand = on_demand + def __lt__(self, other: Any) -> "filter.Filter": return filter.Filter(self, filter.Filter.LT, other) diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py index de5577417c..0bbeb26552 100644 --- a/python/hsfs/feature_group.py +++ b/python/hsfs/feature_group.py @@ -73,8 +73,10 @@ from hsfs.embedding import EmbeddingIndex from hsfs.expectation_suite import ExpectationSuite from hsfs.ge_validation_result import ValidationResult +from hsfs.hopsworks_udf import HopsworksUdf, UDFType from hsfs.statistics import Statistics from hsfs.statistics_config import StatisticsConfig +from hsfs.transformation_function import TransformationFunction from hsfs.validation_report import ValidationReport @@ -543,8 +545,13 @@ def get_storage_connector(self): """ storage_connector_provenance = self.get_storage_connector_provenance() - if storage_connector_provenance.inaccessible or storage_connector_provenance.deleted: - _logger.info("The parent storage connector is deleted or inaccessible. For more details access `get_storage_connector_provenance`") + if ( + storage_connector_provenance.inaccessible + or storage_connector_provenance.deleted + ): + _logger.info( + "The parent storage connector is deleted or inaccessible. For more details access `get_storage_connector_provenance`" + ) if storage_connector_provenance.accessible: return storage_connector_provenance.accessible[0] @@ -2022,6 +2029,9 @@ def __init__( Union[Dict[str, Any], "deltastreamer_jobconf.DeltaStreamerJobConf"] ] = None, deprecated: bool = False, + transformation_functions: Optional[ + List[Union[TransformationFunction, HopsworksUdf]] + ] = None, **kwargs, ) -> None: super().__init__( @@ -2124,6 +2134,44 @@ def __init__( self._feature_writers: Optional[Dict[str, callable]] = None self._writer: Optional[callable] = None + # On-Demand Transformation Functions + self._transformation_functions: List[TransformationFunction] = ( + [ + TransformationFunction( + featurestore_id, + hopsworks_udf=transformation_function, + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + if not isinstance(transformation_function, TransformationFunction) + else transformation_function + for transformation_function in transformation_functions + ] + if transformation_functions + else [] + ) + + if self._transformation_functions: + self._transformation_functions = ( + FeatureGroup._sort_transformation_functions( + self._transformation_functions + ) + ) + + @staticmethod + def _sort_transformation_functions( + transformation_functions: List[TransformationFunction], + ) -> List[TransformationFunction]: + """ + Function that sorts transformation functions in the order of the output column names. + The list of transformation functions are sorted based on the output columns names to maintain consistent ordering. + # Arguments + transformation_functions: `List[TransformationFunction]`. 
List of transformation functions to be sorted + # Returns + `List[TransformationFunction]`: List of transformation functions to be sorted + """ + return sorted(transformation_functions, key=lambda x: x.output_column_names[0]) + def read( self, wallclock_time: Optional[Union[str, int, datetime, date]] = None, @@ -3204,6 +3252,17 @@ def from_response_json( json_decamelized["embedding_index"] = EmbeddingIndex.from_response_json( json_decamelized["embedding_index"] ) + if "transformation_functions" in json_decamelized: + transformation_functions = json_decamelized["transformation_functions"] + json_decamelized["transformation_functions"] = [ + TransformationFunction.from_response_json( + { + **transformation_function, + "transformation_type": UDFType.ON_DEMAND, + } + ) + for transformation_function in transformation_functions + ] return cls(**json_decamelized) for fg in json_decamelized: if "type" in fg: @@ -3214,6 +3273,17 @@ def from_response_json( fg["embedding_index"] = EmbeddingIndex.from_response_json( fg["embedding_index"] ) + if "transformation_functions" in fg: + transformation_functions = fg["transformation_functions"] + fg["transformation_functions"] = [ + TransformationFunction.from_response_json( + { + **transformation_function, + "transformation_type": UDFType.ON_DEMAND, + } + ) + for transformation_function in transformation_functions + ] return [cls(**fg) for fg in json_decamelized] def update_from_response_json(self, json_dict: Dict[str, Any]) -> "FeatureGroup": @@ -3224,6 +3294,17 @@ def update_from_response_json(self, json_dict: Dict[str, Any]) -> "FeatureGroup" json_decamelized["embedding_index"] = EmbeddingIndex.from_response_json( json_decamelized["embedding_index"] ) + if "transformation_functions" in json_decamelized: + transformation_functions = json_decamelized["transformation_functions"] + json_decamelized["transformation_functions"] = [ + TransformationFunction.from_response_json( + { + **transformation_function, + "transformation_type": UDFType.ON_DEMAND, + } + ) + for transformation_function in transformation_functions + ] self.__init__(**json_decamelized) return self @@ -3270,6 +3351,7 @@ def to_dict(self) -> Dict[str, Any]: "topicName": self.topic_name, "notificationTopicName": self.notification_topic_name, "deprecated": self.deprecated, + "transformationFunctions": self._transformation_functions, } if self.embedding_index: fg_meta_dict["embeddingIndex"] = self.embedding_index.to_dict() @@ -3376,6 +3458,13 @@ def statistics(self) -> "Statistics": ) return super().statistics + @property + def transformation_functions( + self, + ) -> List[TransformationFunction]: + """Get transformation functions.""" + return self._transformation_functions + @description.setter def description(self, new_description: Optional[str]) -> None: self._description = new_description @@ -3402,6 +3491,13 @@ def stream(self, stream: bool) -> None: def parents(self, new_parents: "explicit_provenance.Links") -> None: self._parents = new_parents + @transformation_functions.setter + def transformation_functions( + self, + transformation_functions: List[TransformationFunction], + ) -> None: + self._transformation_functions = transformation_functions + @typechecked class ExternalFeatureGroup(FeatureGroupBase): diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index 11eeac1983..4da096d80c 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -510,6 +510,9 @@ def create_feature_group( parents: Optional[List[feature_group.FeatureGroup]] = None, 
topic_name: Optional[str] = None, notification_topic_name: Optional[str] = None, + transformation_functions: Optional[ + List[Union[TransformationFunction, HopsworksUdf]] + ] = None, ) -> "feature_group.FeatureGroup": """Create a feature group metadata object. @@ -592,6 +595,7 @@ def create_feature_group( defaults to using project topic. notification_topic_name: Optionally, define the name of the topic used for sending notifications when entries are inserted or updated on the online feature store. If left undefined no notifications are sent. + transformation_functions: A list of Hopsworks UDF's. Defaults to `None`, no transformations. # Returns `FeatureGroup`. The feature group metadata object. @@ -616,6 +620,7 @@ def create_feature_group( parents=parents or [], topic_name=topic_name, notification_topic_name=notification_topic_name, + transformation_functions=transformation_functions, ) feature_group_object.feature_store = self return feature_group_object @@ -642,6 +647,9 @@ def get_or_create_feature_group( parents: Optional[List[feature_group.FeatureGroup]] = None, topic_name: Optional[str] = None, notification_topic_name: Optional[str] = None, + transformation_functions: Optional[ + List[Union[TransformationFunction, HopsworksUdf]] + ] = None, ) -> Union[ "feature_group.FeatureGroup", "feature_group.ExternalFeatureGroup", @@ -726,6 +734,7 @@ def get_or_create_feature_group( defaults to using project topic. notification_topic_name: Optionally, define the name of the topic used for sending notifications when entries are inserted or updated on the online feature store. If left undefined no notifications are sent. + transformation_functions: A list of Hopsworks UDF's. Defaults to `None`, no transformations. # Returns `FeatureGroup`. The feature group metadata object. @@ -759,6 +768,7 @@ def get_or_create_feature_group( parents=parents or [], topic_name=topic_name, notification_topic_name=notification_topic_name, + transformation_functions=transformation_functions, ) feature_group_object.feature_store = self return feature_group_object diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 9ca317a473..f2f5019160 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -54,7 +54,7 @@ from hsfs.core.vector_db_client import VectorDbClient from hsfs.decorators import typechecked from hsfs.feature import Feature -from hsfs.hopsworks_udf import HopsworksUdf +from hsfs.hopsworks_udf import HopsworksUdf, UDFType from hsfs.statistics import Statistics from hsfs.statistics_config import StatisticsConfig from hsfs.training_dataset_split import TrainingDatasetSplit @@ -126,6 +126,7 @@ def __init__( self.featurestore_id, hopsworks_udf=transformation_function, version=1, + transformation_type=UDFType.MODEL_DEPENDENT, ) if not isinstance(transformation_function, TransformationFunction) else transformation_function @@ -493,6 +494,7 @@ def get_feature_vector( allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, + request_parameters: Optional[Dict[str, Any]] = None, ) -> Union[List[Any], pd.DataFrame, np.ndarray, pl.DataFrame]: """Returns assembled feature vector from online feature store. Call [`feature_view.init_serving`](#init_serving) before this method if the following configurations are needed. @@ -566,6 +568,7 @@ def get_feature_vector( force_sql_client: boolean, defaults to False. If set to True, reads from online feature store using the SQL client if initialised. 
allow_missing: Setting to `True` returns feature vectors with missing values. + request_parameters: Request parameters required by on-demand transformation functions. # Returns `list`, `pd.DataFrame`, `polars.DataFrame` or `np.ndarray` if `return type` is set to `"list"`, `"pandas"`, `"polars"` or `"numpy"` @@ -591,6 +594,7 @@ def get_feature_vector( vector_db_features=vector_db_features, force_rest_client=force_rest_client, force_sql_client=force_sql_client, + request_parameters=request_parameters, ) def get_feature_vectors( @@ -602,6 +606,7 @@ def get_feature_vectors( allow_missing: bool = False, force_rest_client: bool = False, force_sql_client: bool = False, + request_parameters: Optional[List[Dict[str, Any]]] = None, ) -> Union[List[List[Any]], pd.DataFrame, np.ndarray, pl.DataFrame]: """Returns assembled feature vectors in batches from online feature store. Call [`feature_view.init_serving`](#init_serving) before this method if the following configurations are needed. @@ -700,6 +705,7 @@ def get_feature_vectors( vector_db_features=vector_db_features, force_rest_client=force_rest_client, force_sql_client=force_sql_client, + request_parameters=request_parameters, ) def get_inference_helper( @@ -853,6 +859,10 @@ def find_neighbors( the number of results returned may be less than k. Try using a large value of k and extract the top k items from the results if needed. + !!! warning "Duplicate column error in Polars" + If the feature view has duplicate column names, attempting to create a polars DataFrame + will raise an error. To avoid this, set `return_type` to `"list"` or `"pandas"`. + # Arguments embedding: The target embedding for which neighbors are to be found. feature: The feature used to compute similarity score. Required only if there @@ -1024,7 +1034,7 @@ def get_batch_data( start_time, end_time, self._batch_scoring_server.training_dataset_version, - self._batch_scoring_server._transformation_functions, + self._batch_scoring_server._model_dependent_transformation_functions, read_options, spine, primary_keys, @@ -3442,7 +3452,12 @@ def from_response_json(cls, json_dict: Dict[str, Any]) -> "FeatureView": featurestore_name=json_decamelized.get("featurestore_name", None), serving_keys=serving_keys, transformation_functions=[ - TransformationFunction.from_response_json(transformation_function) + TransformationFunction.from_response_json( + { + **transformation_function, + "transformation_type": UDFType.MODEL_DEPENDENT, + } + ) for transformation_function in transformation_functions ] if transformation_functions diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index b9f8bde5bb..0a005134a6 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -21,6 +21,7 @@ import warnings from dataclasses import dataclass from datetime import date, datetime, time +from enum import Enum from typing import Any, Callable, Dict, List, Optional, Tuple, Union import humps @@ -31,7 +32,14 @@ from hsfs.transformation_statistics import TransformationStatistics -def udf(return_type: Union[List[type], type]) -> "HopsworksUdf": +class UDFType(Enum): + MODEL_DEPENDENT = "model_dependent" + ON_DEMAND = "on_demand" + + +def udf( + return_type: Union[List[type], type], drop: Optional[Union[str, List[str]]] = None +) -> "HopsworksUdf": """ Create an User Defined Function that can be and used within the Hopsworks Feature Store. 
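Taken together with the `drop` argument introduced here, a feature-group-level on-demand transformation would be declared roughly as follows. This is a hedged usage sketch based on this patch: the `log_amount` function, the feature names, and the `fs` feature store handle are assumed for illustration.

```python
import numpy as np
import pandas as pd
from hopsworks import udf

# On-demand transformation: computes log1p(amount) and drops the raw
# "amount" column from the feature group schema.
@udf(float, drop=["amount"])
def log_amount(amount: pd.Series) -> pd.Series:
    return np.log1p(amount)

# `fs` is assumed to be a feature store handle obtained elsewhere.
fg = fs.create_feature_group(
    name="transactions",  # illustrative name
    version=1,
    primary_key=["tid"],
    transformation_functions=[log_amount("amount")],
)
```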
@@ -46,13 +54,14 @@ def udf(return_type: Union[List[type], type]) -> "HopsworksUdf": ```python from hopsworks import udf - @udf(float) + @udf(float) def add_one(data1 : pd.Series): return data1 + 1 ``` # Arguments return_type: `list`. The output types of the defined UDF + drop: `List[str]`. The features to be dropped after application of transformation functions # Returns `HopsworksUdf`: The metadata object for hopsworks UDF's. @@ -62,7 +71,7 @@ def add_one(data1 : pd.Series): """ def wrapper(func: Callable) -> HopsworksUdf: - udf = HopsworksUdf(func=func, return_types=return_type) + udf = HopsworksUdf(func=func, return_types=return_type, dropped_features=drop) return udf return wrapper @@ -127,11 +136,17 @@ def __init__( return_types: Union[List[type], type, List[str], str], name: Optional[str] = None, transformation_features: Optional[List[TransformationFeature]] = None, + dropped_features: Optional[List[str]] = None, + feature_name_prefix: Optional[str] = None, ): self._return_types: List[str] = HopsworksUdf._validate_and_convert_output_types( return_types ) + self._feature_name_prefix: Optional[str] = ( + feature_name_prefix # Prefix to be added to feature names + ) + self._function_name: str = func.__name__ if name is None else name self._function_source: str = ( @@ -152,9 +167,55 @@ def __init__( HopsworksUdf._format_source_code(self._function_source) ) + self._dropped_features: List[str] = ( + HopsworksUdf._validate_and_convert_drop_features( + dropped_features, self.transformation_features, feature_name_prefix + ) + ) + self._statistics: Optional[TransformationStatistics] = None - self._output_column_names: List[str] = self._get_output_column_names() + self._udf_type: UDFType = None + + self._output_column_names: List[str] = [] + + @staticmethod + def _validate_and_convert_drop_features( + dropped_features: Union[str, List[str]], + transformation_feature: List[str], + feature_name_prefix: str, + ) -> List[str]: + """ + Function that converts dropped features to a list and validates if the dropped feature is present in the transformation function + # Arguments + dropped_features: `Union[str, List[str]]`. Features of be dropped. + transformation_feature: `List[str]`. Features to be transformed in the UDF + # Returns + `List[str]`: A list of features to be dropped. + """ + if not dropped_features: + return [] + + dropped_features = ( + [dropped_features] + if not isinstance(dropped_features, list) + else dropped_features + ) + + feature_name_prefix = feature_name_prefix if feature_name_prefix else "" + + missing_drop_features = [] + for dropped_feature in dropped_features: + if feature_name_prefix + dropped_feature not in transformation_feature: + missing_drop_features.append(dropped_feature) + + if missing_drop_features: + missing_drop_features = "', '".join(missing_drop_features) + raise FeatureStoreException( + f"Cannot drop features '{missing_drop_features}' as they are not features given as arguments in the defined UDF." 
+ ) + + return dropped_features @staticmethod def _validate_and_convert_output_types( @@ -365,13 +426,18 @@ def _get_output_column_names(self) -> str: # Returns `List[str]`: List of feature names for the transformed columns """ - _BASE_COLUMN_NAME = ( - f'{self.function_name}_{"_".join(self.transformation_features)}_' - ) - if len(self.return_types) > 1: - return [f"{_BASE_COLUMN_NAME}{i}" for i in range(len(self.return_types))] - else: - return [f"{_BASE_COLUMN_NAME}"] + if self._udf_type == UDFType.MODEL_DEPENDENT: + _BASE_COLUMN_NAME = ( + f'{self.function_name}_{"-".join(self.transformation_features)}_' + ) + if len(self.return_types) > 1: + return [ + f"{_BASE_COLUMN_NAME}{i}" for i in range(len(self.return_types)) + ] + else: + return [f"{_BASE_COLUMN_NAME}"] + elif self._udf_type == UDFType.ON_DEMAND: + return [self.function_name] def _create_pandas_udf_return_schema_from_list(self) -> str: """ @@ -479,6 +545,13 @@ def __call__(self, *features: List[str]) -> "HopsworksUdf": raise FeatureStoreException( f'Feature names provided must be string "{arg}" is not string' ) + transformation_feature_name = self.transformation_features + index_dropped_features = [ + transformation_feature_name.index(dropped_feature) + for dropped_feature in self.dropped_features + ] + updated_dropped_features = [features[index] for index in index_dropped_features] + # Create a copy of the UDF to associate it with new feature names. udf = copy.deepcopy(self) @@ -491,6 +564,7 @@ def __call__(self, *features: List[str]) -> "HopsworksUdf": ) ] udf.output_column_names = udf._get_output_column_names() + udf.dropped_features = updated_dropped_features return udf def update_return_type_one_hot(self): @@ -541,10 +615,12 @@ def to_dict(self) -> Dict[str, Any]: "sourceCode": self._function_source, "outputTypes": self.return_types, "transformationFeatures": self.transformation_features, + "droppedFeatures": self.dropped_features, "statisticsArgumentNames": self._statistics_argument_names if self.statistics_required else None, "name": self._function_name, + "featureNamePrefix": self._feature_name_prefix, } def json(self) -> str: @@ -572,12 +648,17 @@ def from_response_json( json_decamelized = humps.decamelize(json_dict) function_source_code = json_decamelized["source_code"] function_name = json_decamelized["name"] + feature_name_prefix = json_decamelized.get("feature_name_prefix", None) output_types = [ output_type.strip() for output_type in json_decamelized["output_types"] ] transformation_features = [ feature.strip() for feature in json_decamelized["transformation_features"] ] + dropped_features = [ + dropped_feature.strip() + for dropped_feature in json_decamelized["dropped_features"] + ] statistics_features = ( [ feature.strip() @@ -590,10 +671,6 @@ def from_response_json( # Reconstructing statistics arguments. arg_list, _, _, _ = HopsworksUdf._parse_function_signature(function_source_code) - transformation_features = ( - arg_list if not transformation_features else transformation_features - ) - if statistics_features: transformation_features = [ TransformationFeature( @@ -615,11 +692,28 @@ def from_response_json( return_types=output_types, name=function_name, transformation_features=transformation_features, + dropped_features=dropped_features, + feature_name_prefix=feature_name_prefix, ) # Set transformation features if already set. 
return hopsworks_udf + def _validate_udf_type(self): + if self.udf_type is None: + raise FeatureStoreException("UDF Type cannot be None") + + if self._udf_type == UDFType.ON_DEMAND: + if len(self.return_types) > 1: + raise FeatureStoreException( + "On-Demand Transformation functions can only return one column as output" + ) + + if self.statistics_required: + raise FeatureStoreException( + "On-Demand Transformation functions cannot use statistics, please remove statistics parameters from the functions" + ) + @property def return_types(self) -> List[str]: """Get the output types of the UDF""" @@ -648,17 +742,30 @@ def transformation_statistics( @property def output_column_names(self) -> List[str]: """Output columns names of the transformation function""" - return self._output_column_names + if self._feature_name_prefix: + return [ + self._feature_name_prefix + output_col_name + for output_col_name in self._output_column_names + ] + else: + return self._output_column_names @property def transformation_features(self) -> List[str]: """ List of feature names to be used in the User Defined Function. """ - return [ - transformation_feature.feature_name - for transformation_feature in self._transformation_features - ] + if self._feature_name_prefix: + return [ + self._feature_name_prefix + transformation_feature.feature_name + for transformation_feature in self._transformation_features + ] + + else: + return [ + transformation_feature.feature_name + for transformation_feature in self._transformation_features + ] @property def statistics_features(self) -> List[str]: @@ -692,6 +799,33 @@ def _statistics_argument_names(self) -> List[str]: if transformation_feature.statistic_argument_name is not None ] + @property + def udf_type(self) -> UDFType: + """Type of the UDF : Can be \"model dependent\" or \"on-demand\" """ + return self._udf_type + + @udf_type.setter + def udf_type(self, udf_type: UDFType) -> None: + self._udf_type = udf_type + self._validate_udf_type() + self._output_column_names = self._get_output_column_names() + + @property + def dropped_features(self) -> List[str]: + if self._feature_name_prefix: + return [ + self._feature_name_prefix + dropped_feature + for dropped_feature in self._dropped_features + ] + else: + return self._dropped_features + + @dropped_features.setter + def dropped_features(self, features: List[str]) -> None: + self._dropped_features = HopsworksUdf._validate_and_convert_drop_features( + features, self.transformation_features, self._feature_name_prefix + ) + @transformation_statistics.setter def transformation_statistics( self, statistics: List[FeatureDescriptiveStatistics] @@ -713,3 +847,6 @@ def output_column_names(self, output_col_names: Union[str, List[str]]) -> None: ) else: self._output_column_names = output_col_names + + def __repr__(self): + return f'{self.function_name}({", ".join(self.transformation_features)})' diff --git a/python/hsfs/training_dataset_feature.py b/python/hsfs/training_dataset_feature.py index a06637abe2..3aa3f6a81f 100644 --- a/python/hsfs/training_dataset_feature.py +++ b/python/hsfs/training_dataset_feature.py @@ -15,10 +15,14 @@ # from __future__ import annotations +from typing import Optional + import humps from hsfs import feature as feature_mod from hsfs import feature_group as feature_group_mod from hsfs import util +from hsfs.hopsworks_udf import UDFType +from hsfs.transformation_function import TransformationFunction class TrainingDatasetFeature: @@ -32,6 +36,7 @@ def __init__( label=False, inference_helper_column=False, 
training_helper_column=False, + transformation_function: Optional[TransformationFunction] = None, **kwargs, ): self._name = util.autofix_feature_name(name) @@ -47,6 +52,10 @@ def __init__( self._inference_helper_column = inference_helper_column self._training_helper_column = training_helper_column + self._on_demand_transformation_function: Optional[TransformationFunction] = ( + transformation_function if transformation_function else None + ) + def to_dict(self): return { "name": self._name, @@ -57,11 +66,21 @@ def to_dict(self): "trainingHelperColumn": self._training_helper_column, "featureGroupFeatureName": self._feature_group_feature_name, "featuregroup": self._feature_group, + "transformation_function": self._on_demand_transformation_function, } @classmethod def from_response_json(cls, json_dict): json_decamelized = humps.decamelize(json_dict) + if json_decamelized.get("transformation_function", False): + json_decamelized["transformation_function"]["transformation_type"] = ( + UDFType.ON_DEMAND + ) + json_decamelized["transformation_function"] = ( + TransformationFunction.from_response_json( + json_decamelized.get("transformation_function") + ) + ) return cls(**json_decamelized) def is_complex(self): @@ -110,6 +129,11 @@ def inference_helper_column(self): def inference_helper_column(self, inference_helper_column): self._inference_helper_column = inference_helper_column + @property + def on_demand_transformation_function(self) -> TransformationFunction: + """Whether the feature is a on-demand feature computed using on-demand transformation functions""" + return self._on_demand_transformation_function + @property def training_helper_column(self): """Indicator if it is feature.""" @@ -128,4 +152,4 @@ def feature_group_feature_name(self): return self._feature_group_feature_name def __repr__(self): - return f"Training Dataset Feature({self._name!r}, {self._type!r}, {self._index!r}, {self._label}, {self._feature_group_feature_name}, {self._feature_group.id!r})" + return f"Training Dataset Feature({self._name!r}, {self._type!r}, {self._index!r}, {self._label}, {self._feature_group_feature_name}, {self._feature_group.id!r}, {self.on_demand_transformation_function})" diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index a3f6a295d7..65535aa539 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -23,7 +23,7 @@ from hsfs.client.exceptions import FeatureStoreException from hsfs.core import transformation_function_engine from hsfs.decorators import typechecked -from hsfs.hopsworks_udf import HopsworksUdf +from hsfs.hopsworks_udf import HopsworksUdf, UDFType @typechecked @@ -44,6 +44,7 @@ def __init__( hopsworks_udf: HopsworksUdf, version: Optional[int] = None, id: Optional[int] = None, + transformation_type: Optional[UDFType] = None, type=None, items=None, count=None, @@ -65,6 +66,7 @@ def __init__( ) self._hopsworks_udf: HopsworksUdf = hopsworks_udf + self._hopsworks_udf.udf_type = transformation_type def save(self) -> None: """Save a transformation function into the backend. 
@@ -233,3 +235,11 @@ def hopsworks_udf(self) -> HopsworksUdf: def output_column_names(self) -> List[str]: """Output column names of transformation functions""" return self._hopsworks_udf._output_column_names + + def __repr__(self): + if self.hopsworks_udf._udf_type == UDFType.MODEL_DEPENDENT: + return ( + f"Model-Dependent Transformation Function : {repr(self.hopsworks_udf)}" + ) + else: + return f"On-Demand Transformation Function : {repr(self.hopsworks_udf)}" diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py index 8494d018f1..6595207ed3 100644 --- a/python/tests/test_hopswork_udf.py +++ b/python/tests/test_hopswork_udf.py @@ -19,7 +19,7 @@ import pandas as pd import pytest from hsfs.client.exceptions import FeatureStoreException -from hsfs.hopsworks_udf import HopsworksUdf, TransformationFeature, udf +from hsfs.hopsworks_udf import HopsworksUdf, TransformationFeature, UDFType, udf class TestHopsworksUdf: @@ -330,14 +330,21 @@ def test_generate_output_column_names_one_argument_one_output_type(self): def test_func(col1): return col1 + 1 + test_func.udf_type = UDFType.MODEL_DEPENDENT assert test_func._get_output_column_names() == ["test_func_col1_"] + test_func.udf_type = UDFType.ON_DEMAND + assert test_func._get_output_column_names() == ["test_func"] + def test_generate_output_column_names_multiple_argument_one_output_type(self): @udf(int) def test_func(col1, col2, col3): return col1 + 1 - assert test_func._get_output_column_names() == ["test_func_col1_col2_col3_"] + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == ["test_func_col1-col2-col3_"] + test_func.udf_type = UDFType.ON_DEMAND + assert test_func._get_output_column_names() == ["test_func"] def test_generate_output_column_names_single_argument_multiple_output_type(self): @udf([int, float, int]) @@ -346,6 +353,7 @@ def test_func(col1): {"col1": [col1 + 1], "col2": [col1 + 1], "col3": [col1 + 1]} ) + test_func.udf_type = UDFType.MODEL_DEPENDENT assert test_func._get_output_column_names() == [ "test_func_col1_0", "test_func_col1_1", @@ -359,10 +367,11 @@ def test_func(col1, col2, col3): {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} ) + test_func.udf_type = UDFType.MODEL_DEPENDENT assert test_func._get_output_column_names() == [ - "test_func_col1_col2_col3_0", - "test_func_col1_col2_col3_1", - "test_func_col1_col2_col3_2", + "test_func_col1-col2-col3_0", + "test_func_col1-col2-col3_1", + "test_func_col1-col2-col3_2", ] def test_create_pandas_udf_return_schema_from_list_one_output_type(self): @@ -388,30 +397,45 @@ def test_func(col1): } ) + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert ( test_func._create_pandas_udf_return_schema_from_list() == "`test_func_col1_0` bigint, `test_func_col1_1` double, `test_func_col1_2` string, `test_func_col1_3` date, `test_func_col1_4` timestamp, `test_func_col1_5` timestamp, `test_func_col1_6` boolean" ) def test_hopsworks_wrapper_single_output(self): + test_dataframe = pd.DataFrame({"col1": [1, 2, 3, 4]}) + @udf(int) def test_func(col1): return col1 + 1 - renaming_wrapper_function = test_func.hopsworksUdf_wrapper() + test_func.udf_type = UDFType.MODEL_DEPENDENT - test_dataframe = pd.DataFrame({"col1": [1, 2, 3, 4]}) + renaming_wrapper_function = test_func.hopsworksUdf_wrapper() result = renaming_wrapper_function(test_dataframe["col1"]) assert result.name == "test_func_col1_" assert result.values.tolist() == [2, 3, 4, 5] + test_func.udf_type = UDFType.ON_DEMAND + + renaming_wrapper_function = 
test_func.hopsworksUdf_wrapper() + + result = renaming_wrapper_function(test_dataframe["col1"]) + + assert result.name == "test_func" + assert result.values.tolist() == [2, 3, 4, 5] + def test_hopsworks_wrapper_multiple_output(self): @udf([int, float]) def test_func(col1, col2): return pd.DataFrame({"out1": col1 + 1, "out2": col2 + 2}) + test_func.udf_type = UDFType.MODEL_DEPENDENT + renaming_wrapper_function = test_func.hopsworksUdf_wrapper() test_dataframe = pd.DataFrame( @@ -422,7 +446,7 @@ def test_func(col1, col2): test_dataframe["column1"], test_dataframe["column2"] ) - assert all(result.columns == ["test_func_col1_col2_0", "test_func_col1_col2_1"]) + assert all(result.columns == ["test_func_col1-col2_0", "test_func_col1-col2_1"]) assert result.values.tolist() == [[2, 12], [3, 22], [4, 32], [5, 42]] def test_HopsworkUDf_call_one_argument(self): From e87331eadea4535a0c7bf6715cded130b9aff4a2 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Wed, 3 Jul 2024 23:28:20 +0200 Subject: [PATCH 54/58] fixing unit tests --- python/hsfs/core/training_dataset_engine.py | 19 +---- python/hsfs/engine/python.py | 2 +- python/hsfs/hopsworks_udf.py | 24 ++++-- python/hsfs/training_dataset.py | 19 ----- python/tests/core/test_feature_view_engine.py | 76 ----------------- .../core/test_training_dataset_engine.py | 72 +---------------- .../test_transformation_function_engine.py | 12 ++- python/tests/engine/test_python.py | 10 ++- ...t_python_spark_transformation_functions.py | 79 ++++++++++++------ python/tests/engine/test_spark.py | 19 +++-- .../tests/fixtures/feature_view_fixtures.json | 6 +- .../fixtures/training_dataset_fixtures.json | 12 +-- .../transformation_function_fixtures.json | 21 +++-- python/tests/test_builtin_transformation.py | 81 +++++++++++++++++++ python/tests/test_hopswork_udf.py | 10 +-- python/tests/test_training_dataset.py | 2 - python/tests/test_transformation_function.py | 13 ++- 17 files changed, 221 insertions(+), 256 deletions(-) create mode 100644 python/tests/test_builtin_transformation.py diff --git a/python/hsfs/core/training_dataset_engine.py b/python/hsfs/core/training_dataset_engine.py index 8d47adf165..34907ce3ca 100644 --- a/python/hsfs/core/training_dataset_engine.py +++ b/python/hsfs/core/training_dataset_engine.py @@ -22,7 +22,6 @@ from hsfs.core import ( tags_api, training_dataset_api, - transformation_function_engine, ) @@ -38,11 +37,6 @@ def __init__(self, feature_store_id): feature_store_id ) self._tags_api = tags_api.TagsApi(feature_store_id, self.ENTITY_TYPE) - self._transformation_function_engine = ( - transformation_function_engine.TransformationFunctionEngine( - feature_store_id - ) - ) def save(self, training_dataset, features, user_write_options): if isinstance(features, query.Query): @@ -53,9 +47,6 @@ def save(self, training_dataset, features, user_write_options): ) for label_name in training_dataset.label ] - self._transformation_function_engine.attach_transformation_fn( - training_dataset - ) else: features = engine.get_instance().convert_to_default_dataframe(features) training_dataset._features = ( @@ -66,19 +57,11 @@ def save(self, training_dataset, features, user_write_options): if feature.name == label_name: feature.label = True - # check if user provided transformation functions and throw error as transformation functions work only - # with query objects - if training_dataset.transformation_functions: - raise ValueError( - "Transformation functions can only be applied to training datasets generated from Query object" - ) - if 
len(training_dataset.splits) > 0 and training_dataset.train_split is None: training_dataset.train_split = "train" warnings.warn( "Training dataset splits were defined but no `train_split` (the name of the split that is going to be " - "used for training) was provided. Setting this property to `train`. The statistics of this " - "split will be used for transformation functions.", + "used for training) was provided. Setting this property to `train`. ", stacklevel=1, ) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index fea3dd0301..9c2a4ca279 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -966,7 +966,7 @@ def get_training_data( # training_dataset_obj, feature_view_obj, training_dataset_version # ) return self._apply_transformation_function( - training_dataset_obj.transformation_functions, df + feature_view_obj.transformation_functions, df ) def split_labels( diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index 0a005134a6..a17e432009 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -428,7 +428,7 @@ def _get_output_column_names(self) -> str: """ if self._udf_type == UDFType.MODEL_DEPENDENT: _BASE_COLUMN_NAME = ( - f'{self.function_name}_{"-".join(self.transformation_features)}_' + f'{self.function_name}_{"_".join(self.transformation_features)}_' ) if len(self.return_types) > 1: return [ @@ -655,10 +655,14 @@ def from_response_json( transformation_features = [ feature.strip() for feature in json_decamelized["transformation_features"] ] - dropped_features = [ - dropped_feature.strip() - for dropped_feature in json_decamelized["dropped_features"] - ] + dropped_features = ( + [ + dropped_feature.strip() + for dropped_feature in json_decamelized["dropped_features"] + ] + if "dropped_features" in json_decamelized + else None + ) statistics_features = ( [ feature.strip() @@ -671,6 +675,16 @@ def from_response_json( # Reconstructing statistics arguments. 
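# Backward-compatibility note on the guarded `dropped_features` lookup in
# `from_response_json` above: older backend responses may omit the
# "droppedFeatures" field entirely, so the deserializer falls back to None
# instead of raising a KeyError. A small sketch of the pattern, assuming a
# decamelized response dict:
#
#     json_decamelized = {
#         "name": "plus_one",
#         "output_types": ["bigint"],
#         "transformation_features": ["col1"],
#     }  # no "dropped_features" key in older responses
#
#     dropped_features = (
#         [f.strip() for f in json_decamelized["dropped_features"]]
#         if "dropped_features" in json_decamelized
#         else None
#     )
#     assert dropped_features is None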
arg_list, _, _, _ = HopsworksUdf._parse_function_signature(function_source_code) + transformation_features = ( + arg_list if not transformation_features else transformation_features + ) + + if dropped_features: + dropped_features = [ + transformation_features[arg_list.index(dropped_feature)] + for dropped_feature in dropped_features + ] + if statistics_features: transformation_features = [ TransformationFeature( diff --git a/python/hsfs/training_dataset.py b/python/hsfs/training_dataset.py index 5f51044546..f19b95e037 100644 --- a/python/hsfs/training_dataset.py +++ b/python/hsfs/training_dataset.py @@ -29,7 +29,6 @@ statistics_engine, training_dataset_api, training_dataset_engine, - transformation_function_engine, vector_server, ) from hsfs.statistics_config import StatisticsConfig @@ -538,7 +537,6 @@ def __init__( from_query=None, querydto=None, label=None, - transformation_functions=None, train_split=None, time_split_size=None, extra_filter=None, @@ -580,7 +578,6 @@ def __init__( self._querydto = querydto self._feature_store_id = featurestore_id self._feature_store_name = featurestore_name - self._transformation_functions = transformation_functions self._training_dataset_api = training_dataset_api.TrainingDatasetApi( featurestore_id @@ -592,9 +589,6 @@ def __init__( featurestore_id, self.ENTITY_TYPE ) self._code_engine = code_engine.CodeEngine(featurestore_id, self.ENTITY_TYPE) - self._transformation_function_engine = ( - transformation_function_engine.TransformationFunctionEngine(featurestore_id) - ) self._vector_server = vector_server.VectorServer( featurestore_id, features=self._features ) @@ -1084,19 +1078,6 @@ def feature_store_name(self) -> str: """Name of the feature store in which the feature group is located.""" return self._feature_store_name - @property - def transformation_functions(self): - """Set transformation functions.""" - if self._id is not None and self._transformation_functions is None: - self._transformation_functions = ( - self._transformation_function_engine.get_td_transformation_fn(self) - ) - return self._transformation_functions - - @transformation_functions.setter - def transformation_functions(self, transformation_functions): - self._transformation_functions = transformation_functions - @property def serving_keys(self) -> Set[str]: """Set of primary key names that is used as keys in input dict object for `get_serving_vector` method.""" diff --git a/python/tests/core/test_feature_view_engine.py b/python/tests/core/test_feature_view_engine.py index f6a141fb20..f1c3f7ab3d 100644 --- a/python/tests/core/test_feature_view_engine.py +++ b/python/tests/core/test_feature_view_engine.py @@ -29,9 +29,7 @@ from hsfs.constructor.query import Query from hsfs.core import arrow_flight_client, feature_view_engine from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics -from hsfs.hopsworks_udf import udf from hsfs.storage_connector import BigQueryConnector, StorageConnector -from hsfs.transformation_function import TransformationFunction engine.init("python") @@ -349,9 +347,6 @@ def test_get_name(self, mocker): feature_store_id = 99 mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - mocker.patch( - "hsfs.core.feature_view_engine.FeatureViewEngine.get_attached_transformation_fn" - ) fv_engine = feature_view_engine.FeatureViewEngine( feature_store_id=feature_store_id @@ -387,9 +382,6 @@ def test_get_name_version(self, mocker): feature_store_id = 99 mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - 
mocker.patch( - "hsfs.core.feature_view_engine.FeatureViewEngine.get_attached_transformation_fn" - ) fv_engine = feature_view_engine.FeatureViewEngine( feature_store_id=feature_store_id @@ -555,74 +547,6 @@ def test_get_batch_query_string_pit_query(self, mocker): assert mock_fv_api.return_value.get_batch_query.call_count == 1 assert mock_qc_api.return_value.construct_query.call_count == 1 - def test_get_attached_transformation_fn(self, mocker): - # Arrange - feature_store_id = 99 - - mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - - fv_engine = feature_view_engine.FeatureViewEngine( - feature_store_id=feature_store_id - ) - - @udf(int) - def test2(col1): - return col1 + 1 - - tf = TransformationFunction( - featurestore_id=10, - hopsworks_udf=test2, - ) - - mock_fv_api.return_value.get_attached_transformation_fn.return_value = [tf] - - # Act - result = fv_engine.get_attached_transformation_fn(name="fv_name", version=1) - - # Assert - assert result == [tf] - assert mock_fv_api.return_value.get_attached_transformation_fn.call_count == 1 - - def test_get_attached_transformation_fn_multiple(self, mocker): - # Arrange - feature_store_id = 99 - - mock_fv_api = mocker.patch("hsfs.core.feature_view_api.FeatureViewApi") - - fv_engine = feature_view_engine.FeatureViewEngine( - feature_store_id=feature_store_id - ) - - @udf(int) - def test1(col1): - return col1 + 1 - - tf1 = TransformationFunction( - featurestore_id=10, - hopsworks_udf=test1, - ) - - @udf(int) - def test2(col1): - return col1 + 2 - - tf2 = TransformationFunction( - featurestore_id=10, - hopsworks_udf=test2, - ) - - mock_fv_api.return_value.get_attached_transformation_fn.return_value = [ - tf1, - tf2, - ] - - # Act - result = fv_engine.get_attached_transformation_fn(name="fv_name", version=1) - - # Assert - assert result == [tf1, tf2] - assert mock_fv_api.return_value.get_attached_transformation_fn.call_count == 1 - def test_create_training_dataset(self, mocker): # Arrange feature_store_id = 99 diff --git a/python/tests/core/test_training_dataset_engine.py b/python/tests/core/test_training_dataset_engine.py index fea3d43f88..c1a55ca00a 100644 --- a/python/tests/core/test_training_dataset_engine.py +++ b/python/tests/core/test_training_dataset_engine.py @@ -14,16 +14,13 @@ # limitations under the License. 
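# An aside on the dropped-feature remapping added to
# `HopsworksUdf.from_response_json` in this commit: dropped names are
# serialized as the UDF's *argument* names, so on deserialization each one is
# translated to the *feature* name bound at the same position. A standalone
# sketch of the idea, with hypothetical names:
#
#     arg_list = ["data1", "data2"]                # UDF parameter names
#     transformation_features = ["age", "salary"]  # features bound via __call__
#     dropped = ["data1"]
#
#     dropped_features = [
#         transformation_features[arg_list.index(d)] for d in dropped
#     ]
#     assert dropped_features == ["age"]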
# -import pytest from hsfs import ( feature_group, training_dataset, training_dataset_feature, - transformation_function, ) from hsfs.constructor import query from hsfs.core import training_dataset_engine -from hsfs.hopsworks_udf import udf class TestTrainingDatasetEngine: @@ -32,9 +29,6 @@ def test_save(self, mocker): feature_store_id = 99 mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine" - ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") @@ -76,9 +70,6 @@ def test_save_query(self, mocker, backend_fixtures): mocker.patch("hsfs.client.get_instance") mocker.patch("hsfs.engine.get_type") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine" - ) mocker.patch("hsfs.engine.get_instance") mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") @@ -107,70 +98,12 @@ def test_save_query(self, mocker, backend_fixtures): assert td._features[0].label is True assert td._features[1].label is True - def test_save_transformation_functions(self, mocker): - # Arrange - feature_store_id = 99 - - mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine" - ) - mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") - mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") - - @udf(int) - def plus_one(a): - return a + 1 - - tf = transformation_function.TransformationFunction( - hopsworks_udf=plus_one, featurestore_id=99 - ) - - td = training_dataset.TrainingDataset( - name="test", - version=1, - data_format="CSV", - featurestore_id=feature_store_id, - splits={}, - label=["f", "f_wrong"], - transformation_functions=tf, - ) - - td_engine = training_dataset_engine.TrainingDatasetEngine(feature_store_id) - - f = training_dataset_feature.TrainingDatasetFeature( - name="f", type="str", label=False - ) - f1 = training_dataset_feature.TrainingDatasetFeature( - name="f1", type="int", label=False - ) - - features = [f, f1] - - mock_engine_get_instance.return_value.parse_schema_training_dataset.return_value = features - - # Act - with pytest.raises(ValueError) as e_info: - td_engine.save(training_dataset=td, features=None, user_write_options=None) - - # Assert - assert mock_td_api.return_value.post.call_count == 0 - assert len(td._features) == 2 - assert td._features[0].label is True - assert td._features[1].label is False - assert ( - str(e_info.value) - == "Transformation functions can only be applied to training datasets generated from Query object" - ) - def test_save_splits(self, mocker): # Arrange feature_store_id = 99 mocker.patch("hsfs.client.get_instance") - mocker.patch( - "hsfs.core.transformation_function_engine.TransformationFunctionEngine" - ) + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") mock_td_api = mocker.patch("hsfs.core.training_dataset_api.TrainingDatasetApi") mock_warning = mocker.patch("warnings.warn") @@ -209,8 +142,7 @@ def test_save_splits(self, mocker): assert ( mock_warning.call_args[0][0] == "Training dataset splits were defined but no `train_split` (the name of the split that is going to be " - "used for training) was provided. Setting this property to `train`. The statistics of this " - "split will be used for transformation functions." + "used for training) was provided. Setting this property to `train`. 
" ) def test_insert(self, mocker): diff --git a/python/tests/core/test_transformation_function_engine.py b/python/tests/core/test_transformation_function_engine.py index 11cd593cc3..e56e820d87 100644 --- a/python/tests/core/test_transformation_function_engine.py +++ b/python/tests/core/test_transformation_function_engine.py @@ -24,7 +24,7 @@ transformation_function, ) from hsfs.core import transformation_function_engine -from hsfs.hopsworks_udf import udf +from hsfs.hopsworks_udf import UDFType, udf fg1 = feature_group.FeatureGroup( @@ -98,6 +98,7 @@ def testFunction(col1): tf = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction, + transformation_type=UDFType.MODEL_DEPENDENT, ) # Act @@ -125,6 +126,7 @@ def testFunction1(col1): tf1 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) @udf(float) @@ -134,6 +136,7 @@ def testFunction2(data2, statistics_data2): tf2 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction2, + transformation_type=UDFType.MODEL_DEPENDENT, ) transformations = [tf1, tf2] @@ -166,6 +169,7 @@ def testFunction1(col1): tf1 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) @udf(float) @@ -175,6 +179,7 @@ def testFunction2(data2, statistics_data2): tf2 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction2, + transformation_type=UDFType.MODEL_DEPENDENT, ) transformations = [tf1, tf2] @@ -207,6 +212,7 @@ def testFunction1(col1): tf1 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) # Act @@ -266,6 +272,7 @@ def testFunction1(col1): tf1 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) fg1 = feature_group.FeatureGroup( @@ -325,6 +332,7 @@ def testFunction1(col1): tf1 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) fg1 = feature_group.FeatureGroup( @@ -383,6 +391,7 @@ def testFunction1(col1): tf1 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) fg1 = feature_group.FeatureGroup( @@ -439,6 +448,7 @@ def testFunction1(col1, statistics=stats): tf1 = transformation_function.TransformationFunction( feature_store_id, hopsworks_udf=testFunction1, + transformation_type=UDFType.MODEL_DEPENDENT, ) fg1 = feature_group.FeatureGroup( diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index 07958686de..c1ac202fba 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -2214,6 +2214,7 @@ def test_get_training_data(self, mocker): mocker.patch( "hsfs.core.transformation_function_engine.TransformationFunctionEngine" ) + mock_feature_view = mocker.patch("hsfs.feature_view.FeatureView") python_engine = python.Engine() @@ -2230,7 +2231,7 @@ def test_get_training_data(self, mocker): # Act python_engine.get_training_data( training_dataset_obj=td, - feature_view_obj=None, + feature_view_obj=mock_feature_view, query_obj=mocker.Mock(), read_options=None, dataframe_type="default", @@ -2964,6 +2965,7 @@ def test_write_training_dataset(self, mocker): 
def test_write_training_dataset_query_td(self, mocker, backend_fixtures): # Arrange + mocker.patch("hsfs.client.get_instance") mocker.patch("hsfs.engine.get_type") mocker.patch("hsfs.core.training_dataset_job_conf.TrainingDatasetJobConf") mock_job = mocker.patch("hsfs.core.job.Job") @@ -3008,6 +3010,7 @@ def test_write_training_dataset_query_td(self, mocker, backend_fixtures): def test_write_training_dataset_query_fv(self, mocker, backend_fixtures): # Arrange + mocker.patch("hsfs.client.get_instance") mocker.patch("hsfs.engine.get_type") mocker.patch("hsfs.core.training_dataset_job_conf.TrainingDatasetJobConf") mock_job = mocker.patch("hsfs.core.job.Job") @@ -3280,7 +3283,7 @@ def test_apply_transformation_function_multiple_output(self, mocker): engine._engine_type = "python" python_engine = python.Engine() - @udf([int, int]) + @udf([int, int], drop=["col1"]) def plus_two(col1): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col1 + 2}) @@ -3324,7 +3327,7 @@ def test_apply_transformation_function_multiple_input_output(self, mocker): engine._engine_type = "python" python_engine = python.Engine() - @udf([int, int]) + @udf([int, int], drop=["col1", "col2"]) def plus_two(col1, col2): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) @@ -3354,6 +3357,7 @@ def plus_two(col1, col2): ) # Assert + print(result.columns) assert all(result.columns == ["plus_two_col1_col2_0", "plus_two_col1_col2_1"]) assert len(result) == 2 assert result["plus_two_col1_col2_0"][0] == 2 diff --git a/python/tests/engine/test_python_spark_transformation_functions.py b/python/tests/engine/test_python_spark_transformation_functions.py index cb1a0652b5..71bb48cd05 100644 --- a/python/tests/engine/test_python_spark_transformation_functions.py +++ b/python/tests/engine/test_python_spark_transformation_functions.py @@ -31,7 +31,7 @@ from hsfs.client.exceptions import FeatureStoreException from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics from hsfs.engine import python, spark -from hsfs.hopsworks_udf import HopsworksUdf, udf +from hsfs.hopsworks_udf import HopsworksUdf, UDFType, udf from pyspark.sql.types import ( BooleanType, DateType, @@ -161,6 +161,7 @@ def test_apply_builtin_minmax_from_backend(self, mocker): "transformationFeatures": [], "statisticsArgumentNames": ["feature"], "name": "min_max_scaler", + "droppedFeatures": ["feature"], } tf_fun = HopsworksUdf.from_response_json(udf_response) @@ -169,7 +170,9 @@ def test_apply_builtin_minmax_from_backend(self, mocker): transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun("col_0"), featurestore_id=99 + hopsworks_udf=tf_fun("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -230,7 +233,9 @@ def test_apply_builtin_minmax(self, mocker): transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=min_max_scaler("col_0"), featurestore_id=99 + hopsworks_udf=min_max_scaler("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -299,6 +304,7 @@ def test_apply_builtin_standard_scaler_from_backend(self, mocker): "transformationFeatures": [], "statisticsArgumentNames": ["feature"], "name": "standard_scaler", + "droppedFeatures": ["feature"], } tf_fun = HopsworksUdf.from_response_json(udf_response) @@ -307,7 +313,9 @@ def test_apply_builtin_standard_scaler_from_backend(self, mocker): transformation_functions = [ transformation_function.TransformationFunction( - 
hopsworks_udf=tf_fun("col_0"), featurestore_id=99 + hopsworks_udf=tf_fun("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] mean = statistics.mean([1, 2]) @@ -369,7 +377,9 @@ def test_apply_builtin_standard_scaler(self, mocker): transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=standard_scaler("col_0"), featurestore_id=99 + hopsworks_udf=standard_scaler("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -441,6 +451,7 @@ def test_apply_builtin_robust_scaler_from_backend(self, mocker): "transformationFeatures": [], "statisticsArgumentNames": ["feature"], "name": "robust_scaler", + "droppedFeatures": ["feature"], } tf_fun = HopsworksUdf.from_response_json(udf_response) @@ -449,7 +460,9 @@ def test_apply_builtin_robust_scaler_from_backend(self, mocker): transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun("col_0"), featurestore_id=99 + hopsworks_udf=tf_fun("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] percentiles = [1] * 100 @@ -513,7 +526,9 @@ def test_apply_builtin_robust_scaler(self, mocker): transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=robust_scaler("col_0"), featurestore_id=99 + hopsworks_udf=robust_scaler("col_0"), + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -571,7 +586,7 @@ def test_apply_plus_one_int(self, mocker): ) # Arrange - @udf(int) + @udf(int, drop=["col_0"]) def tf_fun(col_0): return col_0 + 1 @@ -579,7 +594,9 @@ def tf_fun(col_0): transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -629,14 +646,16 @@ def test_apply_plus_one_str(self, mocker): ) # Arrange - @udf(str) + @udf(str, drop="col_0") def tf_fun(col_0): return col_0 + "1" td = self._create_training_dataset() transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -686,14 +705,16 @@ def test_apply_plus_one_double(self, mocker): spark_df = spark_engine._spark_session.createDataFrame(df, schema=schema) # Arrange - @udf(float) + @udf(float, drop="col_0") def tf_fun(col_0): return col_0 + 1.0 td = self._create_training_dataset() transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -758,7 +779,7 @@ def test_apply_plus_one_datetime_no_tz(self, mocker): ) # Arrange - @udf(datetime.datetime) + @udf(datetime.datetime, drop="col_0") def tf_fun(col_0): import datetime @@ -767,7 +788,9 @@ def tf_fun(col_0): td = self._create_training_dataset() transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -833,7 +856,7 @@ def test_apply_plus_one_datetime_tz_utc(self, mocker): ) # Arrange - @udf(datetime.datetime) + @udf(datetime.datetime, drop="col_0") def tf_fun(col_0) -> datetime.datetime: import datetime @@ -844,7 +867,9 @@ def tf_fun(col_0) -> datetime.datetime: td = self._create_training_dataset() 
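# The updates throughout this test module exercise the new `drop` parameter
# of the `@udf` decorator: it accepts a single feature name or a list of
# names and marks those inputs for removal from the output once the
# transformation has consumed them. A hedged usage sketch (column names are
# illustrative):
#
#     @udf(float, drop="col_0")
#     def tf_fun(col_0):
#         return col_0 + 1.0
#
#     @udf([int, int], drop=["col1", "col2"])
#     def plus_two(col1, col2):
#         return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2})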
transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -911,7 +936,7 @@ def test_apply_plus_one_datetime_tz_pst(self, mocker): ) # Arrange - @udf(datetime.datetime) + @udf(datetime.datetime, drop="col_0") def tf_fun(col_0) -> datetime.datetime: import datetime @@ -923,7 +948,9 @@ def tf_fun(col_0) -> datetime.datetime: td = self._create_training_dataset() transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -989,7 +1016,7 @@ def test_apply_plus_one_datetime_ts_none(self, mocker): ) # Arrange - @udf(datetime.datetime) + @udf(datetime.datetime, drop=["col_0"]) def tf_fun(col_0) -> datetime.datetime: import datetime @@ -1003,7 +1030,9 @@ def tf_fun(col_0) -> datetime.datetime: td = self._create_training_dataset() transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -1063,7 +1092,7 @@ def test_apply_plus_one_date(self, mocker): ) # Arrange - @udf(datetime.date) + @udf(datetime.date, drop=["col_0"]) def tf_fun(col_0): import datetime @@ -1072,7 +1101,9 @@ def tf_fun(col_0): td = self._create_training_dataset() transformation_functions = [ transformation_function.TransformationFunction( - hopsworks_udf=tf_fun, featurestore_id=99 + hopsworks_udf=tf_fun, + featurestore_id=99, + transformation_type=UDFType.MODEL_DEPENDENT, ) ] @@ -1089,7 +1120,7 @@ def test_apply_plus_one_invalid_type(self, mocker): # Arrange with pytest.raises(FeatureStoreException) as e_info: - @udf(list) + @udf(list, drop="a") def tf_fun(a): return a + 1 diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 7eabd38d07..0de616084a 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -34,7 +34,7 @@ from hsfs.constructor import hudi_feature_group_alias, query from hsfs.core import training_dataset_engine from hsfs.engine import spark -from hsfs.hopsworks_udf import udf +from hsfs.hopsworks_udf import UDFType, udf from hsfs.training_dataset_feature import TrainingDatasetFeature from pyspark.sql import DataFrame from pyspark.sql.types import ( @@ -2675,6 +2675,7 @@ def plus_one(col1): tf = transformation_function.TransformationFunction( featurestore_id=99, hopsworks_udf=plus_one, + transformation_type=UDFType.MODEL_DEPENDENT, ) f = training_dataset_feature.TrainingDatasetFeature( @@ -2724,6 +2725,7 @@ def plus_one(col1): tf = transformation_function.TransformationFunction( featurestore_id=99, hopsworks_udf=plus_one, + transformation_type=UDFType.MODEL_DEPENDENT, ) transformation_fn_dict = dict() @@ -4328,13 +4330,12 @@ def test_apply_transformation_function_single_output(self, mocker): engine._engine_type = "spark" spark_engine = spark.Engine() - @udf(int) + @udf(int, drop=["col1"]) def plus_one(col1): return col1 + 1 tf = transformation_function.TransformationFunction( - 99, - hopsworks_udf=plus_one, + 99, hopsworks_udf=plus_one, transformation_type=UDFType.MODEL_DEPENDENT ) f = feature.Feature(name="col_0", type=IntegerType(), index=0) @@ -4388,13 +4389,12 @@ def test_apply_transformation_function_multiple_output(self, mocker): engine._engine_type = "spark" spark_engine = 
spark.Engine() - @udf([int, int]) + @udf([int, int], drop=["col1"]) def plus_two(col1): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col1 + 2}) tf = transformation_function.TransformationFunction( - 99, - hopsworks_udf=plus_two, + 99, hopsworks_udf=plus_two, transformation_type=UDFType.MODEL_DEPENDENT ) f = feature.Feature(name="col_0", type=IntegerType(), index=0) @@ -4449,13 +4449,12 @@ def test_apply_transformation_function_multiple_input_output(self, mocker): engine._engine_type = "spark" spark_engine = spark.Engine() - @udf([int, int]) + @udf([int, int], drop=["col1", "col2"]) def test(col1, col2): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) tf = transformation_function.TransformationFunction( - 99, - hopsworks_udf=test, + 99, hopsworks_udf=test, transformation_type=UDFType.MODEL_DEPENDENT ) f = feature.Feature(name="col_0", type=IntegerType(), index=0) diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index 5e229955bd..1ad25dea36 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -934,7 +934,8 @@ "name": "add_mean_fs", "outputTypes":["double"], "transformationFeatures":["data"], - "statisticsArgumentNames":["data1"] + "statisticsArgumentNames":["data1"], + "dropped_features":["data1"] } }, { @@ -945,7 +946,8 @@ "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", "outputTypes":["double"], - "transformationFeatures":["col1"] + "transformationFeatures":["col1"], + "dropped_features":["data1"] } } ], diff --git a/python/tests/fixtures/training_dataset_fixtures.json b/python/tests/fixtures/training_dataset_fixtures.json index ea3f356e68..6db5d08325 100644 --- a/python/tests/fixtures/training_dataset_fixtures.json +++ b/python/tests/fixtures/training_dataset_fixtures.json @@ -122,21 +122,12 @@ "items": [ { "featurestore_id": 11, - "transformation_fn": "test_transformation_fn", "version": 1, "name": "test_name", - "source_code_content": "test_source_code_content", - "builtin_source_code": "test_builtin_source_code", - "output_type": "test_output_type", - "id": 43, - "type": "transformationFunctionTDO", - "items": [], - "count": 0, "href": "test_href" } ] - }, - "transformation_function": "test_transformation_function" + } } ], "statistics_config": { @@ -153,7 +144,6 @@ "from_query": "test_from_query", "querydto": "test_querydto", "label": "test_label", - "transformation_functions": "test_transformation_functions", "train_split": "test_train_split", "time_split_size": "test_time_split_size", "type": "trainingDatasetDTO" diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 6fa5d762b7..036eb2fac7 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ b/python/tests/fixtures/transformation_function_fixtures.json @@ -8,7 +8,8 @@ "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", "outputTypes":["double"], - "transformationFeatures":["col1"] + "transformationFeatures":["col1"], + "dropped_features":["data1"] } } }, @@ -22,7 +23,8 @@ "name": "add_mean_fs", "outputTypes":["double"], "transformationFeatures":["data"], - "statisticsArgumentNames":["data1"] + "statisticsArgumentNames":["data1"], + "dropped_features":["data1"] } } }, @@ -36,7 +38,8 @@ "name": "test_func", "outputTypes":["string"], 
"transformationFeatures":["feature1", "feature2", "feature3"], - "statisticsArgumentNames":["data1", "data2"] + "statisticsArgumentNames":["data1", "data2"], + "dropped_features":["data1", "data2", "data3"] } } }, @@ -50,7 +53,8 @@ "name": "test_func", "outputTypes":["string", "double"], "transformationFeatures":["feature1", "feature2", "feature3"], - "statisticsArgumentNames":["data1", "data2"] + "statisticsArgumentNames":["data1", "data2"], + "dropped_features":["data1", "data2", "data3"] } } }, @@ -67,7 +71,8 @@ "name": "add_mean_fs", "outputTypes":["double"], "transformationFeatures":["data"], - "statisticsArgumentNames":["data1"] + "statisticsArgumentNames":["data1"], + "dropped_features":["data1"] } }, { @@ -78,7 +83,8 @@ "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", "name": "add_one_fs", "outputTypes":["double"], - "transformationFeatures":["col1"] + "transformationFeatures":["col1"], + "dropped_features":["data1"] } } ] @@ -97,7 +103,8 @@ "name": "add_mean_fs", "outputTypes":["double"], "transformationFeatures":["data"], - "statisticsArgumentNames":["data1"] + "statisticsArgumentNames":["data1"], + "dropped_features":["data1"] } } ] diff --git a/python/tests/test_builtin_transformation.py b/python/tests/test_builtin_transformation.py new file mode 100644 index 0000000000..4a8a01af9c --- /dev/null +++ b/python/tests/test_builtin_transformation.py @@ -0,0 +1,81 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import hsfs.engine as engine +import pandas as pd +from hsfs.builtin_transformations import ( + min_max_scaler, +) +from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics +from hsfs.engine import python +from hsfs.hopsworks_udf import UDFType + + +class TestBuiltinTransformations: + @staticmethod + def validate_transformations_python( + transformed_outputs, expected_output, expected_col_names + ): + if isinstance(transformed_outputs, pd.Series): + assert transformed_outputs.name == expected_col_names + else: + assert all(transformed_outputs.columns == expected_col_names) + assert all(transformed_outputs.values == expected_output.values) + + def test_min_max_scaler(self): + test_dataframe = pd.DataFrame( + { + "col1": [1, 2, 3, 4], + "col2": [1.2, 3.4, 5.6, 9.1], + } + ) + statistics_df = test_dataframe.describe().to_dict() + + # Test case 1 : Integer column + min_max_scaler_col1 = min_max_scaler("col1") + min_max_scaler_col1.udf_type = UDFType.MODEL_DEPENDENT + + min_max_scaler_col1.transformation_statistics = [ + FeatureDescriptiveStatistics( + feature_name="col1", + min=statistics_df["col1"]["min"], + max=statistics_df["col1"]["max"], + ) + ] + + expected_df = (test_dataframe["col1"] - test_dataframe["col1"].min()) / ( + test_dataframe["col1"].max() - test_dataframe["col1"].min() + ) + + # Test with python engine + engine.set_instance(engine=python.Engine(), engine_type="python") + + transformed_df = min_max_scaler_col1.get_udf()(test_dataframe["col1"]) + TestBuiltinTransformations.validate_transformations_python( + transformed_outputs=transformed_df, + expected_output=expected_df, + expected_col_names="min_max_scaler_col1_", + ) + + # Test with spark engine + engine.set_instance(engine=python.Engine(), engine_type="python") + + transformed_df = min_max_scaler_col1.get_udf()(test_dataframe["col1"]) + TestBuiltinTransformations.validate_transformations_python( + transformed_outputs=transformed_df, + expected_output=expected_df, + expected_col_names="min_max_scaler_col1_", + ) diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py index 6595207ed3..fe9531b751 100644 --- a/python/tests/test_hopswork_udf.py +++ b/python/tests/test_hopswork_udf.py @@ -342,7 +342,7 @@ def test_func(col1, col2, col3): return col1 + 1 test_func.udf_type = UDFType.MODEL_DEPENDENT - assert test_func._get_output_column_names() == ["test_func_col1-col2-col3_"] + assert test_func._get_output_column_names() == ["test_func_col1_col2_col3_"] test_func.udf_type = UDFType.ON_DEMAND assert test_func._get_output_column_names() == ["test_func"] @@ -369,9 +369,9 @@ def test_func(col1, col2, col3): test_func.udf_type = UDFType.MODEL_DEPENDENT assert test_func._get_output_column_names() == [ - "test_func_col1-col2-col3_0", - "test_func_col1-col2-col3_1", - "test_func_col1-col2-col3_2", + "test_func_col1_col2_col3_0", + "test_func_col1_col2_col3_1", + "test_func_col1_col2_col3_2", ] def test_create_pandas_udf_return_schema_from_list_one_output_type(self): @@ -446,7 +446,7 @@ def test_func(col1, col2): test_dataframe["column1"], test_dataframe["column2"] ) - assert all(result.columns == ["test_func_col1-col2_0", "test_func_col1-col2_1"]) + assert all(result.columns == ["test_func_col1_col2_0", "test_func_col1_col2_1"]) assert result.values.tolist() == [[2, 12], [3, 22], [4, 32], [5, 42]] def test_HopsworkUDf_call_one_argument(self): diff --git a/python/tests/test_training_dataset.py b/python/tests/test_training_dataset.py index 416f3cb860..be771406b2 100644 --- 
a/python/tests/test_training_dataset.py +++ b/python/tests/test_training_dataset.py @@ -57,7 +57,6 @@ def test_from_response_json(self, mocker, backend_fixtures): assert td._from_query == "test_from_query" assert td._querydto == "test_querydto" assert td.feature_store_id == 22 - assert td.transformation_functions == "test_transformation_functions" assert td.train_split == "test_train_split" assert td.training_dataset_type == "HOPSFS_TRAINING_DATASET" assert isinstance(td.storage_connector, storage_connector.JdbcConnector) @@ -102,7 +101,6 @@ def test_from_response_json_basic_info(self, mocker, backend_fixtures): assert td._from_query is None assert td._querydto is None assert td.feature_store_id == 22 - assert td.transformation_functions is None assert td.train_split is None assert td.training_dataset_type is None assert isinstance(td.storage_connector, storage_connector.JdbcConnector) diff --git a/python/tests/test_transformation_function.py b/python/tests/test_transformation_function.py index bfc2f125d0..0b83832755 100644 --- a/python/tests/test_transformation_function.py +++ b/python/tests/test_transformation_function.py @@ -17,7 +17,7 @@ import pytest from hsfs.client.exceptions import FeatureStoreException -from hsfs.hopsworks_udf import udf +from hsfs.hopsworks_udf import UDFType, udf from hsfs.transformation_function import TransformationFunction @@ -27,7 +27,7 @@ def test_from_response_json_one_argument_no_statistics(self, backend_fixtures): json = backend_fixtures["transformation_function"][ "get_one_argument_no_statistics_function" ]["response"] - + json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act tf = TransformationFunction.from_response_json(json) @@ -51,6 +51,7 @@ def test_from_response_json_one_argument_with_statistics(self, backend_fixtures) json = backend_fixtures["transformation_function"][ "get_one_argument_with_statistics_function" ]["response"] + json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act tf = TransformationFunction.from_response_json(json) @@ -77,6 +78,7 @@ def test_from_response_json_multiple_argument_with_statistics( json = backend_fixtures["transformation_function"][ "get_multiple_argument_with_statistics_function" ]["response"] + json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act tf = TransformationFunction.from_response_json(json) @@ -105,6 +107,7 @@ def test_from_response_json_multiple_return_type_functions(self, backend_fixture json = backend_fixtures["transformation_function"][ "get_multiple_return_type_functions" ]["response"] + json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act tf = TransformationFunction.from_response_json(json) @@ -141,6 +144,8 @@ def test_from_response_json_list_empty(self, backend_fixtures): def test_from_response_json_list(self, backend_fixtures): # Arrange json = backend_fixtures["transformation_function"]["get_list"]["response"] + for response_json in json["items"]: + response_json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act tf_list = TransformationFunction.from_response_json(json) @@ -182,6 +187,8 @@ def test_from_response_json_list_one_argument(self, backend_fixtures): json = backend_fixtures["transformation_function"]["get_list_one_argument"][ "response" ] + for response_json in json["items"]: + response_json["transformation_type"] = UDFType.MODEL_DEPENDENT # Act tf = TransformationFunction.from_response_json(json) @@ -210,6 +217,7 @@ def test(col1): TransformationFunction( featurestore_id=10, hopsworks_udf=test, + transformation_type=UDFType.MODEL_DEPENDENT, ) assert ( @@ 
-225,6 +233,7 @@ def test2(col1): tf = TransformationFunction( featurestore_id=10, hopsworks_udf=test2, + transformation_type=UDFType.MODEL_DEPENDENT, ) assert tf.hopsworks_udf == test2 From 202358db28a26cc9889d8f7e982187b9504e0914 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Thu, 4 Jul 2024 11:26:08 +0200 Subject: [PATCH 55/58] adding unit tests for on-demand transformation functions --- python/hsfs/hopsworks_udf.py | 3 +- python/tests/engine/test_python.py | 217 +++++++++- python/tests/engine/test_spark.py | 373 +++++++++++++++++- python/tests/fixtures/feature_fixtures.json | 16 + .../fixtures/feature_group_fixtures.json | 122 ++++++ .../training_dataset_feature_fixtures.json | 78 ++++ python/tests/test_builtin_transformation.py | 81 ---- python/tests/test_feature.py | 20 + python/tests/test_feature_group.py | 59 ++- python/tests/test_feature_view.py | 11 +- python/tests/test_hopswork_udf.py | 283 +++++++++++++ python/tests/test_training_dataset_feature.py | 35 ++ 12 files changed, 1209 insertions(+), 89 deletions(-) delete mode 100644 python/tests/test_builtin_transformation.py diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index a17e432009..10e8135293 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -206,7 +206,8 @@ def _validate_and_convert_drop_features( missing_drop_features = [] for dropped_feature in dropped_features: - if feature_name_prefix + dropped_feature not in transformation_feature: + dropped_feature = feature_name_prefix + dropped_feature + if dropped_feature not in transformation_feature: missing_drop_features.append(dropped_feature) if missing_drop_features: diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index c1ac202fba..cbbe190c4d 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -36,8 +36,9 @@ from hsfs.constructor.hudi_feature_group_alias import HudiFeatureGroupAlias from hsfs.core import inode, job from hsfs.engine import python -from hsfs.hopsworks_udf import udf +from hsfs.hopsworks_udf import UDFType, udf from hsfs.training_dataset_feature import TrainingDatasetFeature +from hsfs.transformation_function import TransformationFunction from polars.testing import assert_frame_equal as polars_assert_frame_equal @@ -1460,7 +1461,6 @@ def test_parse_schema_feature_group_polars(self, mocker): result = python_engine.parse_schema_feature_group( dataframe=df, time_travel_format=None ) - print(result) # Assert assert len(result) == 3 @@ -1468,6 +1468,71 @@ def test_parse_schema_feature_group_polars(self, mocker): assert result[1].name == "col2" assert result[2].name == "date" + def test_parse_schema_feature_group_transformation_functions(self, mocker): + # Arrange + mocker.patch("hsfs.engine.python.Engine._convert_pandas_dtype_to_offline_type") + + python_engine = python.Engine() + + d = {"Col1": [1, 2], "col2": [3, 4]} + df = pd.DataFrame(data=d) + + @udf(int) + def test(feature): + return feature + 1 + + transformation_function = TransformationFunction( + featurestore_id=10, + hopsworks_udf=test, + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + + # Act + result = python_engine.parse_schema_feature_group( + dataframe=df, + time_travel_format=None, + transformation_functions=[transformation_function], + ) + + # Assert + assert len(result) == 3 + assert result[0].name == "col1" + assert result[1].name == "col2" + assert result[2].name == "test" + + def test_parse_schema_feature_group_transformation_functions_drop(self, mocker): + 
# Arrange + mocker.patch("hsfs.engine.python.Engine._convert_pandas_dtype_to_offline_type") + + python_engine = python.Engine() + + d = {"Col1": [1, 2], "col2": [3, 4]} + df = pd.DataFrame(data=d) + + @udf(int, drop="feature") + def test(feature): + return feature + 1 + + transformation_function = TransformationFunction( + featurestore_id=10, + hopsworks_udf=test("col2"), + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + + # Act + result = python_engine.parse_schema_feature_group( + dataframe=df, + time_travel_format=None, + transformation_functions=[transformation_function], + ) + + # Assert + assert len(result) == 2 + assert result[0].name == "col1" + assert result[1].name == "test" + def test_parse_schema_training_dataset(self): # Arrange python_engine = python.Engine() @@ -2136,6 +2201,52 @@ def test_save_dataframe(self, mocker): assert mock_python_engine_write_dataframe_kafka.call_count == 0 assert mock_python_engine_legacy_save_dataframe.call_count == 1 + def test_save_dataframe_transformation_functions(self, mocker): + # Arrange + mock_python_engine_write_dataframe_kafka = mocker.patch( + "hsfs.engine.python.Engine._write_dataframe_kafka" + ) + mock_python_engine_legacy_save_dataframe = mocker.patch( + "hsfs.engine.python.Engine.legacy_save_dataframe" + ) + mock_python_engine_apply_transformations = mocker.patch( + "hsfs.engine.python.Engine._apply_transformation_function" + ) + + python_engine = python.Engine() + + @udf(int) + def test(feature): + return feature + 1 + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + id=10, + stream=False, + transformation_functions=[test], + ) + + # Act + python_engine.save_dataframe( + feature_group=fg, + dataframe=None, + operation=None, + online_enabled=None, + storage=None, + offline_write_options=None, + online_write_options=None, + validation_id=None, + ) + + # Assert + assert mock_python_engine_write_dataframe_kafka.call_count == 0 + assert mock_python_engine_legacy_save_dataframe.call_count == 1 + assert mock_python_engine_apply_transformations.call_count == 1 + def test_save_dataframe_stream(self, mocker): # Arrange mock_python_engine_write_dataframe_kafka = mocker.patch( @@ -3327,6 +3438,57 @@ def test_apply_transformation_function_multiple_input_output(self, mocker): engine._engine_type = "python" python_engine = python.Engine() + @udf([int, int]) + def plus_two(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[plus_two], + ) + + df = pd.DataFrame(data={"col1": [1, 2], "col2": [10, 11]}) + + # Act + result = python_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=df + ) + + # Assert + assert all( + result.columns + == ["col1", "col2", "plus_two_col1_col2_0", "plus_two_col1_col2_1"] + ) + assert len(result) == 2 + assert result["col1"][0] == 1 + assert result["col1"][1] == 2 + assert result["col2"][0] == 10 + assert result["col2"][1] == 11 + assert result["plus_two_col1_col2_0"][0] == 2 + assert result["plus_two_col1_col2_0"][1] == 3 + assert result["plus_two_col1_col2_1"][0] == 12 + assert result["plus_two_col1_col2_1"][1] == 13 
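# Taken together, the test above and the "drop_all" / "drop_some" variants
# that follow pin down the drop semantics end to end: dropping is applied
# after the UDF output is joined back onto the input dataframe, so only the
# listed inputs disappear while untouched inputs survive alongside the
# transformed columns. Expected output columns, for illustration:
#
#     no drop               -> ["col1", "col2",
#                               "plus_two_col1_col2_0", "plus_two_col1_col2_1"]
#     drop=["col1", "col2"] -> ["plus_two_col1_col2_0", "plus_two_col1_col2_1"]
#     drop=["col1"]         -> ["col2",
#                               "plus_two_col1_col2_0", "plus_two_col1_col2_1"]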
+ + def test_apply_transformation_function_multiple_input_output_drop_all(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + + engine._engine_type = "python" + python_engine = python.Engine() + @udf([int, int], drop=["col1", "col2"]) def plus_two(col1, col2): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) @@ -3357,7 +3519,6 @@ def plus_two(col1, col2): ) # Assert - print(result.columns) assert all(result.columns == ["plus_two_col1_col2_0", "plus_two_col1_col2_1"]) assert len(result) == 2 assert result["plus_two_col1_col2_0"][0] == 2 @@ -3365,6 +3526,56 @@ def plus_two(col1, col2): assert result["plus_two_col1_col2_1"][0] == 12 assert result["plus_two_col1_col2_1"][1] == 13 + def test_apply_transformation_function_multiple_input_output_drop_some( + self, mocker + ): + # Arrange + mocker.patch("hsfs.client.get_instance") + + engine._engine_type = "python" + python_engine = python.Engine() + + @udf([int, int], drop=["col1"]) + def plus_two(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[plus_two], + ) + + df = pd.DataFrame(data={"col1": [1, 2], "col2": [10, 11]}) + + # Act + result = python_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=df + ) + + # Assert + assert all( + result.columns == ["col2", "plus_two_col1_col2_0", "plus_two_col1_col2_1"] + ) + assert len(result) == 2 + assert result["col2"][0] == 10 + assert result["col2"][1] == 11 + assert result["plus_two_col1_col2_0"][0] == 2 + assert result["plus_two_col1_col2_0"][1] == 3 + assert result["plus_two_col1_col2_1"][0] == 12 + assert result["plus_two_col1_col2_1"][1] == 13 + def test_apply_transformation_function_polars(self, mocker): # Arrange mocker.patch("hsfs.client.get_instance") diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index 0de616084a..5e31959ef4 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -469,7 +469,6 @@ def test_convert_to_default_dataframe_pyspark_rdd(self): # Assert result_df = result.toPandas() - print(result_df) assert list(result_df) == list(expected) for column in list(result_df): assert result_df[column].equals(result_df[column]) @@ -644,6 +643,51 @@ def test_save_dataframe(self, mocker): assert mock_spark_engine_save_online_dataframe.call_count == 0 assert mock_spark_engine_save_offline_dataframe.call_count == 1 + def test_save_dataframe_transformations(self, mocker): + # Arrange + mock_spark_engine_save_online_dataframe = mocker.patch( + "hsfs.engine.spark.Engine._save_online_dataframe" + ) + mock_spark_engine_save_offline_dataframe = mocker.patch( + "hsfs.engine.spark.Engine._save_offline_dataframe" + ) + mock_spark_engine_apply_transformations = mocker.patch( + "hsfs.engine.spark.Engine._apply_transformation_function" + ) + + spark_engine = spark.Engine() + + @udf(int) + def test(feature): + return feature + 1 + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + id=10, + transformation_functions=[test], + ) + + # Act + spark_engine.save_dataframe( + feature_group=fg, + dataframe=None, + 
operation=None, + online_enabled=None, + storage=None, + offline_write_options=None, + online_write_options=None, + validation_id=None, + ) + + # Assert + assert mock_spark_engine_save_online_dataframe.call_count == 0 + assert mock_spark_engine_save_offline_dataframe.call_count == 1 + assert mock_spark_engine_apply_transformations.call_count == 1 + def test_save_dataframe_storage_offline(self, mocker): # Arrange mock_spark_engine_save_online_dataframe = mocker.patch( @@ -979,6 +1023,135 @@ def test_save_stream_dataframe(self, mocker, backend_fixtures): == 0 ) + def test_save_stream_dataframe_transformations(self, mocker, backend_fixtures): + # Arrange + mock_client_get_instance = mocker.patch("hsfs.client.get_instance") + mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") + mock_spark_engine_online_fg_to_avro = mocker.patch( + "hsfs.engine.spark.Engine._online_fg_to_avro" + ) + + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") + mock_engine_get_instance.return_value.add_file.return_value = ( + "result_from_add_file" + ) + + mock_storage_connector_api = mocker.patch( + "hsfs.core.storage_connector_api.StorageConnectorApi" + ) + + mock_spark_engine_apply_transformations = mocker.patch( + "hsfs.engine.spark.Engine._apply_transformation_function" + ) + + json = backend_fixtures["storage_connector"]["get_kafka_external"]["response"] + sc = storage_connector.StorageConnector.from_response_json(json) + mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc + + spark_engine = spark.Engine() + + @udf(int) + def test(feature): + return feature + 1 + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + id=10, + online_topic_name="test_online_topic_name", + transformation_functions=[test], + ) + fg.feature_store = mocker.Mock() + project_id = 1 + fg.feature_store.project_id = project_id + + mock_client_get_instance.return_value._project_name = "test_project_name" + + # Act + spark_engine.save_stream_dataframe( + feature_group=fg, + dataframe=None, + query_name=None, + output_mode="test_mode", + await_termination=None, + timeout=None, + checkpoint_dir=None, + write_options={"test_name": "test_value"}, + ) + + # Assert + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] + == "headers" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ + 0 + ][0] + == "test_mode" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ + 0 + ][0] + == "kafka" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + 0 + ][0] + == "checkpointLocation" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + 0 + ][1] + == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ + 1 + ] + == { + "kafka.bootstrap.servers": "test_bootstrap_servers", + "kafka.security.protocol": "test_security_protocol", + "kafka.ssl.endpoint.identification.algorithm": 
"test_ssl_endpoint_identification_algorithm", + "kafka.ssl.key.password": "test_ssl_key_password", + "kafka.ssl.keystore.location": "result_from_add_file", + "kafka.ssl.keystore.password": "test_ssl_keystore_password", + "kafka.ssl.truststore.location": "result_from_add_file", + "kafka.ssl.truststore.password": "test_ssl_truststore_password", + "kafka.test_option_name": "test_option_value", + "test_name": "test_value", + } + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + 0 + ][0] + == "topic" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + 0 + ][1] + == "test_online_topic_name" + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ + 0 + ][0] + == self._get_spark_query_name(project_id, fg) + ) + assert ( + mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count + == 0 + ) + assert mock_spark_engine_apply_transformations.call_count == 1 + def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): # Arrange mock_client_get_instance = mocker.patch("hsfs.client.get_instance") @@ -3711,6 +3884,81 @@ def test_parse_schema_feature_group(self, mocker): assert mock_spark_engine_convert_spark_type.call_count == 2 assert mock_spark_engine_convert_spark_type.call_args[0][1] is False + def test_parse_schema_feature_group_transformations(self, mocker): + # Arrange + mock_spark_engine_convert_spark_type = mocker.patch( + "hsfs.engine.spark.Engine.convert_spark_type_to_offline_type" + ) + + spark_engine = spark.Engine() + + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"]} + df = pd.DataFrame(data=d) + + @udf(int) + def test(feature): + return feature + 1 + + tf_function = transformation_function.TransformationFunction( + featurestore_id=10, + hopsworks_udf=test, + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + + spark_df = spark_engine._spark_session.createDataFrame(df) + + # Act + result = spark_engine.parse_schema_feature_group( + dataframe=spark_df, + time_travel_format=None, + transformation_functions=[tf_function], + ) + + # Assert + assert result[0].name == "col_0" + assert result[1].name == "col_1" + assert result[2].name == "test" + assert mock_spark_engine_convert_spark_type.call_count == 2 + assert mock_spark_engine_convert_spark_type.call_args[0][1] is False + + def test_parse_schema_feature_group_transformations_dropped(self, mocker): + # Arrange + mock_spark_engine_convert_spark_type = mocker.patch( + "hsfs.engine.spark.Engine.convert_spark_type_to_offline_type" + ) + + spark_engine = spark.Engine() + + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"]} + df = pd.DataFrame(data=d) + + @udf(int, drop="feature") + def test(feature): + return feature + 1 + + tf_function = transformation_function.TransformationFunction( + featurestore_id=10, + hopsworks_udf=test("col_0"), + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + + spark_df = 
spark_engine._spark_session.createDataFrame(df) + + # Act + result = spark_engine.parse_schema_feature_group( + dataframe=spark_df, + time_travel_format=None, + transformation_functions=[tf_function], + ) + + # Assert + assert result[0].name == "col_1" + assert result[1].name == "test" + assert mock_spark_engine_convert_spark_type.call_count == 2 + assert mock_spark_engine_convert_spark_type.call_args[0][1] is False + def test_parse_schema_feature_group_hudi(self, mocker): # Arrange mock_spark_engine_convert_spark_type = mocker.patch( @@ -4449,6 +4697,129 @@ def test_apply_transformation_function_multiple_input_output(self, mocker): engine._engine_type = "spark" spark_engine = spark.Engine() + @udf([int, int]) + def test(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + tf = transformation_function.TransformationFunction( + 99, hopsworks_udf=test, transformation_type=UDFType.MODEL_DEPENDENT + ) + + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=IntegerType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=features, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0", "col_2")], + ) + + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [10, 11]} + df = pd.DataFrame(data=d) + + spark_df = spark_engine._spark_session.createDataFrame(df) + + expected_df = pd.DataFrame( + data={ + "col_0": [1, 2], + "col_1": ["test_1", "test_2"], + "col_2": [10, 11], + "test_col_0_col_2_0": [2, 3], + "test_col_0_col_2_1": [12, 13], + } + ) + + expected_spark_df = spark_engine._spark_session.createDataFrame(expected_df) + + # Act + result = spark_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, + dataset=spark_df, + ) + # Assert + assert result.schema == expected_spark_df.schema + assert result.collect() == expected_spark_df.collect() + + def test_apply_transformation_function_multiple_input_output_drop_some( + self, mocker + ): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "spark" + spark_engine = spark.Engine() + + @udf([int, int], drop=["col1"]) + def test(col1, col2): + return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) + + tf = transformation_function.TransformationFunction( + 99, hopsworks_udf=test, transformation_type=UDFType.MODEL_DEPENDENT + ) + + f = feature.Feature(name="col_0", type=IntegerType(), index=0) + f1 = feature.Feature(name="col_1", type=StringType(), index=1) + f2 = feature.Feature(name="col_2", type=IntegerType(), index=1) + features = [f, f1, f2] + fg1 = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=features, + id=11, + stream=False, + ) + fv = feature_view.FeatureView( + name="test", + featurestore_id=99, + query=fg1.select_all(), + transformation_functions=[tf("col_0", "col_2")], + ) + + d = {"col_0": [1, 2], "col_1": ["test_1", "test_2"], "col_2": [10, 11]} + df = pd.DataFrame(data=d) + + spark_df = spark_engine._spark_session.createDataFrame(df) + + expected_df = pd.DataFrame( + data={ + "col_1": ["test_1", "test_2"], + "col_2": [10, 11], + "test_col_0_col_2_0": [2, 3], + "test_col_0_col_2_1": [12, 13], + } + ) + 
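+        # The expected frame above encodes the semantics under test: binding
+        # drop=["col1"] to "col_0" via tf("col_0", "col_2") removes col_0,
+        # keeps the untouched columns, and appends the two generated outputs
+        # test_col_0_col_2_0 and test_col_0_col_2_1.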
+ expected_spark_df = spark_engine._spark_session.createDataFrame(expected_df) + + # Act + result = spark_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, + dataset=spark_df, + ) + # Assert + assert result.schema == expected_spark_df.schema + assert result.collect() == expected_spark_df.collect() + + def test_apply_transformation_function_multiple_input_output_drop_all(self, mocker): + # Arrange + mocker.patch("hsfs.client.get_instance") + engine._engine_type = "spark" + spark_engine = spark.Engine() + @udf([int, int], drop=["col1", "col2"]) def test(col1, col2): return pd.DataFrame({"new_col1": col1 + 1, "new_col2": col2 + 2}) diff --git a/python/tests/fixtures/feature_fixtures.json b/python/tests/fixtures/feature_fixtures.json index 1d486c0cc4..c9b330768e 100644 --- a/python/tests/fixtures/feature_fixtures.json +++ b/python/tests/fixtures/feature_fixtures.json @@ -9,6 +9,22 @@ "partition": false, "primary": true, "type": "int", + "on_demand": false, + "description": "test_description", + "feature_group": null + } + }, + "get_on_demand": { + "response": { + "defaultValue": "1", + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int", + "on_demand": true, "description": "test_description", "feature_group": null } diff --git a/python/tests/fixtures/feature_group_fixtures.json b/python/tests/fixtures/feature_group_fixtures.json index 484a9e288d..c2394ed4cb 100644 --- a/python/tests/fixtures/feature_group_fixtures.json +++ b/python/tests/fixtures/feature_group_fixtures.json @@ -630,5 +630,127 @@ "version": 1 }, "headers": null + }, + "get_transformations": { + "response": { + "type": "cachedFeaturegroupDTO", + "validation_type": "test_validation_type", + "created": "2022-08-01T11:07:55Z", + "creator": { + "email": "admin@hopsworks.ai", + "firstName": "Admin", + "lastName": "Admin", + "maxNumProjects": 0, + "numActiveProjects": 0, + "numRemainingProjects": 0, + "status": 0, + "testUser": false, + "tos": false, + "toursState": 0, + "twoFactor": false + }, + "description": "test_description", + "featurestoreId": 67, + "featurestoreName": "test_featurestore", + "id": 15, + "location": "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1", + "name": "fg_test", + "statisticsConfig": { + "columns": [], + "correlations": false, + "enabled": true, + "exactUniqueness": false, + "histograms": false + }, + "version": 1, + "features": [ + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + }, + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": false, + "name": "stringt", + "onlineType": "varchar(1000)", + "partition": false, + "primary": false, + "type": "string" + } + ], + "transformation_functions":[ + { + "id" : 1, + "version": 2, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_two(data1 : pd.Series):\n return data1 + 2\n", + "name": "add_two", + "outputTypes":["double"], + "transformationFeatures":["data"], + "dropped_features":["data1"] + } + }, + { + "id" : 2, + "version": 1, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":["double"], + "transformationFeatures":["col1"], + "dropped_features":["data1"] + } + } + ], + "onlineTopicName": 
"119_15_fg_test_1_onlinefs", + "onlineEnabled": true, + "timeTravelFormat": "HUDI", + "expectationSuite": { + "expectation_suite_name": "test_expectation_suite_name", + "expectations": [ + { + "expectation_type": "1", + "kwargs": "{ \"kwargs_key\": \"kwargs_value\" }", + "meta": "{ \"meta_key\": \"meta_value\" }", + "id": 32 + } + ], + "meta": "{ \"great_expectations_version\": \"0.15.12\", \"key\": \"value\" }", + "id": 21, + "data_asset_type": "test_data_asset_type", + "ge_cloud_id": "test_ge_cloud_id", + "run_validation": "test_run_validation", + "validation_ingestion_policy": "test_validation_ingestion_policy", + "feature_store_id": 67, + "feature_group_id": 15, + "href": "test_/featurestores/67/featuregroups/15/expectationsuite", + "expand": "test_expand", + "items": "test_items", + "type": "expectationSuiteDTO", + "created": "test_created" + } + }, + "method": "GET", + "path_params": [ + "project", + "119", + "featurestores", + 67, + "featuregroups", + "fg_test" + ], + "query_params": { + "version": 1 + }, + "headers": null } } diff --git a/python/tests/fixtures/training_dataset_feature_fixtures.json b/python/tests/fixtures/training_dataset_feature_fixtures.json index f48fd0fabd..27cd07f302 100644 --- a/python/tests/fixtures/training_dataset_feature_fixtures.json +++ b/python/tests/fixtures/training_dataset_feature_fixtures.json @@ -65,6 +65,84 @@ "label": "test_label" } }, + "get_transformations": { + "response": { + "name": "test_name", + "type": "test_type", + "index": "test_index", + "transformation_function": { + "id" : 2, + "version": 1, + "featurestoreId": 11, + "hopsworksUdf":{ + "sourceCode": "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n", + "name": "add_one_fs", + "outputTypes":["double"], + "transformationFeatures":["col1"], + "dropped_features":["data1"] + } + }, + "featuregroup": { + "type": "cachedFeaturegroupDTO", + "validation_type": "test_validation_type", + "created": "2022-08-01T11:07:55Z", + "creator": { + "email": "admin@hopsworks.ai", + "firstName": "Admin", + "lastName": "Admin", + "maxNumProjects": 0, + "numActiveProjects": 0, + "numRemainingProjects": 0, + "status": 0, + "testUser": false, + "tos": false, + "toursState": 0, + "twoFactor": false + }, + "description": "test_description", + "featurestoreId": 67, + "featurestoreName": "test_featurestore", + "id": 15, + "location": "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1", + "name": "fg_test", + "statisticsConfig": { + "columns": [], + "correlations": false, + "enabled": true, + "exactUniqueness": false, + "histograms": false + }, + "version": 1, + "features": [ + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": true, + "name": "intt", + "onlineType": "int", + "partition": false, + "primary": true, + "type": "int" + }, + { + "defaultValue": null, + "featureGroupId": 15, + "hudiPrecombineKey": false, + "name": "stringt", + "onlineType": "varchar(1000)", + "partition": false, + "primary": false, + "type": "string" + } + ], + "onlineTopicName": "119_15_fg_test_1_onlinefs", + "onlineEnabled": true, + "timeTravelFormat": "HUDI" + }, + "feature_group_feature_name": "test_feature_group_feature_name", + "label": "test_label" + } + }, "get_fraud_online_training_dataset_features": { "response": [ { diff --git a/python/tests/test_builtin_transformation.py b/python/tests/test_builtin_transformation.py deleted file mode 100644 index 4a8a01af9c..0000000000 --- a/python/tests/test_builtin_transformation.py +++ /dev/null @@ -1,81 +0,0 @@ -# -# 
Copyright 2024 Hopsworks AB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import hsfs.engine as engine -import pandas as pd -from hsfs.builtin_transformations import ( - min_max_scaler, -) -from hsfs.core.feature_descriptive_statistics import FeatureDescriptiveStatistics -from hsfs.engine import python -from hsfs.hopsworks_udf import UDFType - - -class TestBuiltinTransformations: - @staticmethod - def validate_transformations_python( - transformed_outputs, expected_output, expected_col_names - ): - if isinstance(transformed_outputs, pd.Series): - assert transformed_outputs.name == expected_col_names - else: - assert all(transformed_outputs.columns == expected_col_names) - assert all(transformed_outputs.values == expected_output.values) - - def test_min_max_scaler(self): - test_dataframe = pd.DataFrame( - { - "col1": [1, 2, 3, 4], - "col2": [1.2, 3.4, 5.6, 9.1], - } - ) - statistics_df = test_dataframe.describe().to_dict() - - # Test case 1 : Integer column - min_max_scaler_col1 = min_max_scaler("col1") - min_max_scaler_col1.udf_type = UDFType.MODEL_DEPENDENT - - min_max_scaler_col1.transformation_statistics = [ - FeatureDescriptiveStatistics( - feature_name="col1", - min=statistics_df["col1"]["min"], - max=statistics_df["col1"]["max"], - ) - ] - - expected_df = (test_dataframe["col1"] - test_dataframe["col1"].min()) / ( - test_dataframe["col1"].max() - test_dataframe["col1"].min() - ) - - # Test with python engine - engine.set_instance(engine=python.Engine(), engine_type="python") - - transformed_df = min_max_scaler_col1.get_udf()(test_dataframe["col1"]) - TestBuiltinTransformations.validate_transformations_python( - transformed_outputs=transformed_df, - expected_output=expected_df, - expected_col_names="min_max_scaler_col1_", - ) - - # Test with spark engine - engine.set_instance(engine=python.Engine(), engine_type="python") - - transformed_df = min_max_scaler_col1.get_udf()(test_dataframe["col1"]) - TestBuiltinTransformations.validate_transformations_python( - transformed_outputs=transformed_df, - expected_output=expected_df, - expected_col_names="min_max_scaler_col1_", - ) diff --git a/python/tests/test_feature.py b/python/tests/test_feature.py index 8194035040..61ce72f288 100644 --- a/python/tests/test_feature.py +++ b/python/tests/test_feature.py @@ -36,6 +36,26 @@ def test_from_response_json(self, backend_fixtures): assert f.online_type == "int" assert f.default_value == "1" # default value should be specified as string assert f._feature_group_id == 15 + assert not f.on_demand + + def test_from_response_json_on_demand(self, backend_fixtures): + # Arrange + json = backend_fixtures["feature"]["get_on_demand"]["response"] + + # Act + f = feature.Feature.from_response_json(json) + + # Assert + assert f.name == "intt" + assert f.type == "int" + assert f.description == "test_description" + assert f.primary is True + assert f.partition is False + assert f.hudi_precombine_key is True + assert f.online_type == "int" + assert f.default_value == "1" # default value should be specified as 
string + assert f._feature_group_id == 15 + assert f.on_demand def test_from_response_json_basic_info(self, backend_fixtures): # Arrange diff --git a/python/tests/test_feature_group.py b/python/tests/test_feature_group.py index 56b870d23e..8e2ba67cdf 100644 --- a/python/tests/test_feature_group.py +++ b/python/tests/test_feature_group.py @@ -32,6 +32,7 @@ ) from hsfs.client.exceptions import FeatureStoreException, RestAPIError from hsfs.engine import python +from hsfs.hopsworks_udf import UDFType engine.init("python") @@ -145,7 +146,7 @@ def test_from_response_json_basic_info(self, backend_fixtures): assert fg._feature_store_id == 67 assert fg.description == "" assert fg.partition_key == [] - assert fg.primary_key == ['intt'] + assert fg.primary_key == ["intt"] assert fg.hudi_precombine_key is None assert fg._feature_store_name is None assert fg.created is None @@ -322,7 +323,7 @@ def test_constructor_with_list_event_time_for_compatibility( version=1, description="fg_description", event_time=["event_date"], - features=features + features=features, ) with pytest.raises(FeatureStoreException): util.verify_attribute_key_names(new_fg, False) @@ -885,3 +886,57 @@ def test_feature_group_save_expectation_suite_from_hopsworks_type( mock_print.call_args[0][0][:63] == "Updated expectation suite attached to Feature Group, edit it at" ) + + def test_from_response_json_transformation_functions(self, backend_fixtures): + # Arrange + json = backend_fixtures["feature_group"]["get_transformations"]["response"] + + # Act + fg = feature_group.FeatureGroup.from_response_json(json) + + # Assert + assert fg.name == "fg_test" + assert fg.version == 1 + assert fg._feature_store_id == 67 + assert fg.description == "test_description" + assert fg.partition_key == [] + assert fg.primary_key == ["intt"] + assert fg.hudi_precombine_key == "intt" + assert fg._feature_store_name == "test_featurestore" + assert fg.created == "2022-08-01T11:07:55Z" + assert len(fg.transformation_functions) == 2 + assert ( + fg.transformation_functions[0].hopsworks_udf.function_name == "add_one_fs" + ) + assert fg.transformation_functions[1].hopsworks_udf.function_name == "add_two" + assert ( + fg.transformation_functions[0].hopsworks_udf._function_source + == "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" + ) + assert ( + fg.transformation_functions[1].hopsworks_udf._function_source + == "\n@udf(float)\ndef add_two(data1 : pd.Series):\n return data1 + 2\n" + ) + assert ( + fg.transformation_functions[0].hopsworks_udf.udf_type == UDFType.ON_DEMAND + ) + assert ( + fg.transformation_functions[1].hopsworks_udf.udf_type == UDFType.ON_DEMAND + ) + assert isinstance(fg.creator, user.User) + assert fg.id == 15 + assert len(fg.features) == 2 + assert isinstance(fg.features[0], feature.Feature) + assert ( + fg.location + == "hopsfs://10.0.2.15:8020/apps/hive/warehouse/test_featurestore.db/fg_test_1" + ) + assert fg.online_enabled is True + assert fg.time_travel_format == "HUDI" + assert isinstance(fg.statistics_config, statistics_config.StatisticsConfig) + assert fg._online_topic_name == "119_15_fg_test_1_onlinefs" + assert fg.event_time is None + assert fg.stream is False + assert ( + fg.expectation_suite.expectation_suite_name == "test_expectation_suite_name" + ) diff --git a/python/tests/test_feature_view.py b/python/tests/test_feature_view.py index a45093126b..57aa5c1b93 100644 --- a/python/tests/test_feature_view.py +++ b/python/tests/test_feature_view.py @@ -18,7 +18,7 @@ from hsfs import feature_view, 
training_dataset_feature from hsfs.constructor import fs_query, query from hsfs.feature_store import FeatureStore -from hsfs.hopsworks_udf import udf +from hsfs.hopsworks_udf import UDFType, udf class TestFeatureView: @@ -106,6 +106,15 @@ def test_from_response_json_transformation_function(self, mocker, backend_fixtur fv.transformation_functions[1].hopsworks_udf._function_source == "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" ) + assert ( + fv.transformation_functions[0].hopsworks_udf.udf_type + == UDFType.MODEL_DEPENDENT + ) + assert ( + fv.transformation_functions[1].hopsworks_udf.udf_type + == UDFType.MODEL_DEPENDENT + ) + assert len(fv.schema) == 2 assert isinstance(fv.schema[0], training_dataset_feature.TrainingDatasetFeature) diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py index fe9531b751..dfef840067 100644 --- a/python/tests/test_hopswork_udf.py +++ b/python/tests/test_hopswork_udf.py @@ -336,6 +336,21 @@ def test_func(col1): test_func.udf_type = UDFType.ON_DEMAND assert test_func._get_output_column_names() == ["test_func"] + def test_generate_output_column_names_one_argument_one_output_type_prefix(self): + @udf(int) + def test_func(col1): + return col1 + 1 + + test_func._feature_name_prefix = "prefix_" + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == ["test_func_prefix_col1_"] + assert test_func.output_column_names == ["prefix_test_func_prefix_col1_"] + + test_func.udf_type = UDFType.ON_DEMAND + assert test_func._get_output_column_names() == ["test_func"] + assert test_func.output_column_names == ["prefix_test_func"] + def test_generate_output_column_names_multiple_argument_one_output_type(self): @udf(int) def test_func(col1, col2, col3): @@ -346,6 +361,26 @@ def test_func(col1, col2, col3): test_func.udf_type = UDFType.ON_DEMAND assert test_func._get_output_column_names() == ["test_func"] + def test_generate_output_column_names_multiple_argument_one_output_type_prefix( + self, + ): + @udf(int) + def test_func(col1, col2, col3): + return col1 + 1 + + test_func._feature_name_prefix = "prefix_" + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == [ + "test_func_prefix_col1_prefix_col2_prefix_col3_" + ] + assert test_func.output_column_names == [ + "prefix_test_func_prefix_col1_prefix_col2_prefix_col3_" + ] + test_func.udf_type = UDFType.ON_DEMAND + assert test_func._get_output_column_names() == ["test_func"] + assert test_func.output_column_names == ["prefix_test_func"] + def test_generate_output_column_names_single_argument_multiple_output_type(self): @udf([int, float, int]) def test_func(col1): @@ -360,6 +395,29 @@ def test_func(col1): "test_func_col1_2", ] + def test_generate_output_column_names_single_argument_multiple_output_type_prefix( + self, + ): + @udf([int, float, int]) + def test_func(col1): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col1 + 1], "col3": [col1 + 1]} + ) + + test_func._feature_name_prefix = "prefix_" + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == [ + "test_func_prefix_col1_0", + "test_func_prefix_col1_1", + "test_func_prefix_col1_2", + ] + assert test_func.output_column_names == [ + "prefix_test_func_prefix_col1_0", + "prefix_test_func_prefix_col1_1", + "prefix_test_func_prefix_col1_2", + ] + def test_generate_output_column_names_multiple_argument_multiple_output_type(self): @udf([int, float, int]) def test_func(col1, col2, col3): @@ -374,6 
+432,91 @@ def test_func(col1, col2, col3): "test_func_col1_col2_col3_2", ] + def test_generate_output_column_names_multiple_argument_multiple_output_type_prefix( + self, + ): + @udf([int, float, int]) + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func._feature_name_prefix = "prefix_" + + test_func.udf_type = UDFType.MODEL_DEPENDENT + assert test_func._get_output_column_names() == [ + "test_func_prefix_col1_prefix_col2_prefix_col3_0", + "test_func_prefix_col1_prefix_col2_prefix_col3_1", + "test_func_prefix_col1_prefix_col2_prefix_col3_2", + ] + assert test_func.output_column_names == [ + "prefix_test_func_prefix_col1_prefix_col2_prefix_col3_0", + "prefix_test_func_prefix_col1_prefix_col2_prefix_col3_1", + "prefix_test_func_prefix_col1_prefix_col2_prefix_col3_2", + ] + + def test_drop_features_one_element(self): + @udf([int, float, int], drop="col1") + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func.udf_type = UDFType.MODEL_DEPENDENT + + assert test_func.dropped_features == ["col1"] + + def test_drop_features_one_element_prefix(self): + @udf([int, float, int], drop="col1") + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func._feature_name_prefix = "prefix_" + test_func.udf_type = UDFType.MODEL_DEPENDENT + + assert test_func._dropped_features == ["col1"] + assert test_func.dropped_features == ["prefix_col1"] + + def test_drop_features_multiple_element(self): + @udf([int, float, int], drop=["col1", "col2"]) + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func.udf_type = UDFType.MODEL_DEPENDENT + + assert test_func.dropped_features == ["col1", "col2"] + + def test_drop_features_multiple_element_prefix(self): + @udf([int, float, int], drop=["col1", "col2"]) + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + test_func._feature_name_prefix = "prefix_" + test_func.udf_type = UDFType.MODEL_DEPENDENT + + assert test_func._dropped_features == ["col1", "col2"] + assert test_func.dropped_features == ["prefix_col1", "prefix_col2"] + + def test_drop_features_invalid(self): + with pytest.raises(FeatureStoreException) as exp: + + @udf([int, float, int], drop=["col1", "invalid_col"]) + def test_func(col1, col2, col3): + return pd.DataFrame( + {"col1": [col1 + 1], "col2": [col2 + 1], "col3": [col3 + 1]} + ) + + assert ( + str(exp.value) + == "Cannot drop features 'invalid_col' as they are not features given as arguments in the defined UDF." 
+ ) + def test_create_pandas_udf_return_schema_from_list_one_output_type(self): @udf(int) def test_func(col1): @@ -460,6 +603,16 @@ def test_func(col1): assert test_func("new_feature").transformation_features == ["new_feature"] assert test_func("new_feature").statistics_features == [] + # Test with prefix + test_func._feature_name_prefix = "prefix_" + assert test_func.transformation_features == ["prefix_col1"] + assert test_func.statistics_features == [] + + assert test_func("new_feature").transformation_features == [ + "prefix_new_feature" + ] + assert test_func("new_feature").statistics_features == [] + def test_HopsworkUDf_call_one_argument_statistics(self): from hsfs.transformation_statistics import TransformationStatistics @@ -477,6 +630,18 @@ def test_func(col1, statistics=stats): assert test_func("new_feature").statistics_features == ["new_feature"] assert test_func("new_feature")._statistics_argument_names == ["col1"] + # Test with prefix + test_func._feature_name_prefix = "prefix_" + assert test_func.transformation_features == ["prefix_col1"] + assert test_func.statistics_features == ["col1"] + assert test_func._statistics_argument_names == ["col1"] + + assert test_func("new_feature").transformation_features == [ + "prefix_new_feature" + ] + assert test_func("new_feature").statistics_features == ["new_feature"] + assert test_func("new_feature")._statistics_argument_names == ["col1"] + def test_HopsworkUDf_call_multiple_argument_statistics(self): from hsfs.transformation_statistics import TransformationStatistics @@ -495,3 +660,121 @@ def test_func(col1, col2, col3, statistics=stats): "col1", "col3", ] + + def test_validate_and_convert_drop_features(self): + dropped_features = "feature1" + transformation_feature = ["feature1", "feature2"] + feature_name_prefix = None + + dropped_features = HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert dropped_features == ["feature1"] + + def test_validate_and_convert_drop_features_dropped_list(self): + dropped_features = ["feature1", "feature2"] + transformation_feature = ["feature1", "feature2", "feature3"] + feature_name_prefix = None + + dropped_features = HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert dropped_features == ["feature1", "feature2"] + + def test_validate_and_convert_drop_features_dropped_invalid(self): + dropped_features = "feature4" + transformation_feature = ["feature1", "feature2", "feature3"] + feature_name_prefix = None + + with pytest.raises(FeatureStoreException) as exp: + HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert ( + str(exp.value) + == "Cannot drop features 'feature4' as they are not features given as arguments in the defined UDF." + ) + + def test_validate_and_convert_drop_features_dropped_invalid_list(self): + dropped_features = ["feature4", "feature5"] + transformation_feature = ["feature1", "feature2", "feature3"] + feature_name_prefix = None + + with pytest.raises(FeatureStoreException) as exp: + HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert ( + str(exp.value) + == "Cannot drop features 'feature4', 'feature5' as they are not features given as arguments in the defined UDF." 
+ ) + + def test_validate_and_convert_drop_features_dropped_list_prefix(self): + dropped_features = ["feature1", "feature2"] + transformation_feature = ["test_feature1", "test_feature2", "test_feature3"] + feature_name_prefix = "test_" + + dropped_features = HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert dropped_features == ["feature1", "feature2"] + + def test_validate_and_convert_drop_features_dropped_prefix_invalid(self): + dropped_features = ["feature1", "feature2"] + transformation_feature = ["feature1", "feature2", "feature3"] + feature_name_prefix = "test_" + + with pytest.raises(FeatureStoreException) as exp: + HopsworksUdf._validate_and_convert_drop_features( + dropped_features, transformation_feature, feature_name_prefix + ) + + assert ( + str(exp.value) + == "Cannot drop features 'test_feature1', 'test_feature2' as they are not features given as arguments in the defined UDF." + ) + + def test_validate_udf_type_None(self): + @udf(int) + def test_func(col1): + return col1 + 1 + + with pytest.raises(FeatureStoreException) as exe: + test_func._validate_udf_type() + + assert str(exe.value) == "UDF Type cannot be None" + + def test_validate_udf_type_on_demand_multiple_output(self): + @udf([int, float]) + def test_func(col1, col2): + return pd.DataFrame({"out1": col1 + 1, "out2": col2 + 2}) + + with pytest.raises(FeatureStoreException) as exe: + test_func.udf_type = UDFType.ON_DEMAND + + assert ( + str(exe.value) + == "On-Demand Transformation functions can only return one column as output" + ) + + def test_validate_udf_type_on_demand_statistics(self): + from hsfs.transformation_statistics import TransformationStatistics + + stats = TransformationStatistics("col1") + + @udf(int) + def test_func(col1, statistics=stats): + return col1 + statistics.col1.mean + + with pytest.raises(FeatureStoreException) as exe: + test_func.udf_type = UDFType.ON_DEMAND + + assert ( + str(exe.value) + == "On-Demand Transformation functions cannot use statistics, please remove statistics parameters from the functions" + ) diff --git a/python/tests/test_training_dataset_feature.py b/python/tests/test_training_dataset_feature.py index dc5af26112..81c7fd6d14 100644 --- a/python/tests/test_training_dataset_feature.py +++ b/python/tests/test_training_dataset_feature.py @@ -16,6 +16,7 @@ from hsfs import feature_group, training_dataset_feature +from hsfs.hopsworks_udf import UDFType class TestTrainingDatasetFeature: @@ -38,6 +39,40 @@ def test_from_response_json(self, backend_fixtures): ) assert td_feature.label == "test_label" + def test_from_response_json_on_demand_transformation(self, backend_fixtures): + # Arrange + json = backend_fixtures["training_dataset_feature"]["get_transformations"][ + "response" + ] + + # Act + td_feature = training_dataset_feature.TrainingDatasetFeature.from_response_json( + json + ) + + # Assert + assert td_feature.name == "test_name" + assert td_feature.type == "test_type" + assert td_feature.index == "test_index" + assert ( + td_feature.on_demand_transformation_function.hopsworks_udf.function_name + == "add_one_fs" + ) + + assert ( + td_feature.on_demand_transformation_function.hopsworks_udf._function_source + == "\n@udf(float)\ndef add_one_fs(data1 : pd.Series):\n return data1 + 1\n" + ) + assert ( + td_feature.on_demand_transformation_function.hopsworks_udf.udf_type + == UDFType.ON_DEMAND + ) + assert isinstance(td_feature._feature_group, feature_group.FeatureGroup) + assert ( + 
td_feature._feature_group_feature_name == "test_feature_group_feature_name"
+        )
+        assert td_feature.label == "test_label"
+
     def test_from_response_json_basic_info(self, backend_fixtures):
         # Arrange
         json = backend_fixtures["training_dataset_feature"]["get_basic_info"][

From 99001d22dd380765ce20bbdc08d8a41e43c406f3 Mon Sep 17 00:00:00 2001
From: manu-sj
Date: Thu, 4 Jul 2024 12:24:46 +0200
Subject: [PATCH 56/58] adding documentation

---
 .../core/transformation_function_engine.py |  9 ++++++
 python/hsfs/feature_store.py               | 32 ++++++++++++++++---
 python/hsfs/feature_view.py                |  6 ++--
 python/hsfs/hopsworks_udf.py               | 26 ++++++++++++---
 4 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/python/hsfs/core/transformation_function_engine.py b/python/hsfs/core/transformation_function_engine.py
index 6bdbff13c9..4ab8c6a166 100644
--- a/python/hsfs/core/transformation_function_engine.py
+++ b/python/hsfs/core/transformation_function_engine.py
@@ -147,6 +147,15 @@ def get_ready_to_use_transformation_fns(
         feature_view: feature_view.FeatureView,
         training_dataset_version: Optional[int] = None,
     ) -> List[transformation_function.TransformationFunction]:
+        """
+        Function that updates the statistics required by all transformation functions in the feature view, based on the training dataset version.
+
+        # Arguments
+            feature_view `FeatureView`: The feature view in which the training data is being created.
+            training_dataset_version `int`: The training dataset version whose statistics are used to update the transformation functions.
+        # Returns
+            `List[transformation_function.TransformationFunction]` : List of transformation functions.
+        """
         # check if transformation functions require statistics
         is_stat_required = any(
             [
diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py
index 4da096d80c..2ec47f312e 100644
--- a/python/hsfs/feature_store.py
+++ b/python/hsfs/feature_store.py
@@ -521,13 +521,26 @@ def create_feature_group(
             # connect to the Feature Store
             fs = ...

+            # define the on-demand transformation functions
+            @udf(int)
+            def plus_one(value):
+                return value + 1
+
+            @udf(int)
+            def plus_two(value):
+                return value + 2
+
+            # construct a list of "transformation functions" on features
+            transformation_functions = [plus_one("feature1"), plus_two("feature2")]
+
             fg = fs.create_feature_group(
                     name='air_quality',
                     description='Air Quality characteristics of each day',
                     version=1,
                     primary_key=['city','date'],
                     online_enabled=True,
-                    event_time='date'
+                    event_time='date',
+                    transformation_functions=transformation_functions
                 )
             ```
@@ -595,7 +608,9 @@ def create_feature_group(
                 defaults to using project topic.
             notification_topic_name: Optionally, define the name of the topic used for sending notifications
                 when entries are inserted or updated on the online feature store. If left undefined no notifications are sent.
-            transformation_functions: A list of Hopsworks UDF's. Defaults to `None`, no transformations.
+            transformation_functions: On-Demand Transformation functions attached to the feature group.
+                It can be a list of user-defined functions defined using the hopsworks `@udf` decorator.
+                Defaults to `None`, no transformations.

         # Returns
             `FeatureGroup`. The feature group metadata object.
@@ -669,6 +684,7 @@ def get_or_create_feature_group(
                 primary_key=["day", "area"],
                 online_enabled=True,
                 event_time="timestamp",
+                transformation_functions=transformation_functions,
             )
             ```
@@ -734,7 +750,9 @@ def get_or_create_feature_group(
                 defaults to using project topic.
             notification_topic_name: Optionally, define the name of the topic used for sending notifications
                 when entries are inserted or updated on the online feature store. If left undefined no notifications are sent.
-            transformation_functions: A list of Hopsworks UDF's. Defaults to `None`, no transformations.
+            transformation_functions: On-Demand Transformation functions attached to the feature group.
+                It can be a list of user-defined functions defined using the hopsworks `@udf` decorator.
+                Defaults to `None`, no transformations.

         # Returns
             `FeatureGroup`. The feature group metadata object.
@@ -1543,7 +1561,9 @@ def plus_one(value):
                 Training helper columns can be optionally fetched with training data. For more details see
                 documentation for feature view's get training data methods. Defaults to `[], no training helper
                 columns.
-            transformation_functions: A list of Hopsworks UDF's. Defaults to `None`, no transformations.
+            transformation_functions: Model Dependent Transformation functions attached to the feature view.
+                It can be a list of user-defined functions defined using the hopsworks `@udf` decorator.
+                Defaults to `None`, no transformations.

         # Returns:
             `FeatureView`: The feature view metadata object.
@@ -1618,7 +1638,9 @@ def get_or_create_feature_view(
                 Training helper columns can be optionally fetched with training data. For more details see
                 documentation for feature view's get training data methods. Defaults to `[], no training helper
                 columns.
-            transformation_functions: A list of Hopsworks UDF's. Defaults to `None`, no transformations.
+            transformation_functions: Model Dependent Transformation functions attached to the feature view.
+                It can be a list of user-defined functions defined using the hopsworks `@udf` decorator.
+                Defaults to `None`, no transformations.

         # Returns:
             `FeatureView`: The feature view metadata object.
diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py
index f2f5019160..0045ecd713 100644
--- a/python/hsfs/feature_view.py
+++ b/python/hsfs/feature_view.py
@@ -568,7 +568,7 @@ def get_feature_vector(
             force_sql_client: boolean, defaults to False. If set to True, reads from online feature store
                 using the SQL client if initialised.
             allow_missing: Setting to `True` returns feature vectors with missing values.
-            request_parameters: Request parameters required by on-demand transformation functions.
+            request_parameters: Request parameters required by on-demand transformation functions to compute on-demand features present in the feature view.

         # Returns
             `list`, `pd.DataFrame`, `polars.DataFrame` or `np.ndarray` if `return type` is set to `"list"`, `"pandas"`, `"polars"` or `"numpy"`
@@ -678,6 +678,7 @@ def get_feature_vectors(
             force_rest_client: boolean, defaults to False. If set to True, reads from online feature store
                 using the REST client if initialised.
             allow_missing: Setting to `True` returns feature vectors with missing values.
+            request_parameters: Request parameters required by on-demand transformation functions to compute on-demand features present in the feature view.

         # Returns
             `List[list]`, `pd.DataFrame`, `polars.DataFrame` or `np.ndarray` if `return type` is set to `"list", `"pandas"`,`"polars"` or `"numpy"`
@@ -859,9 +860,6 @@ def find_neighbors(
             the number of results returned may be less than k. Try using a large
             value of k and extract the top k items from the results if needed.

-        !!! warning "Duplicate column error in Polars"
-            If the feature view has duplicate column names, attempting to create a polars DataFrame
-            will raise an error. To avoid this, set `return_type` to `"list"` or `"pandas"`.

         # Arguments
             embedding: The target embedding for which neighbors are to be found.
diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py
index 10e8135293..f75c9f861e 100644
--- a/python/hsfs/hopsworks_udf.py
+++ b/python/hsfs/hopsworks_udf.py
@@ -33,6 +33,10 @@
 class UDFType(Enum):
+    """
+    Class that stores the possible types of transformation functions.
+    """
+
     MODEL_DEPENDENT = "model_dependent"
     ON_DEMAND = "on_demand"
@@ -111,11 +115,14 @@ class HopsworksUdf:
     Attributes
     ----------
-    output_type (List[str]) : Output types of the columns returned from the UDF.
     function_name (str) : Name of the UDF
-    statistics_required (bool) : True if statistics is required for any of the parameters of the UDF.
-    transformation_statistics (Dict[str, FeatureDescriptiveStatistics]): Dictionary that maps the statistics_argument name in the function to the actual statistics variable.
+    udf_type (UDFType): Type of the UDF, either \"model dependent\" or \"on-demand\".
+    return_types (List[str]): The data types of the columns returned from the UDF.
     transformation_features (List[str]) : List of feature names to which the transformation function would be applied.
+    output_column_names (List[str]): Column names of the DataFrame returned after application of the transformation function.
+    dropped_features (List[str]): List of features that will be dropped after the UDF is applied.
+    transformation_statistics (Dict[str, FeatureDescriptiveStatistics]): Dictionary that maps the statistics_argument name in the function to the actual statistics variable.
+    statistics_required (bool) : True if statistics is required for any of the parameters of the UDF.
     statistics_features (List[str]) : List of feature names that requires statistics.
@@ -715,6 +722,12 @@ def from_response_json(
         return hopsworks_udf

     def _validate_udf_type(self):
+        """
+        Function that validates whether the defined transformation function can be used for the specified UDF type.
+
+        # Raises
+            `hsfs.client.exceptions.FeatureStoreException` : If the UDF type is None, or if an on-demand transformation function outputs multiple columns or uses statistics.
+        """
         if self.udf_type is None:
             raise FeatureStoreException("UDF Type cannot be None")
@@ -785,7 +798,7 @@ def transformation_features(self) -> List[str]:
     @property
     def statistics_features(self) -> List[str]:
         """
-        list of feature names that require statistics
+        List of feature names that require statistics
         """
         return [
             transformation_feature.feature_name
@@ -806,7 +819,7 @@ def _statistics_argument_mapping(self) -> Dict[str, str]:
     @property
     def _statistics_argument_names(self) -> List[str]:
         """
-        list of argument names required for statistics
+        List of argument names required for statistics
         """
         return [
             transformation_feature.statistic_argument_name
@@ -827,6 +840,9 @@ def udf_type(self, udf_type: UDFType) -> None:
     @property
     def dropped_features(self) -> List[str]:
+        """
+        List of features that will be dropped after the UDF is applied.
+ """ if self._feature_name_prefix: return [ self._feature_name_prefix + dropped_feature From c71af3bb16db654876a54da7530a448b3c07dc20 Mon Sep 17 00:00:00 2001 From: manu-sj Date: Mon, 8 Jul 2024 11:09:40 +0200 Subject: [PATCH 57/58] adopting changes in backend for UI --- python/hsfs/engine/python.py | 7 +- python/hsfs/engine/spark.py | 7 +- python/hsfs/feature_group.py | 34 ++++---- python/hsfs/feature_view.py | 34 ++++---- python/hsfs/hopsworks_udf.py | 85 +++++++++++++------ python/hsfs/transformation_function.py | 4 +- ...t_python_spark_transformation_functions.py | 6 +- .../fixtures/feature_group_fixtures.json | 4 +- .../tests/fixtures/feature_view_fixtures.json | 4 +- .../training_dataset_feature_fixtures.json | 2 +- .../transformation_function_fixtures.json | 14 +-- 11 files changed, 123 insertions(+), 78 deletions(-) diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index 9c2a4ca279..b0efd7be0e 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -827,7 +827,8 @@ def parse_schema_feature_group( on_demand=True, ) ) - dropped_features.extend(tf.hopsworks_udf.dropped_features) + if tf.hopsworks_udf.dropped_features: + dropped_features.extend(tf.hopsworks_udf.dropped_features) for feat_name in arrow_schema.names: name = util.autofix_feature_name(feat_name) try: @@ -1364,8 +1365,8 @@ def _apply_transformation_function( raise FeatureStoreException( f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." ) - - dropped_features.update(tf.hopsworks_udf.dropped_features) + if tf.hopsworks_udf.dropped_features: + dropped_features.update(tf.hopsworks_udf.dropped_features) dataset = pd.concat( [ dataset, diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 60f5f14854..322e9e993a 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -1145,7 +1145,8 @@ def parse_schema_feature_group( on_demand=True, ) ) - dropped_features.extend(tf.hopsworks_udf.dropped_features) + if tf.hopsworks_udf.dropped_features: + dropped_features.extend(tf.hopsworks_udf.dropped_features) using_hudi = time_travel_format == "HUDI" for feat in dataframe.schema: @@ -1290,8 +1291,8 @@ def _apply_transformation_function( raise FeatureStoreException( f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." 
) - - dropped_features.update(tf.hopsworks_udf.dropped_features) + if tf.hopsworks_udf.dropped_features: + dropped_features.update(tf.hopsworks_udf.dropped_features) pandas_udf = hopsworks_udf.get_udf() output_col_name = hopsworks_udf.output_column_names[0] diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py index 0bbeb26552..8240f115e9 100644 --- a/python/hsfs/feature_group.py +++ b/python/hsfs/feature_group.py @@ -2135,21 +2135,25 @@ def __init__( self._writer: Optional[callable] = None # On-Demand Transformation Functions - self._transformation_functions: List[TransformationFunction] = ( - [ - TransformationFunction( - featurestore_id, - hopsworks_udf=transformation_function, - version=1, - transformation_type=UDFType.ON_DEMAND, - ) - if not isinstance(transformation_function, TransformationFunction) - else transformation_function - for transformation_function in transformation_functions - ] - if transformation_functions - else [] - ) + self._transformation_functions: List[TransformationFunction] = [] + + if transformation_functions: + for transformation_function in transformation_functions: + if not isinstance(transformation_function, TransformationFunction): + self._transformation_functions.append( + TransformationFunction( + featurestore_id, + hopsworks_udf=transformation_function, + version=1, + transformation_type=UDFType.ON_DEMAND, + ) + ) + else: + if not transformation_function.hopsworks_udf.udf_type: + transformation_function.hopsworks_udf.udf_type = ( + UDFType.ON_DEMAND + ) + self._transformation_functions.append(transformation_function) if self._transformation_functions: self._transformation_functions = ( diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 0045ecd713..fc9151ae94 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -120,21 +120,25 @@ def __init__( training_helper_columns if training_helper_columns else [] ) - self._transformation_functions: List[TransformationFunction] = ( - [ - TransformationFunction( - self.featurestore_id, - hopsworks_udf=transformation_function, - version=1, - transformation_type=UDFType.MODEL_DEPENDENT, - ) - if not isinstance(transformation_function, TransformationFunction) - else transformation_function - for transformation_function in transformation_functions - ] - if transformation_functions - else [] - ) + self._transformation_functions: List[TransformationFunction] = [] + + if transformation_functions: + for transformation_function in transformation_functions: + if not isinstance(transformation_function, TransformationFunction): + self._transformation_functions.append( + TransformationFunction( + self.featurestore_id, + hopsworks_udf=transformation_function, + version=1, + transformation_type=UDFType.MODEL_DEPENDENT, + ) + ) + else: + if not transformation_function.hopsworks_udf.udf_type: + transformation_function.hopsworks_udf.udf_type = ( + UDFType.MODEL_DEPENDENT + ) + self._transformation_functions.append(transformation_function) if self._transformation_functions: self._transformation_functions = FeatureView._sort_transformation_functions( diff --git a/python/hsfs/hopsworks_udf.py b/python/hsfs/hopsworks_udf.py index f75c9f861e..697eb06f38 100644 --- a/python/hsfs/hopsworks_udf.py +++ b/python/hsfs/hopsworks_udf.py @@ -75,7 +75,9 @@ def add_one(data1 : pd.Series): """ def wrapper(func: Callable) -> HopsworksUdf: - udf = HopsworksUdf(func=func, return_types=return_type, dropped_features=drop) + udf = HopsworksUdf( + func=func, return_types=return_type, 
dropped_argument_names=drop + ) return udf return wrapper @@ -143,7 +145,11 @@ def __init__( return_types: Union[List[type], type, List[str], str], name: Optional[str] = None, transformation_features: Optional[List[TransformationFeature]] = None, - dropped_features: Optional[List[str]] = None, + transformation_function_argument_names: Optional[ + List[TransformationFeature] + ] = None, + dropped_argument_names: Optional[List[str]] = None, + dropped_feature_names: Optional[List[str]] = None, feature_name_prefix: Optional[str] = None, ): self._return_types: List[str] = HopsworksUdf._validate_and_convert_output_types( @@ -162,24 +168,41 @@ def __init__( else func ) if not transformation_features: + # New transformation function being declared so extract source code from function self._transformation_features: List[TransformationFeature] = ( HopsworksUdf._extract_function_arguments(func) if not transformation_features else transformation_features ) + + self._transformation_function_argument_names = [ + feature.feature_name for feature in self._transformation_features + ] + + self._dropped_argument_names: List[str] = ( + HopsworksUdf._validate_and_convert_drop_features( + dropped_argument_names, + self.transformation_features, + feature_name_prefix, + ) + ) + self._dropped_features = self._dropped_argument_names else: self._transformation_features = transformation_features + self._transformation_function_argument_names = ( + transformation_function_argument_names + ) + self._dropped_argument_names = dropped_argument_names + self._dropped_features = ( + dropped_feature_names + if dropped_feature_names + else dropped_argument_names + ) self._formatted_function_source, self._module_imports = ( HopsworksUdf._format_source_code(self._function_source) ) - self._dropped_features: List[str] = ( - HopsworksUdf._validate_and_convert_drop_features( - dropped_features, self.transformation_features, feature_name_prefix - ) - ) - self._statistics: Optional[TransformationStatistics] = None self._udf_type: UDFType = None @@ -201,7 +224,7 @@ def _validate_and_convert_drop_features( `List[str]`: A list of features to be dropped. """ if not dropped_features: - return [] + return None dropped_features = ( [dropped_features] @@ -554,11 +577,16 @@ def __call__(self, *features: List[str]) -> "HopsworksUdf": f'Feature names provided must be string "{arg}" is not string' ) transformation_feature_name = self.transformation_features - index_dropped_features = [ - transformation_feature_name.index(dropped_feature) - for dropped_feature in self.dropped_features - ] - updated_dropped_features = [features[index] for index in index_dropped_features] + if self.dropped_features: + index_dropped_features = [ + transformation_feature_name.index(dropped_feature) + for dropped_feature in self.dropped_features + ] + updated_dropped_features = [ + features[index] for index in index_dropped_features + ] + else: + updated_dropped_features = None # Create a copy of the UDF to associate it with new feature names. udf = copy.deepcopy(self) @@ -601,6 +629,8 @@ def get_udf(self, force_python_udf: bool = False) -> Callable: # Returns `Callable`: Pandas UDF in the spark engine otherwise returns a python function for the UDF. 
""" + if self.udf_type is None: + raise FeatureStoreException("UDF Type cannot be None") if engine.get_type() in ["hive", "python", "training"] or force_python_udf: return self.hopsworksUdf_wrapper() @@ -623,7 +653,8 @@ def to_dict(self) -> Dict[str, Any]: "sourceCode": self._function_source, "outputTypes": self.return_types, "transformationFeatures": self.transformation_features, - "droppedFeatures": self.dropped_features, + "transformationFunctionArgumentNames": self._transformation_function_argument_names, + "droppedArgumentNames": self._dropped_argument_names, "statisticsArgumentNames": self._statistics_argument_names if self.statistics_required else None, @@ -663,12 +694,12 @@ def from_response_json( transformation_features = [ feature.strip() for feature in json_decamelized["transformation_features"] ] - dropped_features = ( + dropped_argument_names = ( [ dropped_feature.strip() - for dropped_feature in json_decamelized["dropped_features"] + for dropped_feature in json_decamelized["dropped_argument_names"] ] - if "dropped_features" in json_decamelized + if "dropped_argument_names" in json_decamelized else None ) statistics_features = ( @@ -687,11 +718,14 @@ def from_response_json( arg_list if not transformation_features else transformation_features ) - if dropped_features: - dropped_features = [ - transformation_features[arg_list.index(dropped_feature)] - for dropped_feature in dropped_features + dropped_feature_names = ( + [ + transformation_features[arg_list.index(dropped_argument_name)] + for dropped_argument_name in dropped_argument_names ] + if dropped_argument_names + else None + ) if statistics_features: transformation_features = [ @@ -714,7 +748,8 @@ def from_response_json( return_types=output_types, name=function_name, transformation_features=transformation_features, - dropped_features=dropped_features, + dropped_argument_names=dropped_argument_names, + dropped_feature_names=dropped_feature_names, feature_name_prefix=feature_name_prefix, ) @@ -728,8 +763,6 @@ def _validate_udf_type(self): # Raises `hsfs.client.exceptions.FeatureStoreException` : If the UDF Type is None or if statistics or multiple columns has been output by a on-demand transformation function """ - if self.udf_type is None: - raise FeatureStoreException("UDF Type cannot be None") if self._udf_type == UDFType.ON_DEMAND: if len(self.return_types) > 1: @@ -843,7 +876,7 @@ def dropped_features(self) -> List[str]: """ List of features that will be dropped after the UDF is applied. 
""" - if self._feature_name_prefix: + if self._feature_name_prefix and self._dropped_features: return [ self._feature_name_prefix + dropped_feature for dropped_feature in self._dropped_features diff --git a/python/hsfs/transformation_function.py b/python/hsfs/transformation_function.py index 65535aa539..fe30047384 100644 --- a/python/hsfs/transformation_function.py +++ b/python/hsfs/transformation_function.py @@ -241,5 +241,7 @@ def __repr__(self): return ( f"Model-Dependent Transformation Function : {repr(self.hopsworks_udf)}" ) - else: + elif self.hopsworks_udf._udf_type == UDFType.ON_DEMAND: return f"On-Demand Transformation Function : {repr(self.hopsworks_udf)}" + else: + return f"Transformation Function : {repr(self.hopsworks_udf)}" diff --git a/python/tests/engine/test_python_spark_transformation_functions.py b/python/tests/engine/test_python_spark_transformation_functions.py index 71bb48cd05..8c29128641 100644 --- a/python/tests/engine/test_python_spark_transformation_functions.py +++ b/python/tests/engine/test_python_spark_transformation_functions.py @@ -161,7 +161,7 @@ def test_apply_builtin_minmax_from_backend(self, mocker): "transformationFeatures": [], "statisticsArgumentNames": ["feature"], "name": "min_max_scaler", - "droppedFeatures": ["feature"], + "droppedArgumentNames": ["feature"], } tf_fun = HopsworksUdf.from_response_json(udf_response) @@ -304,7 +304,7 @@ def test_apply_builtin_standard_scaler_from_backend(self, mocker): "transformationFeatures": [], "statisticsArgumentNames": ["feature"], "name": "standard_scaler", - "droppedFeatures": ["feature"], + "droppedArgumentNames": ["feature"], } tf_fun = HopsworksUdf.from_response_json(udf_response) @@ -451,7 +451,7 @@ def test_apply_builtin_robust_scaler_from_backend(self, mocker): "transformationFeatures": [], "statisticsArgumentNames": ["feature"], "name": "robust_scaler", - "droppedFeatures": ["feature"], + "droppedArgumentNames": ["feature"], } tf_fun = HopsworksUdf.from_response_json(udf_response) diff --git a/python/tests/fixtures/feature_group_fixtures.json b/python/tests/fixtures/feature_group_fixtures.json index c2394ed4cb..bc967508b0 100644 --- a/python/tests/fixtures/feature_group_fixtures.json +++ b/python/tests/fixtures/feature_group_fixtures.json @@ -695,7 +695,7 @@ "name": "add_two", "outputTypes":["double"], "transformationFeatures":["data"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } }, { @@ -707,7 +707,7 @@ "name": "add_one_fs", "outputTypes":["double"], "transformationFeatures":["col1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } } ], diff --git a/python/tests/fixtures/feature_view_fixtures.json b/python/tests/fixtures/feature_view_fixtures.json index 1ad25dea36..260cffd0c9 100644 --- a/python/tests/fixtures/feature_view_fixtures.json +++ b/python/tests/fixtures/feature_view_fixtures.json @@ -935,7 +935,7 @@ "outputTypes":["double"], "transformationFeatures":["data"], "statisticsArgumentNames":["data1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } }, { @@ -947,7 +947,7 @@ "name": "add_one_fs", "outputTypes":["double"], "transformationFeatures":["col1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } } ], diff --git a/python/tests/fixtures/training_dataset_feature_fixtures.json b/python/tests/fixtures/training_dataset_feature_fixtures.json index 27cd07f302..0ca85653c8 100644 --- a/python/tests/fixtures/training_dataset_feature_fixtures.json +++ 
b/python/tests/fixtures/training_dataset_feature_fixtures.json @@ -79,7 +79,7 @@ "name": "add_one_fs", "outputTypes":["double"], "transformationFeatures":["col1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } }, "featuregroup": { diff --git a/python/tests/fixtures/transformation_function_fixtures.json b/python/tests/fixtures/transformation_function_fixtures.json index 036eb2fac7..2604d5d75e 100644 --- a/python/tests/fixtures/transformation_function_fixtures.json +++ b/python/tests/fixtures/transformation_function_fixtures.json @@ -9,7 +9,7 @@ "name": "add_one_fs", "outputTypes":["double"], "transformationFeatures":["col1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } } }, @@ -24,7 +24,7 @@ "outputTypes":["double"], "transformationFeatures":["data"], "statisticsArgumentNames":["data1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } } }, @@ -39,7 +39,7 @@ "outputTypes":["string"], "transformationFeatures":["feature1", "feature2", "feature3"], "statisticsArgumentNames":["data1", "data2"], - "dropped_features":["data1", "data2", "data3"] + "dropped_argument_names":["data1", "data2", "data3"] } } }, @@ -54,7 +54,7 @@ "outputTypes":["string", "double"], "transformationFeatures":["feature1", "feature2", "feature3"], "statisticsArgumentNames":["data1", "data2"], - "dropped_features":["data1", "data2", "data3"] + "dropped_argument_names":["data1", "data2", "data3"] } } }, @@ -72,7 +72,7 @@ "outputTypes":["double"], "transformationFeatures":["data"], "statisticsArgumentNames":["data1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } }, { @@ -84,7 +84,7 @@ "name": "add_one_fs", "outputTypes":["double"], "transformationFeatures":["col1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } } ] @@ -104,7 +104,7 @@ "outputTypes":["double"], "transformationFeatures":["data"], "statisticsArgumentNames":["data1"], - "dropped_features":["data1"] + "dropped_argument_names":["data1"] } } ] From 4681f3302f064223dd05304ee37d80fb6b5269dc Mon Sep 17 00:00:00 2001 From: manu-sj Date: Tue, 9 Jul 2024 06:56:36 +0200 Subject: [PATCH 58/58] fixing unit tests --- python/tests/test_hopswork_udf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tests/test_hopswork_udf.py b/python/tests/test_hopswork_udf.py index dfef840067..06ffb19742 100644 --- a/python/tests/test_hopswork_udf.py +++ b/python/tests/test_hopswork_udf.py @@ -746,6 +746,7 @@ def test_func(col1): with pytest.raises(FeatureStoreException) as exe: test_func._validate_udf_type() + test_func.get_udf() assert str(exe.value) == "UDF Type cannot be None"
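
Taken together, the hunks above split the old `dropped_features` field into `dropped_argument_names` (the names as written in the UDF signature) and `dropped_feature_names` (the columns those arguments resolve to once the UDF is bound). A minimal usage sketch of that remapping follows; it assumes the decorator factory is exported as `udf(return_type, drop=None)`, and the decorator name and feature names below are illustrative rather than taken from these patches:

    import pandas as pd

    from hsfs.hopsworks_udf import udf  # assumed export name for the decorator factory


    @udf(float, drop=["feature"])
    def plus_one(feature: pd.Series) -> pd.Series:
        # "feature" is an *argument* name in the signature; listing it in
        # `drop` removes whatever column is bound to it from the output.
        return feature + 1


    # Binding the UDF to a concrete column remaps the dropped argument name
    # ("feature") to the dropped feature name ("col1"), following the
    # index-based lookup in `__call__` shown above.
    bound = plus_one("col1")
    print(bound.dropped_features)  # expected: ["col1"]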
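
Similarly, the guard moved from `_validate_udf_type` into `get_udf` means a UDF must carry a type before it can be materialized, which is what the added test line exercises. A sketch of that failure mode and the usual remedy, assuming `UDFType` and the `udf` decorator are both importable from `hsfs.hopsworks_udf` and that a Python engine session is active:

    import pandas as pd

    from hsfs.client.exceptions import FeatureStoreException
    from hsfs.hopsworks_udf import UDFType, udf  # UDFType location is assumed


    @udf(float)
    def add_one(data1: pd.Series) -> pd.Series:
        return data1 + 1


    try:
        add_one.get_udf()  # udf_type is still None at this point
    except FeatureStoreException as err:
        print(err)  # "UDF Type cannot be None"

    # The type is normally assigned when the UDF is attached to a feature
    # group (ON_DEMAND) or a feature view (MODEL_DEPENDENT); setting it by
    # hand lets get_udf() return the callable.
    add_one.udf_type = UDFType.MODEL_DEPENDENT
    transform = add_one.get_udf()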